def demoFourGs():
    '''Demonstrate the performance of LCC on points drawn from four Gaussians.'''
    s = (640, 480)
    dat = genNormalClusters(N=100, size=s)
    cList = ['red', 'blue', 'green', 'yellow']
    img_truth = plotClusts(dat[0], dat[1], size=s,
                           colors=[cList[i] for i in dat[1]], window=None)

    # generate normal hierarchical clustering off euclidean data points
    print("Generating Hierarchical Clustering on Raw Data")
    Z2 = spc.ward(scipy.array(dat[0]))
    clusts2 = spc.fcluster(Z2, 4, criterion="maxclust")
    img_HC = plotClusts(dat[0], clusts2, size=s,
                        colors=[cList[i - 1] for i in clusts2], window=None)

    # generate LCC clustering
    print("Generating LCC Clustering")
    (clusts, _, _, _) = pf.LatentConfigurationClustering(dat[0], pt_dist, 4,
                                                         numtrees=27)
    img_LCC = plotClusts(dat[0], clusts, size=s,
                         colors=[cList[i - 1] for i in clusts], window=None)

    im = pv.ImageMontage([img_truth, img_LCC, img_HC], layout=(1, 3),
                         gutter=3, tileSize=(320, 240), labels=None)
    im.show(window="Truth vs. LCC vs. HC")
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy."""
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3
    rnd = np.random.RandomState(0)
    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = 0.1 * rnd.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(int)  # np.int is deprecated; use the builtin
        children, _, n_leaves, _ = ward_tree(X, connectivity)
        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)
    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def make_tree(X, C, method='single'):
    if method == 'single':
        tree = to_tree(single(C))
    elif method == 'ward':
        tree = to_tree(ward(X))
    elif method == 'average':
        tree = to_tree(average(C))
    else:
        # previously `tree` was left undefined for unknown methods
        raise ValueError("method must be one of 'single', 'ward', 'average'")
    return Tree(root=construct_node(tree))
def plotHierarchichalClusterGraph(tf_idf_matrix, headlines_utf):
    dist = 1 - cosine_similarity(tf_idf_matrix)
    # note: scipy's ward() treats this square array as an observation matrix
    # (one row of distances per document), not as a pre-computed distance matrix
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    dendrogram(linkage_matrix, orientation="right", labels=headlines_utf)
    plt.tick_params(axis='x', which='both',
                    bottom=False, top=False, labelbottom=False)
    plt.tight_layout()
    plt.savefig('../plots/hierachichal_clusters.png', dpi=200)
def setUp(self):
    np.random.seed(0)
    x = np.random.rand(10)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    ids = np.arange(len(x)).astype(str)  # np.str is deprecated
    self.tree = TreeNode.from_linkage_matrix(lm, ids)
    # initialize tree with branch length and named internal nodes
    for i, n in enumerate(self.tree.postorder(include_self=True)):
        n.length = 1
        if not n.is_tip():
            n.name = "y%d" % i
def hierarchyCluster(dist, titles):
    # define the linkage_matrix using ward clustering on pre-computed distances
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
    plt.tick_params(
        axis='x',         # changes apply to the x-axis
        which='major',    # only major ticks are affected
        bottom=False,     # ticks along the bottom edge are off
        top=False,        # ticks along the top edge are off
        labelbottom=True)
    plt.tight_layout()  # show plot with tight layout
    plt.show()
def _ward_cluster(X):
    """Cluster rows of X into two groups using Ward linkage on (1 - corr).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Matrix of z-scores; rows are observations.

    Returns
    -------
    ndarray of shape (n_samples,)
        Cluster labels (1 or 2) from the top branch split.
    """
    # pairwise (1 - corr) of z-scores
    D = pdist(X, metric="correlation")
    # return top branch split using ward linkage
    return fcluster(ward(D), 2, criterion="maxclust")
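# Hedged usage sketch (added) for _ward_cluster above: two groups of rows with
# distinct correlation structure; the data and sizes are made up, and the
# module is assumed to provide pdist/ward/fcluster as used in the function.
import numpy as np

rng = np.random.default_rng(0)
base_a, base_b = rng.normal(size=50), rng.normal(size=50)
X_demo = np.vstack([base_a + 0.1 * rng.normal(size=(10, 50)),
                    base_b + 0.1 * rng.normal(size=(10, 50))])
print(_ward_cluster(X_demo))  # first 10 rows vs. last 10 rows -> labels {1, 2}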
def setUp(self):
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)))
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    self.tree = SquareDendrogram.from_tree(t)
    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand() * 3
def hierachical_clustering(self):
    # define the linkage_matrix using ward clustering on pre-computed distances
    linkage_matrix = ward(self.__dist_matrix)
    fig, ax = plt.subplots(figsize=(15, 9))  # set size
    # NOTE: `titles` must be supplied by the enclosing scope; it is not
    # defined in this method
    ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
    plt.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    fig.set_tight_layout(True)  # show plot with tight layout
    plt.show()
def test_cache_ntips(self):
    dm = DistanceMatrix.from_iterable([0, 1, 2, 3],
                                      lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    ids = np.arange(4).astype(str)
    t = mock.from_linkage_matrix(lm, ids)
    t._cache_ntips()
    # assertEquals is a deprecated alias of assertEqual
    self.assertEqual(t.leafcount, 4)
    self.assertEqual(t.children[0].leafcount, 2)
    self.assertEqual(t.children[1].leafcount, 2)
    self.assertEqual(t.children[0].children[0].leafcount, 1)
    self.assertEqual(t.children[0].children[1].leafcount, 1)
    self.assertEqual(t.children[1].children[0].leafcount, 1)
    self.assertEqual(t.children[1].children[1].leafcount, 1)
def knn(df, axis=None, labels=None):
    # (despite the name, this draws a Ward-linkage dendrogram, not k-NN)
    dist = 1 - cosine_similarity(df.values)
    # define the linkage_matrix using ward clustering pre-computed distances
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=labels)
    plt.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    plt.tight_layout()
def find_clusters(self, features):
    '''Returns the clusters and their centroids.'''
    # 1. Cluster the data.
    totalClusters = int(round(features.shape[0] / 2))
    # pairwise_distances(metric="cosine") already returns cosine *distances*;
    # the original `1 - pairwise_distances(...)` produced similarities instead.
    distance = pairwise_distances(features, metric="cosine")
    # Ward minimizes the sum of squared differences within all clusters.
    # It is a variance-minimizing approach, similar to the k-means objective.
    linkage_matrix = ward(distance)
    clusters = fcluster(linkage_matrix, totalClusters, criterion='maxclust')
    print("Number of clusters:", totalClusters)

    # 2. Find the centroid for each cluster.
    centroid = np.empty([totalClusters, features.shape[1]])
    for i in range(1, totalClusters + 1):
        nCluster = np.where(clusters == i)
        centroid[i - 1, :] = np.mean(features[nCluster], axis=0)
    return (clusters, centroid)
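# Hedged sketch (added): Ward and k-means are both variance-minimizing, so on
# well-separated blobs their labellings usually agree; everything below is
# synthetic illustration, not part of the original class.
import numpy as np
from scipy.cluster.hierarchy import ward, fcluster
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(c, 0.3, (30, 2)) for c in (0.0, 3.0, 6.0)])
ward_labels = fcluster(ward(pdist(X)), 3, criterion='maxclust')
km_labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
print(adjusted_rand_score(ward_labels, km_labels))  # ~1.0 on separated blobs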
def create_hierarchy(self, sim_matrix):
    # despite its name, sim_matrix should hold pre-computed *distances*
    # (e.g. 1 - cosine similarity) for Ward linkage to be meaningful
    linkage_matrix = ward(sim_matrix)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=self.titles)
    plt.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    plt.tight_layout()  # show plot with tight layout
    plt.savefig('ward_clusters.png', dpi=200)  # save figure as ward_clusters
    return
def lsa_dendrogram(lessonpath):
    # document-term matrix and document indices
    dtm, docindex, lessonname = dtm_matrix(lessonpath)
    # reconstructed dtm matrix using LSA and a reduced subspace of dimension 3
    dtm2 = LSA_dtm(dtm, 3)
    # distance metric based on cosine similarity of the LSA-reduced matrix
    # (the original computed it on `dtm`, leaving `dtm2` unused)
    dist = 1 - cosine_similarity(dtm2)
    dist = np.round(dist, 10)
    # linkage matrix
    linkage_matrix = ward(dist)
    # dendrogram
    show(dendrogram(linkage_matrix, orientation="right", labels=docindex))
def get_clusters(self, data, features=None, text_features=[], n_clusters=8,
                 centroid_features=10, random_seeds=True, weights=[]):
    """Applies Agglomerative hierarchical clustering using Ward's linkage.

    Parameters
    ----------
    data : Pandas DataFrame
        Data on which to apply clustering
    features : list, optional, default : all columns used as features
        Subset of columns in the data frame to be used as features
    text_features : list, optional, default : None
        List of features that are of type text. These are then vectorized
        using TfidfVectorizer.
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of centroids
        to generate.
    centroid_features : int, optional, default: 10
        The number of most-important-features to return against each
        cluster centroid
    random_seeds : boolean, optional, default: True
        If False, uses clusters from kernel density estimation followed by
        thresholding as initial seeds. The number of clusters is also
        determined by results of kde and thus the n_clusters parameter is
        ignored.

    Returns
    -------
    result : tuple (labels, centroid_features)
        labels : cluster numbers against each row of the data passed
        centroids : dictionary map of most important features of each cluster
    """
    X = self.encode_features(data, features, text_features)
    dist = 1 - cosine_similarity(X)
    self.linkage_matrix = ward(dist)
    # NOTE: the original returned (km.labels_, centroids) from an undefined
    # k-means model and left an ipshell() debug call in place. The body below
    # is a reconstruction: labels come from cutting the Ward tree (assumes
    # scipy's fcluster is imported alongside ward), and centroids are the
    # top-weighted feature indices of each cluster mean.
    labels = fcluster(self.linkage_matrix, n_clusters, criterion='maxclust')
    centroids = {}
    for c in np.unique(labels):
        center = np.asarray(X[labels == c].mean(axis=0)).ravel()
        centroids[c] = np.argsort(center)[::-1][:centroid_features]
    return (labels, centroids)
def setUp(self):
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)),
                              index=['0', '1', '2', '3', '4'],
                              columns=['0', '1', '2', '3', '4'])
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    self.t = SquareDendrogram.from_tree(t)
    self.md = pd.Series(['a', 'a', 'a', 'b', 'b'],
                        index=['0', '1', '2', '3', '4'])
    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand() * 3
    self.highlights = pd.DataFrame({'y8': ['#FF0000', '#00FF00'],
                                    'y6': ['#0000FF', '#F0000F']}).T
def cluster_ndarray(
    profiles_arr,
    output_prefix="clustered",
    output_lists=False,
    threshold=25,
    criterion="maxclust",
    min_num_images=50,
):
    """
    cluster_ndarray clusters images based on their radial profiles

    Parameters
    ----------
    profiles_arr : np.ndarray
        radial profiles (or any other profiles, honestly) as a 2D np.ndarray
    output_prefix : str, optional
        output prefix for image lists, by default "clustered"
    output_lists : bool, optional
        whether to output lists as text files, by default False
    threshold : int, optional
        distance according to criterion, by default 25
    criterion : str, optional
        criterion for clustering, by default "maxclust"
    min_num_images : int, optional
        minimal number of images in a single cluster; smaller clusters go to
        the singleton list, by default 50

    Returns
    -------
    Union[dict, list]
        Either:
        - Dictionary {cluster_num: [*image_and_event_lines]} -- if output_lists == False
        - List [output_list_1.lst, output_list_2.lst, ...] -- if output_lists == True
    """
    profiles = np.array([elem[1] for elem in profiles_arr])
    names = np.array([elem[0] for elem in profiles_arr])

    # this actually does clustering
    Z = ward(pdist(profiles))
    idx = fcluster(Z, t=threshold, criterion=criterion)

    # output lists
    clusters = defaultdict(set)
    out_lists = set()
    for list_idx in tqdm(list(set(idx)), desc="Output lists"):
        belong_to_this_idx = np.where(idx == list_idx)[0]
        if len(belong_to_this_idx) < min_num_images:
            fout_name = f"{output_prefix}_singletone.lst"
            out_cluster_idx = -1
        else:
            fout_name = f"{output_prefix}_{list_idx}.lst"
            out_cluster_idx = list_idx
        out_lists.add(fout_name)
        try:
            os.remove(fout_name)
        except OSError:
            pass

        # print output lists if you want to
        for name in names[belong_to_this_idx]:
            clusters[out_cluster_idx].add(name)
        if output_lists:
            with open(fout_name, "a") as fout:
                print(*clusters[out_cluster_idx], sep="\n", file=fout)

    if output_lists:
        return list(out_lists)
    else:
        return clusters
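# Hedged usage sketch (added) for cluster_ndarray above; the (name, profile)
# pairs and threshold are fabricated for illustration, and the module is
# assumed to provide np/ward/pdist/fcluster/tqdm/os as used in the function.
import numpy as np

rng = np.random.default_rng(0)
fake_profiles = [(f"run_{i}.h5 //{i}", rng.normal(i % 3, 0.1, 64))
                 for i in range(120)]
demo_clusters = cluster_ndarray(fake_profiles, threshold=3,
                                criterion="maxclust", min_num_images=10)
print({k: len(v) for k, v in demo_clusters.items()})  # images per cluster id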
def cluster(carrel, type):
    """Apply dimension reduction to <carrel> and visualize the result.

    This is useful for determining how holistic <carrel> is. A carrel with
    many clusters is less holistic and probably means the number of latent
    topics (think "subjects") is high. On the other hand, you may observe
    clusters falling into distinct groups surrounding authors, titles, or
    sources. In other words, use this subcommand to learn the degree to which
    <carrel> is a hodgepodge of unrelated items or a coherent collection.

    Example: rdr cluster homer

    See also: rdr tm --help"""

    # configure
    MAXIMUM = 0.95
    MINIMUM = 2
    STOPWORDS = 'english'
    EXTENSION = '.txt'

    # require
    from os import path, system, listdir
    from scipy.cluster.hierarchy import ward, dendrogram
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.manifold import MDS
    from sklearn.metrics.pairwise import cosine_similarity
    import matplotlib.pyplot as plt

    # sanity check
    checkForCarrel(carrel)

    # initialize
    localLibrary = configuration('localLibrary')
    directory = localLibrary / carrel / TXT
    filenames = [path.join(directory, filename)
                 for filename in listdir(directory)]
    vectorizer = TfidfVectorizer(input='filename', max_df=MAXIMUM,
                                 min_df=MINIMUM, stop_words=STOPWORDS)
    matrix = vectorizer.fit_transform(filenames).toarray()
    distance = 1 - cosine_similarity(matrix)
    keys = [path.basename(filename).replace(EXTENSION, '')
            for filename in filenames]

    # branch according to type; dendrogram
    if type == 'dendrogram':
        linkage_matrix = ward(distance)
        dendrogram(linkage_matrix, orientation="right", labels=keys)
        plt.tight_layout()

    # cube
    elif type == 'cube':
        mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
        pos = mds.fit_transform(distance)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])
        for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], keys):
            ax.text(x, y, z, s)

    # error
    else:
        click.echo(f"Error: Unknown value for TYPE: {type}")
        system('rdr cluster --help')

    # output
    plt.show()
def test_basic_plot(self):
    self.maxDiff = None
    exp_edges = {'dest_node': ['0', '1', '2', 'y3'],
                 'edge_color': ['#00FF00', '#00FF00', '#00FF00', '#FF0000'],
                 'edge_width': [2, 2, 2, 2],
                 'src_node': ['y3', 'y4', 'y3', 'y4'],
                 'x0': [338.2612593838583, 193.1688862557773,
                        338.2612593838583, 193.1688862557773],
                 'x1': [487.5, 12.499999999999972,
                        324.89684138234867, 338.2612593838583],
                 'y0': [271.7282256126416, 365.95231443706376,
                        271.7282256126416, 365.95231443706376],
                 'y1': [347.7691620070637, 483.2800610261029,
                        16.719938973897143, 271.7282256126416]}
    exp_nodes = {'child0': [np.nan, np.nan, np.nan, '0', '1'],
                 'child1': [np.nan, np.nan, np.nan, '2', 'y3'],
                 'color': ['#1C9099', '#1C9099', '#1C9099',
                           '#FF999F', '#FF999F'],
                 'hover_var': [None, None, None, None, None],
                 'is_tip': [True, True, True, False, False],
                 'node_size': [10, 10, 10, 10, 10],
                 'x': [487.5, 12.499999999999972, 324.89684138234867,
                       338.26125938385832, 193.16888625577729],
                 'y': [347.7691620070637, 483.28006102610289,
                       16.719938973897143, 271.72822561264161,
                       365.95231443706376]}
    np.random.seed(0)
    num_otus = 3  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    t = UnrootedDendrogram.from_tree(t)

    # incorporate colors in tree
    for i, n in enumerate(t.postorder(include_self=True)):
        if not n.is_tip():
            n.name = "y%d" % i
            n.color = '#FF999F'
            n.edge_color = '#FF0000'
            n.node_size = 10
        else:
            n.color = '#1C9099'
            n.edge_color = '#00FF00'
            n.node_size = 10
        n.length = np.random.rand() * 3
        n.edge_width = 2

    p = radialplot(t, node_color='color', edge_color='edge_color',
                   node_size='node_size', edge_width='edge_width')

    for e in exp_edges.keys():
        self.assertListEqual(
            list(p.renderers[0].data_source.data[e]), exp_edges[e])

    for e in exp_nodes.keys():
        self.assertListEqual(
            list(p.renderers[1].data_source.data[e]), exp_nodes[e])
    self.assertTrue(isinstance(t, TreeNode))
# compute distance matrix
distance_matrix = manhattan_distances(activities_binary_matrix)
print(distance_matrix.shape)

activity_names = [
    'Shopping', 'Antiquing', 'Site Seeing', 'Fine Dining', 'Casual Dining',
    'Family Style Dining', 'Fast Food Dining', 'Museums', 'Indoor Pool',
    'Outdoor Pool', 'Hiking', 'Gambling', 'Boating/Swimming', 'Fishing',
    'Golfing', 'Boat Tours', 'Ride the Ducks', 'Amusement Park', 'Minigolf',
    'Go-carting', 'Waterpark', 'Circus World', 'Tommy Bartlett Ski Show',
    'Helicopter Rides', 'Horseback Riding', 'Stand Rock',
    'Outdoor Attractions', 'Nearby Attractions', 'Movie Theater',
    'Concert Theater', 'Bar/Pub Dancing', 'Shop Broadway', 'Bungee Jumping'
]

linkage_matrix = ward(distance_matrix)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=activity_names)
plt.tick_params(
    axis='x',      # changes apply to the x-axis
    which='both',  # both major and minor ticks are affected
    bottom=False,  # ticks along the bottom edge are off
    top=False,     # ticks along the top edge are off
    labelbottom=False)
plt.tight_layout()  # show plot with tight layout

# route figure to external file
plt.savefig('plot_hierarchical_clustering_solution.png', dpi=200)
# -*- coding:utf-8 -*-
import pickle
import numpy as np
from scipy.cluster.hierarchy import ward, dendrogram
from matplotlib import pyplot as plt

with open('countries_vectors.pickle', 'rb') as f:
    xs = pickle.load(f)

countries = []
with open('countries2.txt', 'r') as f:
    for c in f:
        countries.append(c.strip())

X = np.array(xs)
cluster = ward(X)
print(cluster)
dendrogram(cluster, labels=countries)
plt.show()
# add x, y labels
for i in range(len(df)):
    # .ix is deprecated; use .loc for label-based lookup
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)

plt.show()  # render the plot
# uncomment the line below to save the figure
#plt.savefig('clusters_small_noaxes.png', dpi=200)
plt.close()

# The eyeball check looks reasonable: many thematically related items sit
# near each other. Now let's try hierarchical clustering.

#########################
# hierarchical clustering of the texts
from scipy.cluster.hierarchy import ward, dendrogram

# define linkage_matrix using Ward clustering on pre-computed distances
linkage_matrix = ward(dist)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
plt.tick_params(
    axis='x',      # changes apply to the x-axis
    which='both',  # both major and minor ticks are affected
    bottom=False,  # ticks along the bottom edge are off
    top=False,     # ticks along the top edge are off
    labelbottom=False)
plt.tight_layout()  # show plot with tight layout
# uncomment the line below to save the figure
# plt.savefig('ward_clusters.png', dpi=200)
""" 96の単語ベクトルに対して,Ward法による階層型クラスタリングを実行せよ.さらに,クラスタリング結果をデンドログラムとして可視化せよ. """ import pickle from collections import OrderedDict from scipy import io import numpy as np from sklearn.cluster import KMeans from scipy.cluster.hierarchy import ward, dendrogram from matplotlib import pyplot as plt f_in_dict = "dict_countries" f_in_matrix = "matrix_counties" with open(f_in_dict, "rb") as f: dict_index_t = pickle.load(f) matrix_x_300 = io.loadmat(f_in_matrix)['matrix_x_300'] ward = ward(matrix_x_300) print(ward) dendrogram(ward, labels=list(dict_index_t.keys()), leaf_font_size=8) plt.show()
# (the excerpt opens inside a commented-out k-means visualization block;
# the opening triple-quote precedes the excerpt)
"""
centroids = kmeans.cluster_centers_
print(centroids)
predict_label = kmeans.predict(train_x)
for i in range(k):
    plt.scatter(centroids[i][2], centroids[i][3], c=color[i], marker='X', s=60)
print(data)
for (_data, _label) in zip(train_x, predict_label):
    plt.scatter(_data[2], _data[3], color=color[_label], alpha=0.3)
plt.show()
"""

# merge the clustering result back into the original dataset
result = pd.concat((data, pd.DataFrame(kmeans.labels_)), axis=1)
# name the new column
result.rename({0: u'聚类'}, axis=1, inplace=True)
#print(result)
result.to_csv("car_cluster_result.csv", index=True)

"""
# hierarchical clustering model
from sklearn.cluster import KMeans, AgglomerativeClustering
model = AgglomerativeClustering(linkage='ward', n_clusters=3)
y = model.fit_predict(train_x)
print(y)
"""

# hierarchical clustering can be used to visualize the KMeans result;
# the two will most likely agree
#print(train_x)
linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()
def cluster_dendogram(corpus, titles=None, stemming=True, max_df=0.95,
                      min_df=2, ngram=(1, 3), cleaning=simple_textcleaning,
                      vectorizer='bow', stop_words=STOPWORDS,
                      random_samples=0.3, figsize=(17, 9), **kwargs):
    """
    plot hierarchical dendrogram with similar texts.

    Parameters
    ----------
    corpus: list
    titles: list
        list of titles, length must be the same as corpus.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will apply.
    max_df: float, (default=0.95)
        maximum of a word selected based on document frequency.
    min_df: int, (default=2)
        minimum of a word selected based on document frequency.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.
    random_samples: float, (default=0.3)
        fraction of the corpus to sample before clustering.
    figsize: tuple, (default=(17, 9))
        matplotlib figure size.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """
    if not isinstance(corpus, list):
        raise ValueError('corpus must be a list')
    if not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if not isinstance(titles, list) and titles is not None:
        raise ValueError('titles must be a list or None')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if not isinstance(vectorizer, str):
        raise ValueError('vectorizer must be a string')
    if not isinstance(stemming, bool):
        raise ValueError('stemming must be a boolean')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if not isinstance(ngram, tuple):
        raise ValueError('ngram must be a tuple')
    if not len(ngram) == 2:
        raise ValueError('ngram size must equal to 2')
    if not isinstance(min_df, int):
        raise ValueError('min_df must be an integer')
    if not isinstance(max_df, float):
        raise ValueError('max_df must be a float')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError('max_df must be bigger than 0, less than or equal to 1')

    # (the original repeated this selection block twice verbatim)
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    elif vectorizer == 'skip-gram':
        Vectorizer = SkipGramVectorizer
    else:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set()
    except:
        raise Exception(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    tf_vectorizer = Vectorizer(ngram_range=ngram, min_df=min_df,
                               max_df=max_df, stop_words=stop_words, **kwargs)
    corpus = random.sample(corpus, k=int(random_samples * len(corpus)))
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = sastrawi(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stop_words]))
    tf_vectorizer.fit(text_clean)
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    features = tf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            indices = np.argsort(
                np.array(transformed_text_clean[i].todense())[0])[::-1]
            titles.append(' '.join([features[i] for i in indices[:ngram[1]]]))
    plt.figure(figsize=figsize)
    ax = dendrogram(linkage_matrix, orientation='right', labels=titles)
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
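# Hedged usage sketch (added) for cluster_dendogram above: made-up documents,
# with cleaning and stemming disabled so only the module's STOPWORDS and
# vectorizer classes are exercised.
docs = ['budget airline fares drop sharply', 'airline opens new routes',
        'parliament debates the budget', 'minister defends budget plan'] * 3
out = cluster_dendogram(docs, vectorizer='tfidf', stemming=False,
                        cleaning=None, min_df=1, max_df=0.95,
                        random_samples=1.0)
print(out['titles'])  # auto-generated labels when titles=None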
    # (excerpt begins mid-statement: the opening `if` of this filter precedes
    # the snippet; keep only clusters larger than min_size)
        X_filtrado = clusterFrame[
            clusterFrame.groupby('cluster').cluster.transform(len) > min_size]
    else:
        X_filtrado = clusterFrame
    makeScatterPlot(data=clusterFrame,
                    outputName="./imagenes/scatterMatrix_caso3_" + algorithm_name,
                    displayOutput=False)
    makeHeatmap(data=X_filtrado,
                outputName="./imagenes/heatmap_caso3_" + algorithm_name,
                displayOutput=False)

    if algorithm_name == 'AC':
        X_filtrado_normal = preprocessing.normalize(X_filtrado, norm='l2')
        linkage_array = ward(X_filtrado_normal)
        dendrogram(linkage_array, leaf_rotation=90., leaf_font_size=5.)
        plt.show()
        #plt.clf()

    results['N Clusters'] = n_clusters
    results['HC metric'] = met[0]
    results['SC metric'] = met[1]
    results['Time'] = timeAlg
    outputData[algorithm_name] = results

latexCaso1 = createLatexDataFrame(data=outputData)
f = open('caso3.txt', 'w')
f.write(latexCaso1.to_latex())
plt.show()
mglearn.plots.plot_kmeans_faces(km, pca, pca_x, x_people, y_people,
                                people.target_names)

## Agglomerative Clustering
from scipy.cluster.hierarchy import dendrogram, ward

agglomerative = AgglomerativeClustering(n_clusters=40)
labels_agg = agglomerative.fit_predict(pca_x)
print("Cluster size with agglomerative clustering: {}"
      .format(np.bincount(labels_agg)))
print("ARI btw KMeans and Agglomerative {:.2f}"
      .format(adjusted_rand_score(labels_agg, labels_km)))  # low commonality

linkage_array = ward(pca_x)
plt.figure(figsize=(20, 5))
dendrogram(linkage_array, p=7, truncate_mode='level', no_labels=True)
plt.xlabel("Simple index")
plt.ylabel("Cluster distance")
plt.show()

for cluster in [10, 13, 19, 22, 36]:
    mask = labels_agg == cluster
    cluster_size = np.sum(mask)
    fig, axes = plt.subplots(1, 15, subplot_kw={'xticks': (), 'yticks': ()
    # (excerpt truncated here in the source)
plt.ylabel("pc2") plt.axis('equal') plt.show() df_new = pd.DataFrame(pca.get_covariance()) print(df_new) plt.matshow(pca.components_, cmap='twilight') plt.colorbar() plt.gca().xaxis.tick_bottom() plt.xticks(range(len(df.columns)), df.iloc[:, :].columns, rotation=90) plt.yticks(range(len(df2.columns)), df2.iloc[:, :].columns) plt.title("Main features") i, k = plt.ylim() plt.ylim(i + 0.5, k - 0.5) plt.show() from scipy.cluster.hierarchy import dendrogram, ward linkage_array = ward(df2) plt.figure(figsize=(20, 10)) dendrogram(linkage_array, truncate_mode='level', no_labels=True, p=10) plt.title("Dendrogram") plt.show() import SimpSOM as sps net = sps.somNet(20, 20, df2.values, PBC=True) net.train(0.01, 10000) net.save('filename_weights') # net.nodes_graph(colnum=0) # net.diff_graph() net.cluster(df2.values, type='qthresh')
## Distance metrics
dist = euclidean_distances(dtm)
cosdist = 1 - cosine_similarity(dtm)

## 2D Visualization
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=5193)
pos = mds.fit_transform(cosdist)
xs, ys = pos[:, 0], pos[:, 1]
for x, y, name in zip(xs, ys, names):
    plt.scatter(x, y)
    plt.text(x, y, name)
plt.title("Document Distances 2D Cartesian")
plt.show()

## 3D Visualization
mds = MDS(n_components=3, dissimilarity="precomputed", random_state=5193)
pos = mds.fit_transform(cosdist)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])
for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], names):
    ax.text(x, y, z, s)
plt.title("Document Distances 3D Cartesian")
plt.show()

## Hierarchical Clustering Visualization
linkage_matrix = ward(cosdist)
dendrogram(linkage_matrix, orientation="right", labels=names)
plt.tight_layout()
plt.title("Document Distances Hierarchical Clustering")
plt.show()
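# Hedged setup sketch (added): the pipeline above assumes `dtm` (a dense
# document-term matrix) and `names` already exist; one minimal way to build
# them, with made-up documents:
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "dogs chase cats around",
        "stock markets fell sharply today"]
names = ["doc1", "doc2", "doc3"]
dtm = CountVectorizer().fit_transform(docs).toarray()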
def magic_cluster(input_matrix, output_path, out_name, num_clusters=5,
                  num_terms_in_cluster=20, cluster_seed=3425, source=''):
    #todo: add logic to dynamically select optimal number of clusters
    #note: num_terms_in_cluster is only the number of terms to be REPORTED;
    #      the actual number of terms is the full collection of all terms

    # convert our term tf_idf into a proper matrix
    # (get rid of extraneous columns, and transpose)
    tfidf_matrix = input_matrix.copy()
    # were we given the terms matrix to cluster, or the concept matrix?
    if 'concept' in tfidf_matrix.columns:
        tfidf_matrix.drop(
            ['t_count', 'concept', 'd_count', 'tf', 'idf', 'tf_idf', 'weight'],
            axis=1, inplace=True)
    else:
        tfidf_matrix.drop(
            ['t_count', 'd_count', 'tf', 'idf', 'tf_idf', 'weight'],
            axis=1, inplace=True)
    tfidf_matrix.fillna(0, inplace=True)
    tfidf_matrix = tfidf_matrix.transpose()

    # Calculate cosine similarity of all terms to each other...
    dist = 1 - cosine_similarity(tfidf_matrix)

    # determine k-means clustering
    #km = KMeans(n_clusters=num_clusters)
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                n_init=1, verbose=0, random_state=cluster_seed)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    vocab_frame = pd.DataFrame(tfidf_matrix.columns)
    tfidf_matrix['cluster'] = clusters
    #tfidf_matrix['cluster'].value_counts()  # how many DSI belong to each cluster?

    # what are the top terms from each of the clusters?
    cluster_terms = {}
    cluster_names = {}
    seed_name = "B"
    if cluster_seed == 3425:
        seed_name = "A"
    text_file = open(output_path + source + '_' + out_name + '_clusters.'
                     + seed_name + '.txt', "w")
    #print("Top terms per cluster:\n")
    text_file.write("Top terms per cluster:\n")
    # sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        terms_in_cluster = ''
        dsi_in_cluster = ''
        #print("Cluster %d words:" % i)
        text_file.write("Cluster %d words:\n" % i)
        for ind in order_centroids[i, :num_terms_in_cluster]:
            terms_in_cluster = terms_in_cluster + vocab_frame.iloc[ind][0] + ", "
        #print(terms_in_cluster[:-2])
        text_file.write(terms_in_cluster[:-2])  # don't write the trailing ', '
        cluster_terms[i] = terms_in_cluster[:-2]
        # only use the first 4 terms to "name" the cluster
        cluster_names[i] = cluster_terms[i].split(', ')[:4]
        #print()
        text_file.write('\n\n')
        for dsi in tfidf_matrix[tfidf_matrix['cluster'] == i].index:
            dsi_in_cluster = dsi_in_cluster + dsi + ", "
        #print("DSI in cluster %d:" % i)
        text_file.write("DSI in cluster %d:\n" % i)
        #print(dsi_in_cluster[:-2])
        text_file.write(dsi_in_cluster[:-2])  # don't write the trailing ', '
        #print('\n\n')
        text_file.write('\n\n')
    text_file.close()
    del i, terms_in_cluster, dsi_in_cluster, ind, order_centroids, text_file

    MDS()
    # convert two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    #un-comment below to manually set up colors per clusters using a dict
    # also, find "cluster_colors" below, and un-comment that line to enable it
    #cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

    #un-comment the below section if you want to manually name the clusters
    # Be certain the dictionary is the same length as your declared number of clusters
    # cluster_names = {0: 'Immigration and border control',
    #                  1: 'American governmental policy',
    #                  2: 'Russian interference',
    #                  3: 'International trade',
    #                  4: 'Tax reform'}

    # create data frame that has the result of the MDS plus the cluster numbers and titles
    mappedDF = pd.DataFrame(
        dict(x=xs, y=ys, label=clusters, title=list(tfidf_matrix.index)))
    # group by cluster
    groups = mappedDF.groupby('label')

    # set up plot
    fig, ax = plt.subplots(figsize=(16, 12))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    # iterate through groups to layer the plot
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name],
                #color=cluster_colors[name],
                mec='none')
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',      # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,    # ticks along the left edge are off
        top=False,     # ticks along the top edge are off
        labelleft=False)

    lgd = ax.legend(numpoints=1, bbox_to_anchor=(.5, -.3), loc=8,
                    borderaxespad=0.)
    #lgd = ax.legend(numpoints=1, loc=0)

    # add label in x,y position with the label as the DSI#
    for i in range(len(mappedDF)):
        ax.text(mappedDF.loc[i]['x'], mappedDF.loc[i]['y'],
                mappedDF.loc[i]['title'], size=8)

    plt.title('DSI K-Means cluster assignment: ' + out_name)
    plt.margins(0.05, 0.1)
    #plt.show()  # show the plot
    plt.savefig(output_path + source + out_name + '_kmeans.' + seed_name + '.png',
                bbox_extra_artists=(lgd,), bbox_inches='tight', dpi=200)
    # even though we don't show the plot, explicitly close to free the memory
    plt.close('all')

    # the 2D map is done; now prepare a dendrogram to visualize how clustering splits
    # define the linkage_matrix using ward clustering on pre-computed distances
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 30))  # set size
    ax = dendrogram(linkage_matrix, orientation="right",
                    labels=list(tfidf_matrix.index))
    plt.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    plt.yticks(fontsize=14)
    plt.title('DSI Ward clustering dendrogram: ' + out_name)
    #plt.show()
    #plt.savefig(output_path + out_name + '_dendrogram' + seed_name + '.png', bbox_inches='tight', dpi=72)
    # The dendrogram is generated with ward distances on pre-computed values;
    # it does not change based on the KMeans seed, so don't write out "A" and
    # "B" versions of the dendrogram.
    # Todo: update dendrogram to better reflect the KMeans 2D map
    # Recommend you start here:
    # https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
    plt.savefig(output_path + source + '_' + out_name + '_dendrogram.png',
                bbox_inches='tight', dpi=200)
    plt.close('all')  # clear plot from memory
    return tfidf_matrix
count_vec = CountVectorizer(min_df=3)
xx1 = count_vec.fit_transform(list1).toarray()
word = count_vec.get_feature_names()
print("word feature length: {}".format(len(word)))
print(word)
print(xx1.shape)
print(xx1[0])
titles = word

#------------------------------ Step 4: similarity computation ------------------------------
df = pd.DataFrame(xx1)
print(df.corr())
print(df.corr('spearman'))
print(df.corr('kendall'))
dist = df.corr()
print(dist)
print(dist.shape)

#------------------------------ Step 5: visual analysis ------------------------------
# define the linkage_matrix using ward clustering pre-computed distances
linkage_matrix = ward(dist)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
# show plot with tight layout
plt.tight_layout()
# save figure as ward_clusters
plt.savefig('Tree_word.png', dpi=200)
ward = AgglomerativeClustering(n_clusters=3, linkage="ward")

n_samples = np.logspace(0.5, 3, 9)
n_features = np.logspace(1, 3.5, 7)
N_samples, N_features = np.meshgrid(n_samples, n_features)
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(int(n), int(p)))  # logspace yields floats
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0

ratio = scikits_time / scipy_time

plt.figure("scikit-learn Ward's method benchmark results")
plt.imshow(np.log(ratio), aspect="auto", origin="lower")
plt.colorbar()
plt.contour(ratio, levels=[1], colors="k")
plt.yticks(range(len(n_features)), n_features.astype(int))  # np.int is deprecated
plt.ylabel("N features")
plt.xticks(range(len(n_samples)), n_samples.astype(int))
plt.xlabel("N samples")
plt.title("Scikit's time, in units of scipy time (log)")
plt.show()
print("{:6.2f} segundos, ".format(tiempo),end='') if (k[name]>1): metric_CH[name] = metrics.calinski_harabaz_score(X_normal, cluster_predict[name]) metric_SC[name] = metrics.silhouette_score(X_normal, cluster_predict[name], metric='euclidean', sample_size=floor(0.1*len(X)), random_state=123456) print("CH index: {:9.3f}, ".format(metric_CH[name]),end='') print("SC: {:.5f}".format(metric_SC[name])) clusters = pd.DataFrame(cluster_predict[name],index=X.index,columns=['cluster']) X_cluster = pd.concat([X,clusters],axis=1) min_size = 5 X_filtrado = X_cluster[X_cluster.groupby('cluster').cluster.transform(len) > min_size] makeScatterPlot(X_filtrado) makeHeatmap(X_filtrado) clusters = pd.DataFrame(cluster_predict['Ward'],index=X.index,columns=['cluster']) X_cluster = pd.concat([X,clusters],axis=1) min_size = 5 X_filtrado = X_cluster[X_cluster.groupby('cluster').cluster.transform(len) > min_size] k_filtrado = len(set(X_filtrado['cluster'])) X_filtrado = X_filtrado.drop('cluster',1) X_filtrado_normal = X_filtrado.apply(norm_to_zero_one) linkage_array = hierarchy.ward(X_filtrado_normal) h_dict = hierarchy.dendrogram(linkage_array,orientation='left') sns.clustermap(X_filtrado_normal, method='ward', col_cluster=False, figsize=(15,7), cmap='YlGnBu', yticklabels=False)
ax.legend(numpoints=1)  # show only one point per legend entry

# add the film title as a label at each (x, y) coordinate
for i in range(len(df)):
    # .ix is deprecated; use .loc for label-based lookup
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)

plt.show()  # render the plot
# uncomment the line below to save the figure
#plt.savefig('clusters_small_noaxes.png', dpi=200)
plt.close()

## hierarchical clustering
from scipy.cluster.hierarchy import ward, dendrogram

# run Ward clustering on the pre-computed distances; call it linkage_matrix
linkage_matrix = ward(dist)
fig, ax = plt.subplots(figsize=(15, 20))  # set figure size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
plt.tick_params(
    axis='x',      # changes apply to the x-axis
    which='both',  # both major and minor ticks are affected
    bottom=False,  # no ticks along the bottom edge
    top=False,     # no ticks along the top edge
    labelbottom=False)
plt.tight_layout()  # use a tight plot layout
# uncomment the line below to save the figure
#plt.savefig('ward_clusters.png', dpi=200)  # save figure as ward_clusters
plt.close()
from sklearn.datasets import make_blobs

X, y = make_blobs(random_state=1)
agg = AgglomerativeClustering(n_clusters=3)
assignment = agg.fit_predict(X)
mglearn.discrete_scatter(X[:, 0], X[:, 1], assignment)

import matplotlib.pyplot as plt
plt.legend(["cluster 0", "cluster 1", "cluster 2"])
mglearn.plots.plot_agglomerative()

# dendrogram
from scipy.cluster.hierarchy import dendrogram, ward

X, y = make_blobs(random_state=0, n_samples=12)
linkage_array = ward(X)
dendrogram(linkage_array)

ax = plt.gca()
bounds = ax.get_xbound()
ax.plot(bounds, [7.25, 7.25], "--", c="k")
ax.plot(bounds, [4, 4], "--", c="k")
ax.text(bounds[1], 7.25, "two clusters", va="center", fontdict={"size": 15})
ax.text(bounds[1], 4, "three clusters", va="center", fontdict={"size": 15})
plt.xlabel("sample num")
plt.ylabel("cluster distance")

## DBSCAN
# random data
from sklearn.cluster import DBSCAN
def cluster():
    fileManager = managers.utility.loadFileManager()
    leq = '≤'  # (was '≤'.decode('utf-8') under Python 2)

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALIZE_OPTIONS
        if 'hierarchyoption' not in session:
            session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS
        labels = fileManager.getActiveLabels()
        thresholdOps = {}
        return render_template('cluster.html', labels=labels,
                               thresholdOps=thresholdOps)

    if 'getdendro' in request.form:
        labelDict = fileManager.getActiveLabels()
        labels = []
        for ind, label in labelDict.items():
            labels.append(label)

        # Apply re-tokenisation and filters to DTM
        #countMatrix = fileManager.getMatrix(ARGUMENTS OMITTED)

        # Get options from request.form
        orientation = str(request.form['orientation'])
        title = request.form['title']
        pruning = request.form['pruning']
        pruning = int(request.form['pruning']) if pruning else 0
        linkage = str(request.form['linkage'])
        metric = str(request.form['metric'])

        # Get active files
        allContents = []  # list of strings-of-text for each segment
        tempLabels = []   # list of labels for each segment
        for lFile in fileManager.files.values():
            if lFile.active:
                contentElement = lFile.loadContents()
                allContents.append(contentElement)
                # (dropped the Python 2 .encode("utf-8") calls on labels)
                if request.form["file_" + str(lFile.id)] == lFile.label:
                    tempLabels.append(lFile.label)
                else:
                    newLabel = request.form["file_" + str(lFile.id)]
                    tempLabels.append(newLabel)

        # More options
        ngramSize = int(request.form['tokenSize'])
        useWordTokens = request.form['tokenType'] == 'word'
        try:
            useFreq = request.form['normalizeType'] == 'freq'
            useTfidf = request.form['normalizeType'] == 'tfidf'  # if use TF/IDF
            # only applicable when using "TF/IDF"; set default value to N/A
            normOption = "N/A"
            if useTfidf:
                if request.form['norm'] == 'l1':
                    normOption = u'l1'
                elif request.form['norm'] == 'l2':
                    normOption = u'l2'
                else:
                    normOption = None
        except:
            useFreq = useTfidf = False
            normOption = None

        onlyCharGramsWithinWords = False
        if not useWordTokens:  # if using character-grams
            # this option is disabled on the GUI, because countVectorizer
            # counts front and end markers as ' ' if this is true
            onlyCharGramsWithinWords = 'inWordsOnly' in request.form

        greyWord = 'greyword' in request.form
        MostFrequenWord = 'mfwcheckbox' in request.form
        Culling = 'cullcheckbox' in request.form

        showDeletedWord = False
        # (the original `if 'greyword' or 'mfwcheckbox' or ...` was always true)
        if greyWord or MostFrequenWord or Culling:
            if 'onlygreyword' in request.form:
                showDeletedWord = True

        if useWordTokens:
            tokenType = u'word'
        else:
            tokenType = u'char'
            if onlyCharGramsWithinWords:
                tokenType = u'char_wb'

        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

        # (the ur'' raw-unicode prefix is Python 2 only; r'' suffices here)
        vectorizer = CountVectorizer(
            input=u'content', encoding=u'utf-8', min_df=1,
            analyzer=tokenType, token_pattern=r"(?u)\b[\w\']+\b",
            ngram_range=(ngramSize, ngramSize), stop_words=[],
            dtype=float, max_df=1.0)

        # make a (sparse) Document-Term-Matrix (DTM) to hold all counts
        DocTermSparseMatrix = vectorizer.fit_transform(allContents)
        dtm = DocTermSparseMatrix.toarray()

        from sklearn.metrics.pairwise import euclidean_distances
        from scipy.cluster.hierarchy import average, weighted, ward, single, complete, dendrogram
        from scipy.cluster import hierarchy
        from scipy.spatial.distance import pdist
        import matplotlib.pyplot as plt

        if orientation == "left":
            orientation = "right"
        if orientation == "top":
            LEAF_ROTATION_DEGREE = 90
        else:
            LEAF_ROTATION_DEGREE = 0

        if linkage == "ward":
            dist = euclidean_distances(dtm)
            dist = np.round(dist, 1)  # (the original discarded np.round's result)
            linkage_matrix = ward(dist)
            dendrogram(linkage_matrix, orientation=orientation,
                       leaf_rotation=LEAF_ROTATION_DEGREE, labels=labels)
            Z = linkage_matrix
        else:
            Y = pdist(dtm, metric)
            Z = hierarchy.linkage(Y, method=linkage)
            dendrogram(Z, orientation=orientation,
                       leaf_rotation=LEAF_ROTATION_DEGREE, labels=labels)
        plt.tight_layout()  # fixes margins

        ## Conversion to Newick/ETE
        # Stuff we need
        from scipy.cluster.hierarchy import average, linkage, to_tree
        #from hcluster import linkage, to_tree
        from ete2 import Tree, TreeStyle, NodeStyle

        # Convert the linkage matrix to a tree
        T = to_tree(Z)

        # ete2 section
        root = Tree()
        root.dist = 0
        root.name = "root"
        item2node = {T: root}
        to_visit = [T]
        while to_visit:
            node = to_visit.pop()
            cl_dist = node.dist / 2.0
            for ch_node in [node.left, node.right]:
                if ch_node:
                    ch = Tree()
                    ch.dist = cl_dist
                    ch.name = str(ch_node.id)
                    item2node[node].add_child(ch)
                    item2node[ch_node] = ch
                    to_visit.append(ch_node)

        # This is the ETE tree structure
        tree = root
        ts = TreeStyle()
        ts.show_leaf_name = True
        ts.show_branch_length = True
        ts.show_scale = False
        ts.scale = None
        if orientation == "top":
            ts.rotation = 90
        ts.branch_vertical_margin = 10  # 10 pixels between adjacent branches

        # Draw nodes with zero diameter (hidden spheres)
        nstyle = NodeStyle()
        nstyle["size"] = 0

        # Replace the node labels
        for leaf in tree:
            k = int(leaf.name)
            leaf.name = labels[k]

        # Apply node styles to nodes
        for n in tree.traverse():
            n.set_style(nstyle)

        # Convert the ETE tree to Newick
        newick = tree.write()
        f = open('C:\\Users\\Scott\\Documents\\GitHub\\d3-dendro\\newickStr.txt', 'w')
        f.write(newick)
        f.close()

        # Save the image as .png...
        from os import path, makedirs

        # Using ETE
        # (the original called an undefined `pathjoin`; use path.join)
        folder = path.join(session_manager.session_folder(),
                           constants.RESULTS_FOLDER)
        if not path.isdir(folder):
            makedirs(folder)

        # saves dendrogram as a .png with pyplot
        plt.savefig(path.join(folder, constants.DENDROGRAM_PNG_FILENAME))
        plt.close()

        # if orientation == "top":
        #     plt.figure(figsize=(20, 80))
        # else:
        #     plt.figure(figsize=(80, 20))

    (pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax,
     distanceMin, monocritMax, monocritMin,
     threshold) = utility.generateDendrogram(fileManager)
    session['dengenerated'] = True
    labels = fileManager.getActiveLabels()

    inconsistentOp = "0 " + leq + " t " + leq + " " + str(inconsistentMax)
    maxclustOp = "2 " + leq + " t " + leq + " " + str(maxclustMax)
    distanceOp = str(distanceMin) + " " + leq + " t " + leq + " " + str(distanceMax)
    monocritOp = str(monocritMin) + " " + leq + " t " + leq + " " + str(monocritMax)
    thresholdOps = {"inconsistent": inconsistentOp, "maxclust": maxclustOp,
                    "distance": distanceOp, "monocrit": monocritOp}

    managers.utility.saveFileManager(fileManager)
    session_manager.cacheAnalysisOption()
    session_manager.cacheHierarchyOption()

    import random
    ver = random.random() * 100
    return render_template('cluster.html', labels=labels,
                           pdfPageNumber=pdfPageNumber, score=score,
                           inconsistentMax=inconsistentMax,
                           maxclustMax=maxclustMax, distanceMax=distanceMax,
                           distanceMin=distanceMin, monocritMax=monocritMax,
                           monocritMin=monocritMin, threshold=threshold,
                           thresholdOps=thresholdOps, ver=ver)
def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing n_samples samples to be clustered.

    connectivity : sparse matrix, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_clusters : int, default=None
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    return_distance : bool, default=False
        If True, return the distance between the clusters.

    Returns
    -------
    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : ndarray of shape (n_nodes,) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.

    distances : ndarray of shape (n_nodes-1,)
        Only returned if `return_distance` is set to True (for compatibility).
        The distances between the centers of the nodes. `distances[i]`
        corresponds to a weighted euclidean distance between the nodes
        `children[i, 0]` and `children[i, 1]`. If the nodes refer to leaves
        of the tree, then `distances[i]` is their unweighted euclidean
        distance. Distances are updated in the following way
        (from scipy.hierarchy.linkage):

        The new entry :math:`d(u,v)` is computed as follows,

        .. math::

           d(u,v) = \\sqrt{\\frac{|v|+|s|}{T}d(v,s)^2
                           + \\frac{|v|+|t|}{T}d(v,t)^2
                           - \\frac{|v|}{T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)
        X = np.require(X, requirements="W")
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        else:
            return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
        X, connectivity, affinity='euclidean')
    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        if n_clusters > n_samples:
            raise ValueError('Cannot provide more clusters than samples. '
                             '%i n_clusters was asked, and there are %i '
                             'samples.' % (n_clusters, n_samples))
        n_nodes = 2 * n_samples - n_clusters

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp, order='C')
    coord_col = np.array(coord_col, dtype=np.intp, order='C')

    # build moments as a list
    moments_1 = np.zeros(n_nodes, order='C')
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features), order='C')
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64, order='C')
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row,
                                    coord_col, inertia)
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []
    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    not_visited = np.empty(n_nodes, dtype=np.int8, order='C')

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j] = k, k
        children.append((i, j))
        used_node[i] = used_node[j] = False
        if return_distance:  # store inertia value
            distances[k - n_samples] = inert

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # List comprehension is faster than a for loop
        [A[col].append(k) for col in coord_col]
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp, order='C')
        coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C')
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float64, order='C')

        _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row,
                                        coord_col, ini)

        # List comprehension is faster than a for loop
        [heappush(inertia, (ini[idx], k, coord_col[idx]))
            for idx in range(n_additions)]

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    # sort children to get consistent output with unstructured version
    children = [c[::-1] for c in children]
    children = np.array(children)  # return numpy array for efficient caching

    if return_distance:
        # 2 is scaling factor to compare w/ unstructured version
        distances = np.sqrt(2. * distances)
        return children, n_connected_components, n_leaves, parent, distances
    else:
        return children, n_connected_components, n_leaves, parent
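# Hedged sketch (added): a toy check of the docstring's Ward update formula
# against scipy's own output; the expected values below follow from the
# formula, not from any source in this corpus.
import numpy as np
from scipy.cluster.hierarchy import ward

X_toy = np.array([[0.0], [1.0], [5.0], [6.0]])
Z_toy = ward(X_toy)
# Singleton merges happen at their plain Euclidean distance (1.0 for {0,1}
# and {5,6}); the final merge of the two pairs lands at
# sqrt(2 * 2 * 2 / (2 + 2)) * |0.5 - 5.5| = 5 * sqrt(2) ≈ 7.0711.
print(Z_toy[:, 2])  # -> [1.0, 1.0, 7.0711...]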
def main():
    fwr = codecs.open('lemmed_cluster_no_stopwords.txt', 'w', 'utf-8')
    lst = os.listdir(pth)
    titles = []
    contents = []
    for fl in lst:
        titles.append(fl.replace('.txt', ''))
        f = codecs.open(pth + fl, 'r', 'utf-8')
        cont = f.read()
        f.close()
        contents.append(cont)

    totalvocab_tokenized = []
    for i in contents:
        allwords = tokenize_only(i)
        totalvocab_tokenized.extend(allwords)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized},
                               index=totalvocab_tokenized)
    fwr.write('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame\n')
    #vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)

    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2, use_idf=True,
                                       tokenizer=tokenize_only,
                                       ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
    for an in tfidf_matrix.shape:
        fwr.write(str(an) + '\t')
    fwr.write('\n')
    #fwr.write('\t'.join(tfidf_matrix.shape))
    terms = tfidf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(tfidf_matrix)

    num_clusters = 2
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    joblib.dump(km, 'songs_cluster.pkl')
    #km = joblib.load('songs_cluster.pkl')
    #clusters = km.labels_.tolist()

    texts = {'title': titles, 'content': contents, 'cluster': clusters}
    frame = pd.DataFrame(texts, index=[clusters], columns=['title', 'cluster'])
    fwr.write(str(frame['cluster'].value_counts()))

    fwr.write(u'Top terms per cluster:\n')
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        fwr.write("\nCluster %d words:" % i)
        for ind in order_centroids[i, :20]:
            try:
                # .ix is deprecated; use .loc for label-based lookup
                fwr.write(' %s' % vocab_frame.loc[terms[ind].split(' ')]
                          .values.tolist()[0][0] + ', ')
            except:
                pass
        fwr.write("\nCluster %d titles:" % i)
        for title in frame.loc[i]['title'].values.tolist():
            fwr.write(' %s,' % title)

    MDS()
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Visualizing document clusters
    #cluster_colors = {0: '#0000A0', 1: '#FF0000'}  # #000000, #C0C0C0
    cluster_colors = {0: '#000000', 1: '#C0C0C0'}
    cluster_names = {0: u'Meter anomaly', 1: u'Regular meter'}

    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))
    groups = df.groupby('label')
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.margins(0.2)
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,
                label=cluster_names[name], color=cluster_colors[name],
                mec='none')
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',      # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,    # ticks along the left edge are off
        top=False,     # ticks along the top edge are off
        labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point

    for i in range(len(df)):
        ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=25)
    #plt.show()
    pylab.savefig('forms_cluster.png')

    # dendrogram
    # define the linkage_matrix using ward clustering on pre-computed distances
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(12, 10))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
    plt.tick_params(
        axis='x',      # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,     # ticks along the top edge are off
        labelbottom=False)
    plt.tight_layout()  # show plot with tight layout
    plt.savefig('ward_clusters.png', dpi=200)  # save figure as ward_clusters
    fwr.close()
    return 0
from scipy.cluster.hierarchy import dendrogram, ward, leaves_list
import numpy as np
import matplotlib.pyplot as plt
import pickle

if __name__ == '__main__':
    #country_name = dict()
    country_name = []
    with open('country_vec.dump', 'rb') as feat_f:
        country_dict = pickle.load(feat_f)
    for id_num, item in enumerate(country_dict.items()):
        #country_name[item[0]] = id_num
        country_name.append(item[0])
        if id_num == 0:
            country_mat = item[1]
            continue
        country_mat = np.vstack((country_mat, item[1]))
    h_cls = ward(country_mat)
    dendrogram(h_cls, labels=country_name)
    plt.show()
# ShowCase II: tsp is good for recovering images:
elements = cv2.imread("shared/demo/zimo.jpg", flags=cv2.IMREAD_COLOR)
elements = elements[:, :, 0]
rng = default_rng(seed=1)
i = rng.permutation(np.arange(elements.shape[0]))
# j = rng.permutation(np.arange(elements.shape[0]))
elements = elements[i]
elements = elements[:, i]
# NOTE: the shuffled image above is discarded; the similarity matrix below
# replaces it
elements = np.load("shared/author_similarity_matrix.npy")
print(elements.shape)
save_pic(elements, "randomized")

for i in range(1):
    Z = hierarchy.ward(elements)
    indices = hierarchy.leaves_list(
        hierarchy.optimal_leaf_ordering(Z, elements))
    elements = elements[indices].T
    elements = elements[indices].T
    save_pic(elements, f"processed_{i}_olo")

pca = PCA(n_components=1)
pca.fit(elements)
print(pca.components_.shape)
indices = np.argsort(pca.components_.flatten())
elements = elements[indices].T
elements = elements[indices].T
return summs def similarity(data): from sklearn.metrics.pairwise import cosine_similarity sims = cosine_similarity(data) return sims if True: reduced, v = reduce_data(bdata) simplified = summ_subs(reduced) similar = similarity(simplified) from scipy.cluster.hierarchy import ward clusters = ward(similar) subnames = sorted(substance_count.keys()) subcounts = [substance_count[key] for key in subnames] if True: tree = jsontree(clusters,2*clusters.shape[0],subnames,subcounts,1,np.nan) #tree = jsontree(clusters,2*clusters.shape[0],subnames,subcounts,np.nan,1000) with open(path+"gh-pages/tagtree.json","wb") as j: import json json.dump(tree,j) if True: reduced, v = reduce_data(ldata) similar = similarity(reduced.T) from scipy.cluster.hierarchy import ward clusters = ward(similar)
def getcountry(file_name): with open(file_name, 'r') as ff: country_l = [] for ii, line in enumerate(ff): if (ii % 2) == 1: if ' ' in line.strip(): line2 = line.replace(' ', '_', 100) country_l.append(line2.strip()) else: country_l.append(line.strip()) return country_l #print(country) if __name__ == "__main__": #country_l = getcountry('../chapter09/countries2.tsv') index_file = 'country_idx' ntx_file = 'country_MTX' MT_X = Get_MT_X(ntx_file) t_i = get_t_i(index_file) #ward = cl.AgglomerativeClustering(linkage='ward').fit_predict(MT_X) linkage_matrix = ward(MT_X) # avoid shadowing the imported ward function print(linkage_matrix) dendrogram(linkage_matrix, labels=list(t_i.keys()), leaf_font_size=8) plt.show()
#!/usr/bin/env python # -*- coding: utf-8 -*- """ 98. Clustering with Ward's method Apply hierarchical clustering by Ward's method to the word vectors from exercise 96, and visualize the clustering result as a dendrogram. """ from n90 import load_model from n96 import get_country_vector import numpy as np from scipy.cluster.hierarchy import ward, dendrogram import matplotlib.pyplot as plt import sys vector = get_country_vector(load_model(sys.argv[1])) dendrogram(ward(np.array(list(vector.values()))), labels=list(vector.keys())) plt.show()
#!/usr/bin/env python3 import sys from scipy.cluster.hierarchy import ward, dendrogram, linkage, leaves_list import numpy as np from matplotlib import pyplot as plt from scipy.spatial.distance import pdist data = [] for line in open(sys.argv[1]): fields = line.rstrip("\r\n").split() gene = fields[0] cfu = fields[1] poly = fields[2] data.append([float(cfu), float(poly)]) # pdist needs numeric values, not strings #print(data) z = ward(pdist(data)) y = leaves_list(z) fig = plt.figure(figsize=(20, 10)) dn = dendrogram(z) plt.tight_layout() plt.ylabel("") plt.xlabel("") #plt.set_title("") fig.savefig("dendro.png") plt.close(fig)
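One caveat worth noting here: pdist is scale-sensitive, so when the two columns live on very different scales it may help to standardize first. A hedged sketch with toy values:

import numpy as np
from scipy.stats import zscore
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import ward

data = np.array([[1.0, 200.0], [2.0, 180.0], [1.5, 400.0], [0.5, 220.0]])
z = ward(pdist(zscore(data, axis=0)))  # standardize each column before computing distances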
# compute distance matrix distance_matrix = manhattan_distances(activities_binary_matrix) print(distance_matrix.shape) activity_names = ['Shopping', 'Antiquing', 'Site Seeing', 'Fine Dining', 'Casual Dining', 'Family Style Dining', 'Fast Food Dining', 'Museums', 'Indoor Pool', 'Outdoor Pool', 'Hiking', 'Gambling', 'Boating/Swimming', 'Fishing', 'Golfing', 'Boat Tours', 'Ride the Ducks', 'Amusement Park', 'Minigolf', 'Go-carting', 'Waterpark', 'Circus World', 'Tommy Bartlett Ski Show', 'Helicopter Rides', 'Horseback Riding', 'Stand Rock', 'Outdoor Attractions', 'Nearby Attractions', 'Movie Theater', 'Concert Theater', 'Bar/Pub Dancing', 'Shop Broadway', 'Bungee Jumping'] linkage_matrix = ward(distance_matrix) fig, ax = plt.subplots(figsize=(15, 20)) # set size ax = dendrogram(linkage_matrix, orientation="right", labels=activity_names) plt.tick_params(\ axis = 'x', # changes apply to the x-axis which = 'both', # both major and minor ticks are affected bottom = 'off', # ticks along the bottom edge are off top = 'off', # ticks along the top edge are off labelbottom = 'off') plt.tight_layout() # show plot with tight layout # route figure to external file plt.savefig('plot_hierarchical_clustering_solution.png', dpi = 200)
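Note that scipy's ward() accepts either a 2-D observation matrix or a 1-D condensed distance vector; a square distance matrix like the one above is silently treated as observations, with each row of distances taken as a feature vector. To cluster on the distances themselves, convert with squareform first, as in this hedged sketch:

import numpy as np
from sklearn.metrics.pairwise import manhattan_distances
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import ward

X = np.random.RandomState(0).rand(6, 4)      # stand-in for the binary activity matrix
D = manhattan_distances(X)                   # square, symmetric, zero diagonal
Z = ward(squareform(D, checks=False))        # 1-D condensed distances for scipy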
def ward_tree(X, connectivity=None, n_components=None, copy=True, n_clusters=None): """Ward clustering based on a Feature matrix. Recursively merges the pair of clusters that minimally increases within-cluster variance. The inertia matrix uses a Heapq-based representation. This is the structured version, that takes into account some topological structure between samples. Parameters ---------- X : array of shape (n_samples, n_features) feature matrix representing n_samples samples to be clustered connectivity : sparse matrix. connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. Default is None, i.e., the Ward algorithm is unstructured. n_components : int (optional) Number of connected components. If None the number of connected components is estimated from the connectivity matrix. copy : bool (optional) Make a copy of connectivity or work inplace. If connectivity is not of LIL type there will be a copy in any case. n_clusters : int (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. In this case, the complete tree is not computed, thus the 'children' output is of limited use, and the 'parents' output should rather be used. This option is valid only when specifying a connectivity matrix. Returns ------- children : 2D array, shape (n_nodes, 2) The children of each non-leaf node. Values less than `n_samples` refer to leaves of the tree. A greater value `i` indicates a node with children `children[i - n_samples]`. n_components : int The number of connected components in the graph. n_leaves : int The number of leaves in the tree parents : 1D array, shape (n_nodes, ) or None The parent of each node. Only returned when a connectivity matrix is specified, otherwise 'None' is returned. """ X = np.asarray(X) if X.ndim == 1: X = np.reshape(X, (-1, 1)) n_samples, n_features = X.shape if connectivity is None: if n_clusters is not None: warnings.warn('Early stopping is implemented only for ' 'structured Ward clustering (i.e. with ' 'explicit connectivity).', stacklevel=2) out = hierarchy.ward(X) children_ = out[:, :2].astype(np.int) return children_, 1, n_samples, None # Compute the number of nodes if n_components is None: n_components, labels = cs_graph_components(connectivity) # Convert connectivity matrix to LIL with a copy if needed if sparse.isspmatrix_lil(connectivity) and copy: connectivity = connectivity.copy() elif not sparse.isspmatrix(connectivity): connectivity = sparse.lil_matrix(connectivity) else: connectivity = connectivity.tolil() if n_components > 1: warnings.warn("the number of connected components of the " "connectivity matrix is %d > 1. Completing it to avoid " "stopping the tree early."
% n_components) connectivity = _fix_connectivity(X, connectivity, n_components, labels) n_components = 1 if n_clusters is None: n_nodes = 2 * n_samples - n_components else: assert n_clusters <= n_samples n_nodes = 2 * n_samples - n_clusters if (connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples): raise ValueError('Wrong shape for connectivity matrix: %s ' 'when X is %s' % (connectivity.shape, X.shape)) # create inertia matrix coord_row = [] coord_col = [] A = [] for ind, row in enumerate(connectivity.rows): A.append(row) # We keep only the upper triangular for the moments # Generator expressions are faster than arrays on the following row = [i for i in row if i < ind] coord_row.extend(len(row) * [ind, ]) coord_col.extend(row) coord_row = np.array(coord_row, dtype=np.int) coord_col = np.array(coord_col, dtype=np.int) # build moments as a list moments_1 = np.zeros(n_nodes) moments_1[:n_samples] = 1 moments_2 = np.zeros((n_nodes, n_features)) moments_2[:n_samples] = X inertia = np.empty(len(coord_row), dtype=np.float) _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) inertia = list(six.moves.zip(inertia, coord_row, coord_col)) heapify(inertia) # prepare the main fields parent = np.arange(n_nodes, dtype=np.int) heights = np.zeros(n_nodes) used_node = np.ones(n_nodes, dtype=bool) children = [] not_visited = np.empty(n_nodes, dtype=np.int8) # recursive merge loop for k in range(n_samples, n_nodes): # identify the merge while True: inert, i, j = heappop(inertia) if used_node[i] and used_node[j]: break parent[i], parent[j], heights[k] = k, k, inert children.append([i, j]) used_node[i] = used_node[j] = False # update the moments moments_1[k] = moments_1[i] + moments_1[j] moments_2[k] = moments_2[i] + moments_2[j] # update the structure matrix A and the inertia matrix coord_col = [] not_visited.fill(1) not_visited[k] = 0 _hierarchical._get_parents(A[i], coord_col, parent, not_visited) _hierarchical._get_parents(A[j], coord_col, parent, not_visited) # List comprehension is faster than a for loop [A[l].append(k) for l in coord_col] A.append(coord_col) coord_col = np.array(coord_col, dtype=np.int) coord_row = np.empty_like(coord_col) coord_row.fill(k) n_additions = len(coord_row) ini = np.empty(n_additions, dtype=np.float) _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) # List comprehension is faster than a for loop [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)] # Separate leaves in children (empty lists up to now) n_leaves = n_samples children = np.array(children) # return numpy array for efficient caching return children, n_components, n_leaves, parent
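A hedged usage sketch for the structured path (assuming this ward_tree is importable, e.g. from sklearn.cluster, and using a k-NN graph as the connectivity structure):

import numpy as np
from sklearn.cluster import ward_tree
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(20, 3)
connectivity = kneighbors_graph(X, n_neighbors=5)  # sparse k-NN connectivity graph
children, n_components, n_leaves, parents = ward_tree(X, connectivity=connectivity)
print(children.shape)  # one row per merge, as documented above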
from sklearn.metrics.pairwise import cosine_similarity from scipy.cluster.hierarchy import ward def ward_hierarchical_clustering(feature_matrix): cosine_distance = 1 - cosine_similarity(feature_matrix) linkage_matrix = ward(cosine_distance) return linkage_matrix
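E.g. (a hedged usage sketch with random data standing in for real features):

import numpy as np
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt

feature_matrix = np.random.RandomState(0).rand(8, 5)
# note: the helper hands ward() a square distance matrix, which scipy
# treats as an observation matrix (each row of distances as features)
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
dendrogram(linkage_matrix, labels=['doc%d' % i for i in range(8)])
plt.show()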
''' fig,axes = plt.subplots(2,5,subplot_kw={'xticks':(),'yticks':()},figsize=(12,4)) for center,ax in zip(km.cluster_centers_,axes.ravel()): ax.imshow(pca.inverse_transform(center).reshape(image_shape),vmin=0,vmax=1) plt.show() ''' agglomerative = AgglomerativeClustering(n_clusters=40) labels_agg = agglomerative.fit_predict(X_pca) print("Cluster sizes agglomerative clustering:{}".format( np.bincount(labels_agg))) print("ARI:{:.2f}".format(adjusted_rand_score(labels_agg, labels_km))) linkage_array = ward(X_pca) plt.figure(figsize=(20, 5)) dendrogram(linkage_array, p=7, truncate_mode='level', no_labels=True) plt.xlabel("Sample index") plt.ylabel("Cluster distance") ''' for cluster in range(max(labels)+1): mask = labels == cluster n_images = np.sum(mask) fig,axes = plt.subplots(1,n_images,figsize=(n_images*1.5,4),subplot_kw={'xticks':(),'yticks':()}) for image,label,ax in zip(X_people[mask],y_people[mask],axes): ax.imshow(image.reshape(image_shape), vmin=0, vmax=1) ax.set_title(people.target_names[label].split()[-1]) ''' n_clusters = 40
def ward_tree(X, connectivity=None, n_components=None, copy=None, n_clusters=None): """Ward clustering based on a Feature matrix. Recursively merges the pair of clusters that minimally increases within-cluster variance. The inertia matrix uses a Heapq-based representation. This is the structured version, that takes into account some topological structure between samples. Parameters ---------- X : array, shape (n_samples, n_features) feature matrix representing n_samples samples to be clustered connectivity : sparse matrix (optional). connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. Default is None, i.e., the Ward algorithm is unstructured. n_components : int (optional) Number of connected components. If None the number of connected components is estimated from the connectivity matrix. n_clusters : int (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. In this case, the complete tree is not computed, thus the 'children' output is of limited use, and the 'parents' output should rather be used. This option is valid only when specifying a connectivity matrix. Returns ------- children : 2D array, shape (n_nodes, 2) The children of each non-leaf node. Values less than `n_samples` refer to leaves of the tree. A greater value `i` indicates a node with children `children[i - n_samples]`. n_components : int The number of connected components in the graph. n_leaves : int The number of leaves in the tree parents : 1D array, shape (n_nodes, ) or None The parent of each node. Only returned when a connectivity matrix is specified, otherwise 'None' is returned. """ if copy is not None: warnings.warn("The copy argument is deprecated and will be removed " "in 0.16. The connectivity is now always copied.", DeprecationWarning) X = np.asarray(X) if X.ndim == 1: X = np.reshape(X, (-1, 1)) n_samples, n_features = X.shape if connectivity is None: from scipy.cluster import hierarchy # imports PIL if n_clusters is not None: warnings.warn('Partial build of the tree is implemented ' 'only for structured clustering (i.e. with ' 'explicit connectivity). The algorithm ' 'will build the full tree and only ' 'retain the lower branches required ' 'for the specified number of clusters', stacklevel=2) out = hierarchy.ward(X) children_ = out[:, :2].astype(np.intp) return children_, 1, n_samples, None connectivity = _fix_connectivity(X, connectivity, n_components=n_components) if n_clusters is None: n_nodes = 2 * n_samples - 1 else: if n_clusters > n_samples: raise ValueError('Cannot provide more clusters than samples. ' '%i clusters were requested, but there are only %i samples.'
% (n_clusters, n_samples)) n_nodes = 2 * n_samples - n_clusters # create inertia matrix coord_row = [] coord_col = [] A = [] for ind, row in enumerate(connectivity.rows): A.append(row) # We keep only the upper triangular for the moments # Generator expressions are faster than arrays on the following row = [i for i in row if i < ind] coord_row.extend(len(row) * [ind, ]) coord_col.extend(row) coord_row = np.array(coord_row, dtype=np.intp, order='C') coord_col = np.array(coord_col, dtype=np.intp, order='C') # build moments as a list moments_1 = np.zeros(n_nodes, order='C') moments_1[:n_samples] = 1 moments_2 = np.zeros((n_nodes, n_features), order='C') moments_2[:n_samples] = X inertia = np.empty(len(coord_row), dtype=np.float, order='C') _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) inertia = list(six.moves.zip(inertia, coord_row, coord_col)) heapify(inertia) # prepare the main fields parent = np.arange(n_nodes, dtype=np.intp) used_node = np.ones(n_nodes, dtype=bool) children = [] not_visited = np.empty(n_nodes, dtype=np.int8, order='C') # recursive merge loop for k in range(n_samples, n_nodes): # identify the merge while True: inert, i, j = heappop(inertia) if used_node[i] and used_node[j]: break parent[i], parent[j] = k, k children.append((i, j)) used_node[i] = used_node[j] = False # update the moments moments_1[k] = moments_1[i] + moments_1[j] moments_2[k] = moments_2[i] + moments_2[j] # update the structure matrix A and the inertia matrix coord_col = [] not_visited.fill(1) not_visited[k] = 0 _hierarchical._get_parents(A[i], coord_col, parent, not_visited) _hierarchical._get_parents(A[j], coord_col, parent, not_visited) # List comprehension is faster than a for loop [A[l].append(k) for l in coord_col] A.append(coord_col) coord_col = np.array(coord_col, dtype=np.intp, order='C') coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C') coord_row.fill(k) n_additions = len(coord_row) ini = np.empty(n_additions, dtype=np.float, order='C') _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) # List comprehension is faster than a for loop [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)] # Separate leaves in children (empty lists up to now) n_leaves = n_samples children = np.array(children) # return numpy array for efficient caching return children, n_components, n_leaves, parent
def ejecutarPrograma(opcion): ''' Load the files into a list ''' path = '/Users/ulysesrico/data' documents = [] titles = [] dirs = os.listdir(path) for doc in dirs: if doc.endswith('.txt'): titles.append(doc) f = open(os.path.join(path, doc), 'r') words = f.read() documents.append(words) f.close() #Build the stopword list sw = stopwords.words('spanish') #Vectorize without stopwords and build the tf-idf matrix tfidf_vectorizer = TfidfVectorizer(stop_words=sw) tfidf_matrix = tfidf_vectorizer.fit_transform(documents) #Build the vocabulary diccionario = tfidf_vectorizer.get_feature_names() print print 'Checking the size of the documents-by-terms matrix' print tfidf_matrix.shape print print print 'Cosine similarity between 2 documents (the value is 1 if they are identical)' cosine = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[99:100]) print cosine print 'Distance computation' dist = 1 - cosine print dist print print 'Angle between the documents (degrees)' angle_in_radians = math.acos(cosine[0, 0]) print math.degrees(angle_in_radians) print print 'Plotting section' print dist = 1 - cosine_similarity(tfidf_matrix) np.round(dist, 2) if opcion == 1: print 'Start' print 'Plotting document similarity with the cosine method' r = 1 d = float(2 * r * (1 - cosine[0, 0])) circle1 = plt.Circle((0, 0), r, alpha=.5) circle2 = plt.Circle((d, 0), r, alpha=.5) ## set axis limits plt.ylim([-1.1, 1.1]) plt.xlim([-1.1, 1.1 + d]) fig = plt.gcf() fig.gca().add_artist(circle1) fig.gca().add_artist(circle2) print 'End' elif opcion == 2: print 'Start' print 'Clustering of distances between documents' mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1) pos = mds.fit_transform(dist) # shape (n_samples, n_components) xs, ys = pos[:, 0], pos[:, 1] names = [os.path.basename(fn).replace('.txt', '') for fn in titles] # color-blind-friendly palette for x, y, name in zip(xs, ys, names): color = 'orange' if "d1" in name else 'blue' plt.scatter(x, y, c=color) plt.text(x, y, name) plt.show() print 'End' elif opcion == 3: print 'Start' print 'Document clustering in 3D' mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1) pos = mds.fit_transform(dist) fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2]) for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], titles): ax.text(x, y, z, s) plt.show() print 'End' else: print 'Similarity between documents (drawing the distance between them)' print 'Start' linkage_matrix = ward(dist) dendrogram(linkage_matrix, orientation="right", labels=titles) plt.tight_layout() plt.show() print 'End'
#samples = [2,3,4,5,6,7,8,9,10,11,13] samples = open("FDC.csv", "r"). read().split(",") samples = [ float(value) for value in samples] """ diff_samples = numpy.array(original_samples + [0])-numpy.array([0] + original_samples) diff_samples = list(diff_samples) difff_samples = numpy.array((diff_samples + [0])) - numpy.array([0] + diff_samples) """ #執行階層式分群作業 tsamples = numpy.array([samples]).transpose() distance = distance_matrix(tsamples, tsamples) hc = ward(distance) #print(hc) dendrogram(hc) def find_majority( array, index): if index < 5: start = 0 else: start = index - 5 end = index + 6 datas = array[start:end] counter = Counter(datas) [(majority, count)] = counter.most_common(1)
plt.show() # plots each center, the 5 closest faces to center, # and the 5 farthest in each cluster # As expected faces closer to the smoothed faces are facing # similar directions and have similar facial expressions # Faces that are far from center may have different orientations, # headwear, or facial expressions agglom = AgglomerativeClustering(n_clusters=10) labels_agg = agglom.fit_predict(X_pca) print 'Cluster sizes: {}'.format(np.bincount(labels_agg)) # Like kMeans, it creates relatively similarly sized clusters # print 'ARI: {:.2f}'.format(adjusted_rand_score(labels_agg, labels_km)) # They seem to be rather uncorrelated (0.09) linkage_arr = ward(X_pca) plt.figure(figsize=(20, 5)) dendrogram(linkage_arr, p=7, truncate_mode='level', no_labels=True) plt.xlabel('Sample index') plt.ylabel('Cluster distance') plt.show() # The plot shows branches vary in length # There does not seem to be a good cutoff for # classifying the data for cluster in range(10): mask = labels_agg == cluster fig, axes = plt.subplots(1, 10, subplot_kw={ 'xticks': (),
def ward_tree(X, connectivity=None, n_components=None, copy=True): """Ward clustering based on a Feature matrix. The inertia matrix uses a Heapq-based representation. This is the structured version, that takes into account some topological structure between samples. Parameters ---------- X : array of shape (n_samples, n_features) feature matrix representing n_samples samples to be clustered connectivity : sparse matrix. connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. Default is None, i.e., the Ward algorithm is unstructured. n_components : int (optional) Number of connected components. If None the number of connected components is estimated from the connectivity matrix. copy : bool (optional) Make a copy of connectivity or work inplace. If connectivity is not of LIL type there will be a copy in any case. Returns ------- children : list of pairs. Length-n_nodes list of the children of each node. Leaves of the tree have an empty list of children. n_components : int The number of connected components in the graph. n_leaves : int The number of leaves in the tree """ X = np.asarray(X) if X.ndim == 1: X = np.reshape(X, (-1, 1)) n_samples, n_features = X.shape if connectivity is None: out = hierarchy.ward(X) children_ = out[:, :2].astype(np.int) return children_, 1, n_samples # Compute the number of nodes if n_components is None: n_components, labels = cs_graph_components(connectivity) # Convert connectivity matrix to LIL with a copy if needed if sparse.isspmatrix_lil(connectivity) and copy: connectivity = connectivity.copy() else: connectivity = connectivity.tolil() if n_components > 1: warnings.warn("the number of connected components of the" " connectivity matrix is %d > 1. Completing it to avoid" " stopping the tree early."
% n_components) connectivity = _fix_connectivity(X, connectivity, n_components, labels) n_components = 1 n_nodes = 2 * n_samples - n_components if (connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples): raise ValueError('Wrong shape for connectivity matrix: %s ' 'when X is %s' % (connectivity.shape, X.shape)) # Remove diagonal from connectivity matrix connectivity.setdiag(np.zeros(connectivity.shape[0])) # create inertia matrix coord_row = [] coord_col = [] A = [] for ind, row in enumerate(connectivity.rows): A.append(row) # We keep only the upper triangular for the moments # Generator expressions are faster than arrays on the following row = [i for i in row if i < ind] coord_row.extend(len(row) * [ind, ]) coord_col.extend(row) coord_row = np.array(coord_row, dtype=np.int) coord_col = np.array(coord_col, dtype=np.int) # build moments as a list moments_1 = np.zeros(n_nodes) moments_1[:n_samples] = 1 moments_2 = np.zeros((n_nodes, n_features)) moments_2[:n_samples] = X inertia = np.empty(len(coord_row), dtype=np.float) _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) inertia = zip(inertia, coord_row, coord_col) heapify(inertia) # prepare the main fields parent = np.arange(n_nodes, dtype=np.int) heights = np.zeros(n_nodes) used_node = np.ones(n_nodes, dtype=bool) children = [] visited = np.empty(n_nodes, dtype=bool) # recursive merge loop for k in xrange(n_samples, n_nodes): # identify the merge while True: inert, i, j = heappop(inertia) if used_node[i] and used_node[j]: break parent[i], parent[j], heights[k] = k, k, inert children.append([i, j]) used_node[i] = used_node[j] = False # update the moments moments_1[k] = moments_1[i] + moments_1[j] moments_2[k] = moments_2[i] + moments_2[j] # update the structure matrix A and the inertia matrix coord_col = [] visited[:] = False visited[k] = True for l in set(A[i]).union(A[j]): l = _hierarchical._get_parent(l, parent) if not visited[l]: visited[l] = True coord_col.append(l) A[l].append(k) A.append(coord_col) coord_col = np.array(coord_col, dtype=np.int) coord_row = np.empty_like(coord_col) coord_row.fill(k) ini = np.empty(len(coord_row), dtype=np.float) _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) for tupl in itertools.izip(ini, coord_row, coord_col): heappush(inertia, tupl) # Separate leaves in children (empty lists up to now) n_leaves = n_samples children = np.array(children) # return numpy array for efficient caching return children, n_components, n_leaves
def missingVals(self): n = self.X_nan_bool_df.shape[0] if np.sum(self.X_nan_bool_df.to_numpy().ravel()) == 0: print('no missing values found') return nan_01 = self.X_nan_bool_df.to_numpy().astype(np.int16) feature_names = self.X_nan_bool_df.columns.to_list() feature_idx = np.arange(len(feature_names)) #nan_bool_stack=self.X_nan_bool_df.reset_index(drop=True,inplace=False).to_numpy().astype(np.uint8) plt.rcParams['font.size'] = '8' fig, (ax0, ax1, ax2, ax3) = plt.subplots(4, 1, figsize=(12, 16), dpi=200) feat_miss_count_ser = self.X_nan_bool_df.astype(np.int16).sum(axis=0) feat_miss_count_ser.plot.bar(ax=ax0, ) ax0.set_title('Missing Data Counts by Feature') pct_missing_list = [ f'{round(pct)}%' for pct in (100 * feat_miss_count_ser / n).tolist() ] self.addAnnotations(ax0, pct_missing_list) row_miss_count_ser = self.X_nan_bool_df.astype(np.int16).sum(axis=1) ax1.bar(np.arange(n), row_miss_count_ser.to_numpy(), width=1) ax1.set_title('Missing Data Counts by Row') nan_01_sum = nan_01.sum(axis=0) has_nan_features = nan_01_sum > 0 nan_01_hasnan = nan_01[:, has_nan_features] hasnan_features = [ name for i, name in enumerate(feature_names) if has_nan_features[i] ] nan_corr = self.pearsonCorrelationMatrix(nan_01_hasnan) nan_corr_df = pd.DataFrame(data=nan_corr, columns=hasnan_features) self.nan_corr = nan_corr self.nan_corr_df = nan_corr_df corr_linkage = hierarchy.ward(nan_corr) dendro = hierarchy.dendrogram( #just used for ordering the features by the grouping corr_linkage, labels=hasnan_features, ax=None, no_plot=True, leaf_rotation=90) ax2.imshow(nan_01, aspect='auto', interpolation='none', cmap='plasma') colors = [plt.get_cmap('plasma')(value) for value in [255]] labels = ['missing data'] patches = [Patch(color=colors[i], label=labels[i]) for i in [0]] ax2.legend(handles=patches, bbox_to_anchor=(0, 1.1), loc=9, ncol=2, fontsize='large') ax2.set_xticks(feature_idx) ax2.set_xticklabels(feature_names, rotation='vertical', fontsize=6) ax2.set_title('Missing Data Layout') cp = ax3.imshow(nan_corr[dendro['leaves'], :][:, dendro['leaves']], aspect='equal', interpolation='none') fig.colorbar(cp, shrink=0.5) hasnan_feature_idx = np.arange(len(hasnan_features)) ax3.set_yticks(hasnan_feature_idx) ax3.set_xticks(hasnan_feature_idx) ax3.set_xticklabels(dendro['ivl'], rotation='vertical', fontsize=6) ax3.set_yticklabels(dendro['ivl'], fontsize=6) ax3.set_title('Missing Data Clustering Across Features') fig.tight_layout()
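To act on that grouping (e.g. keep one representative per cluster of co-missing features), the same linkage can be cut with fcluster. A hedged, self-contained sketch with a stand-in correlation matrix:

import numpy as np
from scipy.cluster import hierarchy

nan_corr = np.corrcoef(np.random.RandomState(0).rand(20, 6), rowvar=False)  # stand-in
Z = hierarchy.ward(nan_corr)  # same call pattern as above
cluster_ids = hierarchy.fcluster(Z, t=1.0, criterion='distance')
print(cluster_ids)  # one label per feature; the threshold t is arbitrary here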
def ward_tree(X, connectivity=None, n_components=None, n_clusters=None, return_distance=False): """Ward clustering based on a Feature matrix. Recursively merges the pair of clusters that minimally increases within-cluster variance. The inertia matrix uses a Heapq-based representation. This is the structured version, that takes into account some topological structure between samples. Read more in the :ref:`User Guide <hierarchical_clustering>`. Parameters ---------- X : array, shape (n_samples, n_features) feature matrix representing n_samples samples to be clustered connectivity : sparse matrix (optional). connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. Default is None, i.e, the Ward algorithm is unstructured. n_components : int (optional) Number of connected components. If None the number of connected components is estimated from the connectivity matrix. NOTE: This parameter is now directly determined directly from the connectivity matrix and will be removed in 0.18 n_clusters : int (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. In this case, the complete tree is not computed, thus the 'children' output is of limited use, and the 'parents' output should rather be used. This option is valid only when specifying a connectivity matrix. return_distance: bool (optional) If True, return the distance between the clusters. Returns ------- children : 2D array, shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. A node `i` greater than or equal to `n_samples` is a non-leaf node and has children `children_[i - n_samples]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_samples + i` n_components : int The number of connected components in the graph. n_leaves : int The number of leaves in the tree parents : 1D array, shape (n_nodes, ) or None The parent of each node. Only returned when a connectivity matrix is specified, elsewhere 'None' is returned. distances : 1D array, shape (n_nodes-1, ) Only returned if return_distance is set to True (for compatibility). The distances between the centers of the nodes. `distances[i]` corresponds to a weighted euclidean distance between the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to leaves of the tree, then `distances[i]` is their unweighted euclidean distance. Distances are updated in the following way (from scipy.hierarchy.linkage): The new entry :math:`d(u,v)` is computed as follows, .. math:: d(u,v) = \\sqrt{\\frac{|v|+|s|} {T}d(v,s)^2 + \\frac{|v|+|t|} {T}d(v,t)^2 - \\frac{|v|} {T}d(s,t)^2} where :math:`u` is the newly joined cluster consisting of clusters :math:`s` and :math:`t`, :math:`v` is an unused cluster in the forest, :math:`T=|v|+|s|+|t|`, and :math:`|*|` is the cardinality of its argument. This is also known as the incremental algorithm. """ X = np.asarray(X) if X.ndim == 1: X = np.reshape(X, (-1, 1)) n_samples, n_features = X.shape if connectivity is None: from scipy.cluster import hierarchy # imports PIL if n_clusters is not None: warnings.warn('Partial build of the tree is implemented ' 'only for structured clustering (i.e. with ' 'explicit connectivity). 
The algorithm ' 'will build the full tree and only ' 'retain the lower branches required ' 'for the specified number of clusters', stacklevel=2) out = hierarchy.ward(X) children_ = out[:, :2].astype(np.intp) if return_distance: distances = out[:, 2] return children_, 1, n_samples, None, distances else: return children_, 1, n_samples, None if n_components is not None: warnings.warn( "n_components is now directly calculated from the connectivity " "matrix and will be removed in 0.18", DeprecationWarning) connectivity, n_components = _fix_connectivity(X, connectivity) if n_clusters is None: n_nodes = 2 * n_samples - 1 else: if n_clusters > n_samples: raise ValueError('Cannot provide more clusters than samples. ' '%i n_clusters was asked, and there are %i samples.' % (n_clusters, n_samples)) n_nodes = 2 * n_samples - n_clusters # create inertia matrix coord_row = [] coord_col = [] A = [] for ind, row in enumerate(connectivity.rows): A.append(row) # We keep only the upper triangular for the moments # Generator expressions are faster than arrays on the following row = [i for i in row if i < ind] coord_row.extend(len(row) * [ind, ]) coord_col.extend(row) coord_row = np.array(coord_row, dtype=np.intp, order='C') coord_col = np.array(coord_col, dtype=np.intp, order='C') # build moments as a list moments_1 = np.zeros(n_nodes, order='C') moments_1[:n_samples] = 1 moments_2 = np.zeros((n_nodes, n_features), order='C') moments_2[:n_samples] = X inertia = np.empty(len(coord_row), dtype=np.float64, order='C') _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) inertia = list(six.moves.zip(inertia, coord_row, coord_col)) heapify(inertia) # prepare the main fields parent = np.arange(n_nodes, dtype=np.intp) used_node = np.ones(n_nodes, dtype=bool) children = [] if return_distance: distances = np.empty(n_nodes - n_samples) not_visited = np.empty(n_nodes, dtype=np.int8, order='C') # recursive merge loop for k in range(n_samples, n_nodes): # identify the merge while True: inert, i, j = heappop(inertia) if used_node[i] and used_node[j]: break parent[i], parent[j] = k, k children.append((i, j)) used_node[i] = used_node[j] = False if return_distance: # store inertia value distances[k - n_samples] = inert # update the moments moments_1[k] = moments_1[i] + moments_1[j] moments_2[k] = moments_2[i] + moments_2[j] # update the structure matrix A and the inertia matrix coord_col = [] not_visited.fill(1) not_visited[k] = 0 _hierarchical._get_parents(A[i], coord_col, parent, not_visited) _hierarchical._get_parents(A[j], coord_col, parent, not_visited) # List comprehension is faster than a for loop [A[l].append(k) for l in coord_col] A.append(coord_col) coord_col = np.array(coord_col, dtype=np.intp, order='C') coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C') coord_row.fill(k) n_additions = len(coord_row) ini = np.empty(n_additions, dtype=np.float64, order='C') _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) # List comprehension is faster than a for loop [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)] # Separate leaves in children (empty lists up to now) n_leaves = n_samples # sort children to get consistent output with unstructured version children = [c[::-1] for c in children] children = np.array(children) # return numpy array for efficient caching if return_distance: # 2 is scaling factor to compare w/ unstructured version distances = np.sqrt(2. 
* distances) return children, n_components, n_leaves, parent, distances else: return children, n_components, n_leaves, parent
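A hedged sanity check of that sqrt(2) rescaling at the end (assuming this ward_tree is importable from sklearn.cluster): with a fully connected graph, the structured merge heights should line up with scipy's unstructured ones.

import numpy as np
from scipy.cluster import hierarchy
from scipy.sparse import lil_matrix
from sklearn.cluster import ward_tree

X = np.random.RandomState(0).rand(10, 3)
out = hierarchy.ward(X)                       # unstructured linkage; column 2 holds merge heights
connectivity = lil_matrix(np.ones((10, 10)))  # full connectivity, as in the test above
*_, distances = ward_tree(X, connectivity=connectivity, return_distance=True)
print(np.allclose(np.sort(distances), np.sort(out[:, 2])))  # expected: True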
if not kwargs.get('no_plot', False): for i, d in zip(ddata['icoord'], ddata['dcoord']): x = 0.5 * sum(i[1:3]) y = d[1] plt.plot(x, y, 'ro') plt.annotate("%.3g" % y, (x, y), xytext=(0, -8), textcoords='offset points', va='top', ha='center') return ddata #Ward's method wx = ward(wi) print('------------------------------------------') print('Distance matrix: ') print('------------------------------------------') print(wi) print('------------------------------------------') print('Matrix after clustering (linkage matrix): ') print('------------------------------------------') print(wx) ddata = augmented_dendrogram(wx, color_threshold=5, truncate_mode='lastp') hierarchy.dendrogram(wx, leaf_rotation=90, leaf_font_size=8, labels=data.index, color_threshold=10)
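For context, a hedged reconstruction of the full helper the fragment above is the tail of (the standard annotated-dendrogram recipe, labeling each merge with its height):

from scipy.cluster import hierarchy
import matplotlib.pyplot as plt

def augmented_dendrogram(*args, **kwargs):
    # draw the dendrogram, then annotate each merge point with its height
    ddata = hierarchy.dendrogram(*args, **kwargs)
    if not kwargs.get('no_plot', False):
        for i, d in zip(ddata['icoord'], ddata['dcoord']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            plt.plot(x, y, 'ro')
            plt.annotate("%.3g" % y, (x, y), xytext=(0, -8),
                         textcoords='offset points', va='top', ha='center')
    return ddata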
def repub_debate(): if len(sys.argv) < 2: print ("Run: python repub_debate.py < Input csv> ") sys.exit(1) data = pd.read_csv(sys.argv[1]) print(data) #data = data [~data.Speaker.isin(['MALE','SANTELLI','(UNKNOWN)','UNIDENTIFIED MALE','HARMAN', 'HARWOOD','CRAMER','EPPERSON','QUICK','QUINTANILLA'])] #Filter list for 4th republican debate data = data [~data.Speaker.isin(['MALE','BAKER','(UNKNOWN)','UNIDENTIFIED MALE','CAVUTO', 'BARTIROMO'])] print (('Unique Speakers: ', sorted(list(data.Speaker.unique())))) #Count the number of words each speaker spoke def countWords(speaker): speakerData = data[data.Speaker == speaker] allText = "" for index, row in speakerData.iterrows(): allText += str(row['Text'])+" " words_all = len(allText.split()) print (('Total words: ',speaker,': ', words_all)) for name in data.Speaker.unique(): countWords(name); def generatewordcloud(speaker, inputImageFileName, outputImageFileName): speakerData = data[data.Speaker == speaker] allText = "" for index, row in speakerData.iterrows(): allText += str(row['Text'])+" " #print (allText) ImageFile.LOAD_TRUNCATED_IMAGES = True img = Image.open(inputImageFileName) img = img.resize((980,1080), Image.ANTIALIAS) speakerArray = np.array(img) sl = STOPWORDS | stopwordshearing wc = WordCloud(background_color="white", max_words=500, mask=speakerArray, stopwords=sl) wc.generate(allText) # create coloring from image image_colors = ImageColorGenerator(speakerArray) wc.recolor(color_func=image_colors) wc.to_file(outputImageFileName) #Commenting out generating word cloud as I am testin gsomething else now # generatewordcloud('KASICH', "images/kasich.png", "images/wc_kasich.png"); # generatewordcloud("HUCKABEE", "images/huckabee.png", "images/wc_huckabee.png"); # generatewordcloud("BUSH", "images/bush.png", "images/wc_bush.png"); # generatewordcloud("RUBIO", "images/rubio.png", "images/wc_rubio.png"); # generatewordcloud("TRUMP", "images/trump.png", "images/wc_trump.png"); # generatewordcloud("CARSON", "images/carson.png", "images/wc_carson.png"); # generatewordcloud("FIORINA", "images/fiorina.png", "images/wc_fiorina.png"); # generatewordcloud("CRUZ", "images/cruz.png", "images/wc_cruz.png"); # generatewordcloud("CHRISTIE", "images/christie.png", "images/wc_christie.png"); # generatewordcloud("PAUL", "images/paul.png", "images/wc_paul.png"); def generateoverallwordcloud(inputImageFileName, outputImageFileName): allText = "" for index, row in data.iterrows(): allText += str(row['Text'])+" " #print (allText) ImageFile.LOAD_TRUNCATED_IMAGES = True img = Image.open(inputImageFileName) img = img.resize((980,1080), Image.ANTIALIAS) speakerArray = np.array(img) sl = STOPWORDS | stopwordshearing wc = WordCloud(background_color="white", max_words=500, mask=speakerArray, stopwords=sl) wc.generate(allText) # create coloring from image image_colors = ImageColorGenerator(speakerArray) wc.recolor(color_func=image_colors) wc.to_file(outputImageFileName) #generateoverallwordcloud("images/RepublicanLogo.png", "images/wc_rep_debate3.png"); #Count the number of words by each party member def getWords(speaker): global stopwordshearing speakerData = data[data.Speaker == speaker] allText = "" for index, row in speakerData.iterrows(): #s.translate(table, string.punctuation) allText += str(row['Text']).lower().translate(table)+" " allText = allText.replace("e-mail","email") allText = allText.replace("e- mail","email") allText = allText.replace("op-ed","oped") sl = STOPWORDS | stopwordshearing wc = WordCloud(background_color="white", max_words=2000, stopwords=sl, 
random_state=42) wc.generate(allText) wcdf = pd.DataFrame(wc.words_) wcdf.columns = ["word",speaker] return wcdf #Count the number of words in the entire transcript def getTotalWords(): global stopwordshearing speakerData = data allText = "" for index, row in speakerData.iterrows(): #s.translate(table, string.punctuation) allText += str(row['Text']).lower().translate(table)+" " allText = allText.replace("e-mail","email") allText = allText.replace("e- mail","email") allText = allText.replace("op-ed","oped") sl = STOPWORDS | stopwordshearing wc = WordCloud(background_color="white", max_words=2000, stopwords=sl, random_state=42) wc.generate(allText) wcdf = pd.DataFrame(wc.words_) wcdf.columns = ["word","Total"] return wcdf # Separate dataframes by Republican and Democrat's word frequencies df_dict ={} i=1 for name in data.Speaker.unique(): df_dict[name] = getWords(name) #print df_dict[name].head() if i == 1: rdwc = df_dict[name] else: rdwc = pd.merge(rdwc, df_dict[name], on = "word", how='outer') i += 1 df_dict["Total"] = getTotalWords() rdwc = pd.merge(rdwc,df_dict["Total"], on = "word", how='outer') print (rdwc.head()) rdwc=rdwc.fillna(0) rdwc.to_csv("wordfreq.csv") def getAllText(speaker): global stopwordshearing speakerData = data[data.Speaker == speaker] allText = "" for index, row in speakerData.iterrows(): #s.translate(table, string.punctuation) allText += str(row['Text']).lower().translate(table)+" " allText = allText.replace("e-mail","email") allText = allText.replace("e- mail","email") allText = allText.replace("op-ed","oped") return allText #Calculate using countvectorizer and also calculate the consine similarities df_list =[] speaker_list=[] i=1 for name in data.Speaker.unique(): df_list.append(getAllText(name)) speaker_list.append(name) #print(df_dict) vectorizer = CountVectorizer(input='content',stop_words=stop_words) dtm = vectorizer.fit_transform(df_list) vocab = vectorizer.get_feature_names() dtm = dtm.toarray() vocab = np.array(vocab) dist = 1 - cosine_similarity(dtm) np.round(dist, 2) print(dist[0,1]) print(dist[0,2]) mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1) pos = mds.fit_transform(dist) # shape (n_components, n_samples) xs, ys = pos[:, 0], pos[:, 1] for x, y, name in zip(xs, ys, speaker_list): color = 'orange' if "CLINTON" in name else 'skyblue' plt.scatter(x, y, c=color) plt.text(x, y, name) plt.show() mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1) pos = mds.fit_transform(dist) from mpl_toolkits.mplot3d import Axes3D fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2]) for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], speaker_list): ax.text(x, y, z, s) plt.show() from scipy.cluster.hierarchy import ward, dendrogram linkage_matrix = ward(dist) names = speaker_list dendrogram(linkage_matrix, labels=names) plt.tight_layout() plt.show()
def cluster_dendogram( corpus: List[str], vectorizer, titles: List[str] = None, stemming: Callable = sastrawi, stop_words: List[str] = None, cleaning: Callable = simple_textcleaning, random_samples: float = 0.3, ngram: Tuple[int, int] = (1, 3), figsize: Tuple[int, int] = (17, 9), batch_size: int = 20, ): """ Plot a hierarchical dendrogram of similar texts. Parameters ---------- corpus: List[str] vectorizer: class vectorizer class. titles: List[str], (default=None) list of titles; length must match the corpus. stemming: function, (default=sastrawi) function to stem the corpus. stop_words: List[str], (default=None) list of stop words to remove. If None, default is malaya.texts._text_functions.STOPWORDS cleaning: function, (default=simple_textcleaning) function to clean the corpus. random_samples: float, (default=0.3) random samples from the corpus, 0.3 means 30%. ngram: Tuple[int, int], (default=(1,3)) n-gram sizes to train on the corpus. figsize: Tuple[int, int], (default=(17, 9)) matplotlib figure size. batch_size: int, (default=20) size of strings for each vectorization and attention. Only useful if using a transformer vectorizer. Returns ------- dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles} """ if not isinstance(stemming, collections.Callable) and stemming is not None: raise ValueError('stemming must be a callable type or None') if titles: if len(titles) != len(corpus): raise ValueError('length of titles must match the corpus') if not hasattr(vectorizer, 'vectorize') and not hasattr(vectorizer, 'fit'): raise ValueError('vectorizer must have a `fit` or `vectorize` method') if not (random_samples < 1 and random_samples > 0): raise ValueError('random_samples must be between 0 and 1') try: import matplotlib.pyplot as plt import seaborn as sns from scipy.cluster.hierarchy import ward, dendrogram sns.set() except: raise Exception( 'matplotlib and seaborn not installed. Please install them and try again.'
) if stop_words is None: stop_words = STOPWORDS corpus = random.sample(corpus, k = int(random_samples * len(corpus))) if cleaning is not None: for i in range(len(corpus)): corpus[i] = cleaning(corpus[i]) if stemming: for i in range(len(corpus)): corpus[i] = stemming(corpus[i]) text_clean = [] for text in corpus: text_clean.append( ' '.join([word for word in text.split() if word not in stop_words]) ) if hasattr(vectorizer, 'fit'): vectorizer.fit(text_clean) transformed_text_clean = vectorizer.transform(text_clean) features = vectorizer.get_feature_names() else: transformed_text_clean, attentions = [], [] for i in range(0, len(text_clean), batch_size): index = min(i + batch_size, len(text_clean)) transformed_text_clean.append( vectorizer.vectorize(text_clean[i:index]) ) attentions.extend(vectorizer.attention(text_clean[i:index])) transformed_text_clean = np.concatenate( transformed_text_clean, axis = 0 ) dist = 1 - cosine_similarity(transformed_text_clean) linkage_matrix = ward(dist) if not titles: titles = [] for i in range(transformed_text_clean.shape[0]): if hasattr(vectorizer, 'fit'): indices = np.argsort( np.array(transformed_text_clean[i].todense())[0] )[::-1] titles.append( ' '.join([features[i] for i in indices[: ngram[1]]]) ) else: attentions[i].sort(key = lambda x: x[1]) titles.append( ' '.join([i[0] for i in attentions[i][-ngram[1] :]]) ) plt.figure(figsize = figsize) ax = dendrogram(linkage_matrix, orientation = 'right', labels = titles) plt.tick_params( axis = 'x', which = 'both', bottom = 'off', top = 'off', labelbottom = 'off', ) plt.tight_layout() plt.show() return {'linkage_matrix': linkage_matrix, 'titles': titles}
import numpy as np from scipy.cluster.hierarchy import dendrogram, ward from matplotlib.pyplot import show from gensim.models import word2vec model = word2vec.Word2Vec.load("knock90_word2vec") country_list = list() vector_list = list() for country in open('country_list.txt'): country = country.strip('\n') if country in model: country_list.append(country) vector_list.append(model[country]) features = np.array(vector_list) clustering = ward(features) dendrogram(clustering, labels = country_list, orientation='left', leaf_font_size=10) show()
for line in file: # remove linebreak which is the last character of the string currentPlace = line[:-1] # add item to the list features.append(currentPlace) dist = 1 - cosine_similarity(X_train_vectorised) import matplotlib.pyplot as plt from scipy.cluster.hierarchy import ward, dendrogram linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances ''' fig, ax = plt.subplots(figsize=(100, 200)) # set size ax = dendrogram(linkage_matrix, orientation="right", labels=title); plt.tick_params(\ axis= 'x', # changes apply to the x-axis which='both', # both major and minor ticks are affected bottom='off', # ticks along the bottom edge are off top='off', # ticks along the top edge are off labelbottom='off') plt.tight_layout() #show plot with tight layout ''' c = list(range(2, 14)) clusters = [create_clusters(cl) for cl in c] # Logistic regression