def main(): # read the preprocessed data tfidf_matrix = load_sparse_csr() vocabulary = load_labels() # k-means clustering num_clusters = 6 km = KMeans(n_clusters=num_clusters, n_jobs=-1) km.fit(tfidf_matrix) # visualize the generated clusters visualize_clusters(tfidf_matrix, vocabulary, km)
# -*- coding: utf-8 -*- __author__ = 'Marco' from matplotlib import pyplot as plt from scipy.cluster.hierarchy import dendrogram, linkage import numpy as np from scipy.cluster.hierarchy import cophenet from scipy.spatial.distance import pdist from preprocessing import load_sparse_csr X = load_sparse_csr() # generate the linkage matrix Z = linkage(X.toarray(),'ward') # calculate full dendrogram plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('Job') plt.ylabel('distance') dendrogram( Z, leaf_rotation=180., # rotates the x axis labels leaf_font_size=8., # font size for the x axis labels ) plt.show()