def Agglomerative(input_data, index_to_check):
    # ros = RandomOverSampler(random_state=0)
    # X, y = split_train_test(input_data, index_to_check)
    y = k_to_one(input_data[:, 7:10])
    print(y)
    X = input_data[:, :7]
    # X = StandardScaler().fit_transform(X)
    # X, y = ros.fit_sample(X, y)
    U, S, V = svd(X, full_matrices=False)
    # Project onto the 2nd and 3rd principal directions (rows 1 and 2 of V)
    datamatrix_projected = np.dot(X, V[1:3].T)
    N, M = X.shape

    # Perform hierarchical/agglomerative clustering on data matrix
    Maxclust = 6
    Methods = ['average', 'complete', 'single']
    Metrics = ['mahalanobis', 'euclidean']
    fignumber = 1
    for Method in Methods:
        for Metric in Metrics:
            Z = linkage(X, method=Method, metric=Metric)
            # Compute and display clusters by thresholding the dendrogram
            cls = fcluster(Z, criterion='maxclust', t=Maxclust)
            figure(fignumber)
            fignumber += 1
            clusterplot(datamatrix_projected, cls.reshape(cls.shape[0], 1), y=y)
            title(Method + ' ' + Metric)
            # Display dendrogram
            # max_display_levels = 6
            # figure(fignumber, figsize=(10, 4))
            # fignumber += 1
            # dendrogram(Z, truncate_mode='level', p=max_display_levels)
    show()
    print('Ran Exercise 10.2.1')
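# Agglomerative() above relies on a helper k_to_one() that collapses the
# one-hot-encoded class columns (here columns 7:10) into a single label vector.
# Its definition is not included in this collection; a minimal sketch of what
# it presumably does, consistent with the np.argmax usage in draw_GMM() later
# in this file (the name and exact behavior are assumptions):
import numpy as np

def k_to_one(one_hot):
    # Map each one-hot row, e.g. [0, 1, 0], to the index of its active column.
    return np.argmax(one_hot, axis=1)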
def hierarchical_cluster(remove_doc_index=None):
    '''
    :param remove_doc_index: The index of a doc instance to be removed.
                             Used for removing a potential outlier.
    :return:
    '''
    # `import *` is a SyntaxError inside a function; import the names this code uses
    from Reading_data import X, y

    # Normalize data
    X = stats.zscore(X)

    # Shuffle data
    # X_sparse = coo_matrix(X)
    # X, X_sparse, y = shuffle(X, X_sparse, y)

    if remove_doc_index is not None:
        X = np.delete(X, remove_doc_index, axis=0)
        y = np.delete(y, remove_doc_index, None)
    y = y.T

    # Perform hierarchical/agglomerative clustering on data matrix
    methods = ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']
    metrics = ['euclidean', 'cityblock', 'cosine']
    Method = 'single'
    Metric = 'euclidean'
    Maxclust = 2
    Z = linkage(X, method=Method, metric=Metric)

    # Compute and display clusters by thresholding the dendrogram
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    F_Measure = mcs.f1_score(y, cls, average='weighted')

    figure(1)
    # If data is more than 2-dimensional it should first be projected
    # onto the first two principal components
    pca = PCA(n_components=2)
    X_r = pca.fit(X).transform(X)
    clusterplot(X_r, cls.reshape(cls.shape[0], 1), y=y, x_label="PCA1", y_label="PCA2")

    # Display dendrogram
    max_display_levels = 20
    figure(2)
    xlabel("doc instance")
    ylabel('Distance')
    title('F Measure: {0}'.format(round(F_Measure, 2)))
    dendrogram(Z, truncate_mode='level', p=max_display_levels)
    show()
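# A note on the F-measure above: fcluster() numbers clusters 1..Maxclust, while
# class labels in y typically start at 0, so f1_score() compares misaligned
# label sets. A hedged sketch of one common fix -- relabel each cluster by the
# majority class it contains before scoring. The helper name is an assumption,
# and mcs is presumed to alias sklearn.metrics:
import numpy as np
from sklearn import metrics as mcs

def align_cluster_labels(y_true, cls):
    # Replace every cluster id with the most frequent true label inside it.
    y_true = np.asarray(y_true).ravel().astype(int)
    aligned = np.empty_like(y_true)
    for c in np.unique(cls):
        mask = (cls == c)
        aligned[mask] = np.bincount(y_true[mask]).argmax()
    return aligned

# Usage: F_Measure = mcs.f1_score(y, align_cluster_labels(y, cls), average='weighted')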
def _clustering(self):
    # Get data
    print('Get cluster data..')
    H = np.asmatrix(np.loadtxt(factoredHMatrix)).T
    words = set(open(attributFile).read().split())
    y = list(range(len(words)))

    # Clustering parameters
    clusterNumber = 4
    runNumber = 10
    N, M = H.shape

    print('Calculate k-means..')
    # K-means clustering:
    centroids, cls, inertia = k_means(H, clusterNumber, n_init=runNumber)

    print('Plotting results..')
    # Plot results:
    figure(figsize=(14, 9))
    clusterplot(H, cls, centroids, y)
    show()
K_optimal = KRange[index_of_max]
print('The optimal number of clusters, according to GMM cross-validation, is {}'.format(K_optimal))

# Fit best Gaussian mixture model to X and plot result
gmm = GaussianMixture(n_components=K_optimal, covariance_type=covar_type,
                      n_init=reps).fit(X)
cls = gmm.predict(X)      # extract cluster labels
cds = gmm.means_          # extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

plt.figure(figsize=(12, 9))
clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
plt.title('Gaussian Mixture Model using {} clusters'.format(K_optimal))
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

# Evaluate GMM model
Rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(y, cls)

print('###################################################')
print('#            HIERARCHICAL CLUSTERING              #')
print('###################################################')

Metric = 'euclidean'
Maxclust = K_optimal
max_display_levels = K_optimal
# We will try all of these linkage methods
Methods = ['single', 'complete', 'average', 'weighted', 'median', 'ward']
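# KRange and index_of_max in the snippet above come from a selection step that
# is not shown here. A minimal hedged sketch of one way they could be produced,
# assuming K is chosen by maximizing the cross-validated log-likelihood of a
# GaussianMixture (the KFold setup and candidate range are assumptions;
# covar_type and reps follow the snippet):
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold

KRange = range(1, 11)
scores = []
for K in KRange:
    fold_scores = []
    for train_idx, test_idx in KFold(n_splits=10, shuffle=True).split(X):
        gmm_k = GaussianMixture(n_components=K, covariance_type=covar_type,
                                n_init=reps).fit(X[train_idx])
        # score() returns the mean per-sample log-likelihood on held-out data
        fold_scores.append(gmm_k.score(X[test_idx]))
    scores.append(np.mean(fold_scores))
index_of_max = int(np.argmax(scores))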
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Load Matlab data file and extract variables of interest
mat_data = loadmat('../Data/synth1.mat')
X = np.matrix(mat_data['X'])
y = np.matrix(mat_data['y'])
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Perform hierarchical/agglomerative clustering on data matrix
Method = 'single'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)

# Compute and display clusters by thresholding the dendrogram
Maxclust = 4
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
figure(1)
clusterplot(X, cls.reshape(cls.shape[0], 1), y=y)

# Display dendrogram
max_display_levels = 6
figure(2)
dendrogram(Z, truncate_mode='level', p=max_display_levels)
show()
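# fcluster() above cuts the tree into a fixed number of clusters. The same
# linkage matrix can instead be cut at a cophenetic-distance threshold; a small
# sketch on the Z computed above (the threshold value 2.0 is an arbitrary example):
cls_by_distance = fcluster(Z, criterion='distance', t=2.0)
print('Clusters at distance threshold 2.0:', len(set(cls_by_distance)))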
# exercise 10.1.1
from pylab import *
from scipy.io import loadmat
from toolbox_02450 import clusterplot
# sklearn.mixture.GMM was removed in scikit-learn 0.20; GaussianMixture replaces it
from sklearn.mixture import GaussianMixture

# Load Matlab data file and extract variables of interest
mat_data = loadmat('../Data/synth1.mat')
X = np.matrix(mat_data['X'])
y = np.matrix(mat_data['y'])
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Number of clusters
K = 4
cov_type = 'diag'  # type of covariance; you can try out 'full' as well
reps = 1           # number of fits with different initializations, best result will be kept

# Fit Gaussian mixture model
gmm = GaussianMixture(n_components=K, covariance_type=cov_type, n_init=reps).fit(X)
cls = gmm.predict(X)      # extract cluster labels
cds = gmm.means_          # extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

if cov_type == 'diag':
    # Expand the per-component diagonal vectors into full covariance matrices
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        temp_m = np.zeros([M, M])
        for i in range(len(elem)):
            temp_m[i][i] = elem[i]
        new_covs[count] = temp_m
        count += 1
    covs = new_covs
# exercise 10.1.1
from pylab import *
from scipy.io import loadmat
from toolbox_02450 import clusterplot
from sklearn.cluster import k_means

# Load Matlab data file and extract variables of interest
mat_data = loadmat('../Data/synth1.mat')
X = np.matrix(mat_data['X'])
y = np.matrix(mat_data['y'])
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Number of clusters:
K = 4

# K-means clustering:
centroids, cls, inertia = k_means(X, K)

# Plot results:
figure(figsize=(14, 9))
clusterplot(X, cls, centroids, y)
show()
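# k_means() also returns the final inertia (sum of squared distances to the
# nearest centroid), the usual quantity for an elbow plot when the number of
# clusters is not known in advance. A hedged sketch on the same X (the
# candidate range 1..10 is an arbitrary example):
inertias = []
for k in range(1, 11):
    _, _, inertia_k = k_means(X, k)
    inertias.append(inertia_k)
figure()
plot(range(1, 11), inertias, '-o')
xlabel('K')
ylabel('Inertia')
show()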
# Perform hierarchical/agglomerative clustering on data matrix
# Method = 'single'
Method = 'complete'
# Method = 'centroid'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)

# Compute and display clusters by thresholding the dendrogram
Maxclust = C
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
figure(1, figsize=(15, 15))
xlabel('PCA1')
ylabel('PCA2')
clusterplot(X, cls.reshape(cls.shape[0], 1), y=y)

# Display dendrogram
max_display_levels = 6
figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
show()

# Calculate accuracy
accuracy_hierarchical = sum(
    [cls[i] == y_classNames[i] for i in range(len(cls))]) / N
print("Accuracy of the hierarchical clustering:", accuracy_hierarchical)

################################################
# GAUSSIAN MIXTURE MODEL
from toolbox_02450 import clusterplot
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Load Matlab data file and extract variables of interest
mat_data = loadmat('../Data/synth1.mat')
X = np.matrix(mat_data['X'])
y = np.matrix(mat_data['y'])
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Perform hierarchical/agglomerative clustering on data matrix
Method = 'single'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)

# Compute and display clusters by thresholding the dendrogram
Maxclust = 4
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
figure(1)
clusterplot(X, cls.reshape(cls.shape[0], 1), y=y)

# Display dendrogram
max_display_levels = 6
figure(2)
dendrogram(Z, truncate_mode='level', p=max_display_levels)
show()
# Perform hierarchical/agglomerative clustering on data matrix
Method = 'ward'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)

# Compute and display clusters by thresholding the dendrogram
Maxclust = 2
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
# figure(1)
# clusterplot(X, cls.reshape(cls.shape[0],1), y=y)
figure(figsize=(14, 9))
idx = [4, 1]  # feature index: choose two features to use as x and y axis in the plot
clusterplot(X[:, idx], clusterid=cls, y=y)
# ylabel("glucose")
# xlabel("insulin")
show()

# Display dendrogram
max_display_levels = 6
figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
show()

Rand, Jaccard, NMI = clusterval(y, cls)
print(Rand, Jaccard, NMI)

print('Ran Exercise 10.2.1')
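# clusterval() above comes from the course's toolbox_02450 package. If that
# package is unavailable, two of the three indices have close scikit-learn
# counterparts; a hedged sketch (the toolbox may define Rand and NMI slightly
# differently, and the pairwise Jaccard index has no direct sklearn equivalent):
import numpy as np
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

rand_like = adjusted_rand_score(np.asarray(y).ravel(), cls)
nmi_like = normalized_mutual_info_score(np.asarray(y).ravel(), cls)
print(rand_like, nmi_like)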
cov_type = 'full'  # e.g. 'full' or 'diag'
# Define the initialization procedure (initial value of means)
initialization_method = 'random'  # 'random' or 'kmeans'
reps = 20  # number of fits with different initializations, best result will be kept

# Fit Gaussian mixture model
gmm = GaussianMixture(n_components=K, covariance_type=cov_type,
                      n_init=reps, tol=1e-6, reg_covar=1e-6,
                      init_params=initialization_method).fit(X)
cls = gmm.predict(X)      # extract cluster labels
cds = gmm.means_          # extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

if cov_type.lower() == 'diag':
    # Expand the per-component diagonal vectors into full covariance matrices
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        new_covs[count] = np.diag(elem)
        count += 1
    covs = new_covs

# Plot results:
figure(figsize=(14, 9))
clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
show()

print('Ran Exercise 11.1.1')
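# K is assumed to be defined earlier in this script. When it is not known,
# GaussianMixture's built-in BIC is a common selection criterion; a minimal
# hedged sketch (the candidate range 1..10 is an arbitrary example):
import numpy as np
from sklearn.mixture import GaussianMixture

bics = [GaussianMixture(n_components=k, covariance_type=cov_type,
                        n_init=reps).fit(X).bic(X) for k in range(1, 11)]
best_K = int(np.argmin(bics)) + 1  # BIC is minimized; candidate k starts at 1
print('K selected by BIC:', best_K)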
classNames = ['Non diabetes', 'Diabetes']
N, M = X.shape
C = len(classNames)

# Perform hierarchical/agglomerative clustering on data matrix
Method = 'ward'  # alternatives: complete, average, weighted, centroid
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)

# Compute and display clusters by thresholding the dendrogram
Maxclust = 2
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
clsHie = pd.DataFrame(cls)
clsHie.to_csv("clsHie.csv")

figure(1)
clusterplot(X[:, [0, 1]], cls.reshape(cls.shape[0], 1), y=y)
savefig('hierarchicalScatterPlot.png', dpi=300)
show()

# Display dendrogram
max_display_levels = 4
figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
savefig('hierarchicalDendrogram.png', dpi=300)
show()

print('Ran Exercise 10.2.1')
N, M = X.shape
C = len(classNames)

# Number of clusters
K = 4
cov_type = 'diag'  # type of covariance; you can try out 'full' as well
reps = 1           # number of fits with different initializations, best result will be kept

# Fit Gaussian mixture model
# (sklearn's old GMM class with params='wmc' and covars_ was removed in
# scikit-learn 0.20; GaussianMixture is the replacement)
gmm = GaussianMixture(n_components=K, covariance_type=cov_type, n_init=reps).fit(X)
cls = gmm.predict(X)      # extract cluster labels
cds = gmm.means_          # extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

if cov_type == 'diag':
    # Expand the per-component diagonal vectors into full covariance matrices
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        temp_m = np.zeros([M, M])
        for i in range(len(elem)):
            temp_m[i][i] = elem[i]
        new_covs[count] = temp_m
        count += 1
    covs = new_covs

# Plot results:
figure(figsize=(14, 9))
clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
show()
"13. Private Room", "14. Entire Home", "15. Shared Room" ] for i in cols: print(x_labels[i - 1]) for i in range(len(cols)): cols[i] = cols[i] - 1 X = X[:, cols] # Do PCA for plot pca = PCA(n_components=2) PCASpace = pca.fit_transform(X) # Perform hierarchical/agglomerative clustering on data matrix Method = 'complete' Metric = 'euclidean' Z = linkage(X, method=Method, metric=Metric) # Compute and display clusters by thresholding the dendrogram Maxclust = 6 cls = fcluster(Z, criterion='maxclust', t=Maxclust) figure(1) clusterplot(PCASpace, cls.reshape(cls.shape[0], 1), y=Y) # Display dendrogram max_display_levels = 6 figure(2, figsize=(10, 4)) dendrogram(Z, truncate_mode='level', p=max_display_levels) show()
print "z Shape: " + str(z.shape) print "Y Shape: " + str(Y.shape) print "V Shape: " + str(V.shape) print "X Shape: " + str(X.shape) print "y Shape: " + str(y.shape) print "cls Shape: " + str(cls.shape) print "cds Shape: " + str(cds.shape) print "covs Shape: " + str(covs.shape) if cov_type == 'diag': new_covs = np.zeros([K, M, M]) count = 0 for elem in covs: temp_m = np.zeros([M, M]) for i in range(len(elem)): temp_m[i][i] = elem[i] new_covs[count] = temp_m count += 1 covs = new_covs np.savetxt("cls.txt", cls) result = [abs(cls[i] - y[i]) for i in range(len(cls))] np.savetxt("result.txt", result) print "Result mean: " + str(np.mean(result)) #Plot results: figure(figsize=(10, 6)) clusterplot(z, clusterid=cls, centroids=cds, y=y, covars=5) show()
if cov_type.lower() == 'diag':
    # Expand the per-component diagonal vectors into full covariance matrices
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        new_covs[count] = np.diag(elem)
        count += 1
    covs = new_covs

## In case the number of features != 2, a subset of features must be plotted instead.
figure(figsize=(14, 9))
# idx = [3, 4]  # feature index: choose two features to use as x and y axis in the plot
# clusterplot(X[:, idx], clusterid=cls, centroids=cds[:, idx], y=y2, covars=covs[:, idx, :][:, :, idx])
clusterplot(X, clusterid=cls, centroids=cds, y=y2, covars=covs)
savefig('figures/GMM/clustering_GMM.png', bbox_inches='tight')
show()

# CLUSTERING 2
##########################################################################
# Perform hierarchical/agglomerative clustering on data matrix
# Method = 'single'
# Method = 'complete'
# Method = 'average'
Method = 'weighted'
# Method = 'centroid'
# Method = 'median'
# Method = 'ward'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)
# extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

if cov_type.lower() == 'diag':
    # Expand the per-component diagonal vectors into full covariance matrices
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        new_covs[count] = np.diag(elem)
        count += 1
    covs = new_covs

# Plot results:
# figure(figsize=(14,9))
# clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
# show()
print(cds)

## In case the number of features != 2, a subset of features must be plotted instead.
figure(figsize=(14, 9))
idx = [0, 1]  # feature index: choose two features to use as x and y axis in the plot
clusterplot(X[:, idx], clusterid=cls, centroids=cds[:, idx], y=y,
            covars=covs[:, idx, :][:, :, idx])
show()

print(Rand, Jaccard, NMI)
def draw_GMM(input_data):
    X, y = split_train_test(input_data, 9)
    y = np.argmax(input_data[:, 7:], 1)
    U, S, V = svd(input_data[:, :], full_matrices=False)
    X = np.dot(input_data[:, :], V.T)
    # X = input_data
    N, M = X.shape

    # Number of clusters
    K = 5
    cov_type = 'full'  # type of covariance; you can try out 'diag' as well
    reps = 10          # number of fits with different initializations, best result will be kept

    # Fit Gaussian mixture model
    gmm = GaussianMixture(n_components=K, covariance_type=cov_type, n_init=reps).fit(X)
    cls = gmm.predict(X)      # extract cluster labels
    cds = gmm.means_          # extract cluster centroids (means of gaussians)
    covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

    if cov_type == 'diag':
        new_covs = np.zeros([K, M, M])
    if cov_type == 'full':
        # Copy the per-component covariance matrices element-wise
        new_covs = np.zeros([K, M, M])
        count = 0
        for elem in covs:
            temp_m = np.zeros([M, M])
            for i in range(len(elem)):
                for j in range(len(elem)):
                    temp_m[i][j] = elem[i][j]
            new_covs[count] = temp_m
            count += 1
        covs = new_covs
    print(cds)

    # Plot results:
    # figure(figsize=(14, 9))
    # clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
    # show()

    ## In case the number of features != 2, a subset of features must be plotted instead.
    figure(figsize=(14, 9))
    idx = [0, 1]  # feature index: choose two features to use as x and y axis in the plot
    clusterplot(X[:, idx], clusterid=cls, centroids=cds[:, idx], y=y,
                covars=covs[:, idx, :][:, :, idx])
    title('Clusterplot with GMM with origin')
    show()
eval = pickle.load(eval_f)
eval_f.close()
rand = eval[0]
jaccard = eval[1]
nmi = eval[2]

########################################################
########################################################

# PART 1 - GMM CLUSTERING
print('==============================================')
print('Best K: {0}'.format(bestK))
print('==============================================')

# Cluster plot figure
clusterplot(PC, clusterid=clsGMM, centroids=cdsGMM, covars=covsGMM, y=Y)
xticks(fontsize=14)
yticks(fontsize=14)
show()

# Plot CV error per K figure
plot(KRange, 2 * CVE, '-ok')
ylabel('Cross-validation Error', fontsize=14)
xticks(fontsize=14)
yticks(fontsize=14)
xlabel('Number of clusters (K)', fontsize=14)
show()

########################################################
for i, col_id in enumerate(range(2, 27)):
    X[:, i] = np.mat(doc.col_values(col_id, 2, 345)).T

# Compute values of N, M and C.
N = len(y)
M = len(attributeNames)
C = len(classNames)

# Perform hierarchical/agglomerative clustering on data matrix
Method = 'complete'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)

# Project data onto the first two principal components for plotting
Y = X - np.ones((N, 1)) * X.mean(0)
U, S, V = svd(Y, full_matrices=False)
V = V.T
rho = (S * S) / (S * S).sum()  # variance explained by each component

# Compute and display clusters by thresholding the dendrogram
Maxclust = 9
cls = fcluster(Z, criterion='maxclust', t=Maxclust)
figure(1, figsize=(14, 9))
clusterplot((Y * V)[:, :2], cls.reshape(cls.shape[0], 1), y=y)

# Display dendrogram
max_display_levels = 50
figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
show()
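# rho above is computed but never used in the snippet. A small hedged sketch
# of the usual follow-up -- plotting cumulative variance explained to check
# that two principal components are a reasonable plotting basis:
figure()
plot(range(1, len(rho) + 1), np.cumsum(rho), '-o')
xlabel('Number of principal components')
ylabel('Cumulative variance explained')
show()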
# exercise 9.1.1
from pylab import *
from scipy.io import loadmat
from toolbox_02450 import clusterplot
from sklearn.cluster import k_means

# Load Matlab data file and extract variables of interest
mat_data = loadmat('../Data/synth1.mat')
X = np.matrix(mat_data['X'])
y = np.matrix(mat_data['y'])
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Number of clusters:
K = 4

# K-means clustering:
centroids, cls, inertia = k_means(X, K)

# Plot results:
figure(figsize=(14, 9))
clusterplot(X, cls, centroids, y)
show()
    elif item - 1 == 0 and y[index] == 1:
        falseneg += 1
    elif item - 1 == 1 and y[index] == 0:
        falsepos += 1
    else:
        print("something weird", index, item)

print("Method:", Method)
print("truepos:", truepos)
print("trueneg:", trueneg)
print("falsepos:", falsepos)
print("falseneg:", falseneg)
print("Percent right:", (truepos + trueneg) / len(a) * 100)

# Plot clusters
plt.figure(1, figsize=(10, 8))
clusterplot(Z[:, 0:2], a, y=y)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Hierarchical clustering")

# Display dendrogram
max_display_levels = 4
plt.figure(2, figsize=(11, 4))
dendrogram(link, truncate_mode='level', p=max_display_levels)
plt.show()

#%% Testing all possibilities
highscores = []

def testAll():
    Maxclust = 2