def KM(domain, n_clusters):
    """Run K-means repeatedly on PCA-reduced data and print the best run.

    The array selected by ``domain`` is passed through ``utilise.normArray``
    and projected onto two PCA components.  K-means is then fitted 300
    times on that projection and the run with the lowest inertia is
    printed together with its labels.

    Args:
        domain: 'DietType' or 'ActType'; selects which array is loaded.
        n_clusters: number of clusters handed to ``KMeans``.

    Raises:
        ValueError: if ``domain`` is neither 'DietType' nor 'ActType'.
    """
    if domain == 'DietType':
        X = getDietTypeTFArray4DC()
    elif domain == 'ActType':
        X = getActTypeTFArray4DC()
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('unsupported domain: %r' % (domain,))
    X = utilise.normArray(X)
    reduced_data = PCA(n_components=2).fit_transform(X)

    best_inertia = None
    best_labels = None
    for _ in range(300):
        kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        kmeans.fit(reduced_data)
        # "<=" keeps the most recent of several equally good runs, which is
        # what scanning the stored results from first to last used to select.
        if best_inertia is None or kmeans.inertia_ <= best_inertia:
            best_inertia = kmeans.inertia_
            best_labels = kmeans.labels_

    # One pre-formatted string prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('%s %s %s %s' % (domain, n_clusters, best_inertia, best_labels))
def KM_nonslp(domain, n_clusters):
    """Run K-means 300 times on the "WithSlp" array and print the best run.

    ``domain == 'DietType'`` loads the diet-type array; any other value
    loads the activity-type array.  The data is passed through
    ``utilise.normArray`` before clustering (no PCA reduction here).
    """
    if domain == 'DietType':
        raw = dataGen4DietAct.genDietTypeTFArrayWithSlp()
    else:
        raw = dataGen4DietAct.genActTypeTFArrayWithSlp()
    data = utilise.normArray(raw)

    runs = []
    for _ in range(300):
        model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        model.fit(data)
        runs.append((model.inertia_, model.labels_))

    lowest = np.min([score for score, _ in runs])
    # Start from the final run, then let the last run matching the minimum win.
    inertia, labels = runs[-1]
    for score, assignment in runs:
        if score == lowest:
            inertia, labels = score, assignment

    # Single formatted string: same output as a space-separated print.
    print('%s %s %s %s' % (domain, n_clusters, inertia, labels))
def sihouetteScoreArtificialData(metric):
    """Print average silhouette scores for K-means on the artificial data.

    For every entry of the module-level ``Domain`` list the matching column
    subset of the artificial data frame is passed through
    ``utilise.normArray`` and clustered with 2 to 6 clusters; the mean
    silhouette score of each clustering is printed.

    Args:
        metric: only echoed in the printed output.
    """
    df, _cols = artificialDataGenerator.artificialData()
    diet_columns = [
        'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP', 'fruitP',
        'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables'
    ]
    act_columns = [
        'entertainmentRelax', 'others', 'social', 'sport', 'transportation1',
        'transportation2', 'transportation3', 'workStudy'
    ]
    range_n_clusters = [2, 3, 4, 5, 6]

    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            df_temp = df[diet_columns]
        else:
            df_temp = df[act_columns]
        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns
        # the same array and exists in old and new pandas alike.
        X = utilise.normArray(df_temp.values)

        for n_clusters in range_n_clusters:
            clusterer = KMeans(n_clusters=n_clusters, n_init=300)
            clusterer.fit(X)
            cluster_labels = clusterer.labels_
            # silhouette_score is the mean silhouette value over all samples.
            silhouette_avg = silhouette_score(X, cluster_labels)
            print(metric, domain, 'For n_clusters =', n_clusters, 'The average silhouette_score is :', silhouette_avg)
def sihouetteScore(metric):
    """Print average silhouette scores for K-means with 2 to 6 clusters.

    For every entry of the module-level ``Domain`` list the TF or TFIDF
    array is loaded, passed through ``utilise.normArray`` and projected
    onto two PCA components.  K-means is fitted on the projection and the
    mean silhouette score is printed for each cluster count.

    Args:
        metric: 'TF' or 'TFIDF'; selects which array is loaded.

    Raises:
        ValueError: if the metric/domain combination is not supported.
    """
    range_n_clusters = [2, 3, 4, 5, 6]
    for domain in Domain:
        if metric == 'TF' and domain == 'DietType':
            X = dataGen4DietAct.genDietTypeTFArray()
        elif metric == 'TF' and domain == 'ActType':
            X = dataGen4DietAct.genActTypeTFArray()
        elif metric == 'TFIDF' and domain == 'DietType':
            X = dataGen4DietAct.DietTypeTfidfArray()
        elif metric == 'TFIDF' and domain == 'ActType':
            X = dataGen4DietAct.ActTypeTfidfArray()
        else:
            # Without this branch X would be unbound or silently reused
            # from the previous domain.
            raise ValueError('unsupported metric/domain combination: %r / %r'
                             % (metric, domain))
        X = utilise.normArray(X)
        reduced_data = PCA(n_components=2).fit_transform(X)

        for n_clusters in range_n_clusters:
            clusterer = KMeans(n_clusters=n_clusters, n_init=300)
            clusterer.fit(reduced_data)
            cluster_labels = clusterer.labels_
            # NOTE(review): the labels come from clustering the PCA
            # projection, but the score is computed on the unreduced X.
            # Confirm this mix is intended.
            silhouette_avg = silhouette_score(X, cluster_labels)
            print(metric, domain, 'For n_clusters =', n_clusters, 'The average silhouette_score is :', silhouette_avg)
def visTFIDFMatrix():
    """Save matshow images of the TFIDF matrices under visTForTFIDFMatrix/.

    Six images are written: activity items, diet items, their combination,
    then diet types, activity types and their combination.  Each single
    matrix is passed through ``utilise.normArray`` before plotting.
    """
    def _save_heatmap(matrix, name):
        # The plot title and the output file name are the same string.
        plt.figure()
        plt.matshow(matrix)
        plt.colorbar()
        plt.title(name)
        plt.savefig('visTForTFIDFMatrix/' + name)

    # Item level.
    act_items = utilise.normArray(dataGen4DietAct.ActItemTfidfArray())
    _save_heatmap(act_items, 'actTFIDFMatrix')
    diet_items = utilise.normArray(dataGen4DietAct.DietItemTfidfArray())
    _save_heatmap(diet_items, 'dietTFIDFMatrix')
    _save_heatmap(utilise.genCombiArray(act_items, diet_items),
                  'actDietTFIDFMatrix')

    # Type level (diet first, then activity).
    diet_types = utilise.normArray(dataGen4DietAct.DietTypeTfidfArray())
    _save_heatmap(diet_types, 'dietTypeTFIDFMatrix')
    act_types = utilise.normArray(dataGen4DietAct.ActTypeTfidfArray())
    _save_heatmap(act_types, 'actTypeTFIDFMatrix')
    _save_heatmap(utilise.genCombiArray(act_types, diet_types),
                  'actDietTypeTFIDFMatrix')
def visTFMatrix():
    """Save matshow images of the TF matrices under visTForTFIDFMatrix/.

    Images are written, in this order, for: activity items, diet items,
    their combination, diet types, activity types and their combination.
    """
    def _plot(matrix, title):
        # Draw the matrix, save it under its title, and hand it back.
        plt.figure()
        plt.matshow(matrix)
        plt.colorbar()
        plt.title(title)
        plt.savefig('visTForTFIDFMatrix/' + title)
        return matrix

    item_act = _plot(
        utilise.normArray(dataGen4DietAct.genActItemTFArray()), 'actTFMatrix')
    item_diet = _plot(
        utilise.normArray(dataGen4DietAct.genDietItemTFArray()), 'dietTFMatrix')
    _plot(utilise.genCombiArray(item_act, item_diet), 'actDietTFMatrix')

    type_diet = _plot(
        utilise.normArray(dataGen4DietAct.genDietTypeTFArray()),
        'dietTypeTFMatrix')
    type_act = _plot(
        utilise.normArray(dataGen4DietAct.genActTypeTFArray()),
        'actTypeTFMatrix')
    _plot(utilise.genCombiArray(type_act, type_diet), 'actDietTypeTFMatrix')
def KM_AtificialData():
    """Cluster the artificial data with K-means and save PCA scatter plots.

    For every entry of the module-level ``Domain`` list the matching column
    subset is passed through ``utilise.normArray`` and clustered with 2 to
    6 clusters.  Each clustering is drawn on the two-component PCA
    projection, saved as an image, and its inertia and labels are printed.
    """
    df, _cols = artificialDataGenerator.artificialData()
    diet_columns = [
        'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP', 'fruitP',
        'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables'
    ]
    act_columns = [
        'entertainmentRelax', 'others', 'social', 'sport', 'transportation1',
        'transportation2', 'transportation3', 'workStudy'
    ]
    range_n_clusters = [2, 3, 4, 5, 6]
    # One marker style per cluster index.  Six entries so that every cluster
    # is drawn for all cluster counts in range_n_clusters.
    cluster_styles = ['go', 'ro', 'bo', 'yo', 'mo', 'co']

    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            df_temp = df[diet_columns]
        else:
            df_temp = df[act_columns]
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent available in old and new pandas.
        X = utilise.normArray(df_temp.values)
        # The projection does not depend on the cluster count.
        reduced_data = PCA(n_components=2).fit_transform(X)

        for n_clusters in range_n_clusters:
            kmeans = KMeans(n_clusters=n_clusters, n_init=3000)
            kmeans.fit(X)
            labels = kmeans.labels_
            inertia = kmeans.inertia_

            plt.figure()
            for k in range(np.max(labels) + 1):
                members = reduced_data[labels == k]
                style = cluster_styles[k % len(cluster_styles)]
                # One call per cluster instead of one call per point.
                plt.plot(members[:, 0], members[:, 1], style, markersize=5)
            plt.title('K-means clustering (PCA-reduced data)')
            plt.savefig('visClustering' + domain +
                        'Pattern/KMeans_TF_artificial_' + str(n_clusters))
            # Single formatted string: same text as a space-separated print.
            print('%s %s %s %s' % (domain, n_clusters, inertia, labels))
def sihouetteScore4DC(metric):
    """Print average silhouette scores for K-means on the 4DC arrays.

    For each entry of the module-level ``Domain`` list the array from
    ``validation4DC`` is passed through ``utilise.normArray`` and clustered
    with 2 to 6 clusters.  ``metric`` is only echoed in the output.
    """
    for domain in Domain:
        if domain == 'DietType':
            samples = validation4DC.getDietTypeTFArray4DC()
        elif domain == 'ActType':
            samples = validation4DC.getActTypeTFArray4DC()
        samples = utilise.normArray(samples)

        for n_clusters in range(2, 7):
            model = KMeans(n_clusters=n_clusters, n_init=300)
            model.fit(samples)
            # Mean silhouette value over all samples for this clustering.
            silhouette_avg = silhouette_score(samples, model.labels_)
            print(metric, domain, 'For n_clusters =', n_clusters, 'The average silhouette_score is :', silhouette_avg)
def singleSubjectDailyArray(domain, subjectID):
    """Build the daily item TFIDF array of one subject.

    A (days x items) array is filled from the subject's daily item index,
    transformed with ``TfidfTransformer(norm=None)`` and then passed
    through ``utilise.normArray``.  The shape of the result is printed.

    Args:
        domain: 'ActItem' or 'DietItem'.
        subjectID: subject identifier forwarded to the retrieval helpers.

    Returns:
        The array returned by ``utilise.normArray``.

    Raises:
        ValueError: if ``domain`` is neither 'ActItem' nor 'DietItem'.
    """
    if domain == 'ActItem':
        item_dict = dataGen4DietAct.genActItemDict()
        build_daily_index = buildItemIndex.build_daily_single_activity_index
    elif domain == 'DietItem':
        item_dict = dataGen4DietAct.genDietItemDict()
        build_daily_index = buildItemIndex.build_daily_single_diet_index
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('unsupported domain: %r' % (domain,))

    duration = dietActInfoRetrv.getDuration(subjectID)
    array = np.zeros((duration, len(item_dict)))

    for day in range(duration):
        # Days are numbered from 1 in the index builders.
        item_index = build_daily_index(subjectID, day + 1)
        for key in item_dict:
            # The daily index is looked up by the single-quoted item name;
            # the item_dict key doubles as the column position.
            quoted_name = "'" + item_dict[key] + "'"
            if quoted_name in item_index:
                array[day, key] = item_index[quoted_name]

    transformer = TfidfTransformer(norm=None)
    tfidf = transformer.fit_transform(array)
    tfidfNorm = utilise.normArray(tfidf.toarray())
    print(tfidfNorm.shape)
    return tfidfNorm
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 20:12:45 2016

@author: jingjing

Run DBSCAN on the diet-type and activity-type TF arrays and print the
core samples and labels found for each domain.
"""
from sklearn.cluster import DBSCAN

import utilise

Domain = ['DietType', 'ActType']

for domain in Domain:
    if domain == 'DietType':
        X = utilise.genDietTypeTFArray()
    elif domain == 'ActType':
        X = utilise.genActTypeTFArray()
    X = utilise.normArray(X)
    # Keyword arguments: current scikit-learn accepts min_samples only by
    # keyword, and the keyword form is also valid on older releases.
    db = DBSCAN(eps=0.8, min_samples=1).fit(X)
    labels = db.labels_
    print(db.components_)
    print(labels)
def buildSubAveInfo():
    """Write per-subject summary data to 'SubAveInfo.xls'.

    Three sheets are produced:
      * 'AveInfo' - one row per entry of ``sleep_list`` with group keys,
        sleep / heart-rate values and demographic values.
      * 'DietTF'  - one row per entry of ``available_list`` with the diet
        group key and the diet-type TF values.
      * 'ActTF'   - same layout for the activity-type TF values.
    """
    def _write_group(sheet, row, col, groups, subject):
        # Write the key of the first group that contains the subject, if any.
        for key in groups:
            if subject in groups[key]:
                sheet.write(row, col, key)
                return

    def _write_tf_sheet(sheet, item_names, tf_values, group_title, groups):
        # Header row: subject id, group column, then one column per item.
        sheet.write(0, 0, 'SubjId')
        sheet.write(0, 1, group_title)
        for offset, item_name in enumerate(item_names):
            sheet.write(0, offset + 2, item_name)
        # Data rows start at row 1.
        for position, subject in enumerate(available_list):
            row = position + 1
            sheet.write(row, 0, subject)
            _write_group(sheet, row, 1, groups, subject)
            for offset in range(len(item_names)):
                sheet.write(row, offset + 2, tf_values[position][offset])

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('AveInfo')
    groupAct = dietActInfoRetrv.getGroups(labelsActType)
    groupDiet = dietActInfoRetrv.getGroups(labelsDietType)
    (Age, Gender, Height, Weight, BMI, FatFree, FatMass, PercFat,
     Vo2max) = slpInfoRetrv.getDemoGInfo()
    SlpHours = slpInfoRetrv.getSlpHours()
    MedianHR = slpInfoRetrv.getMedianHR()
    MedianHRBefore = slpInfoRetrv.getMedianHRBefore()
    MedianHRAfter = slpInfoRetrv.getMedianHRAfter()

    titles = [
        'SubjId', 'ActGroup', 'DietGroup', 'HoursSleep', 'MedianHR',
        'MedianHRBefore', 'MedianHRAfter', 'age', 'gender', 'height',
        'weight', 'BMI', 'FatFreeMass', 'FatMass', 'PercFat', 'vo2max'
    ]
    for col, title in enumerate(titles):
        ws.write(0, col, title)

    # Value columns in sheet order; the first one lands in column 3.
    value_columns = [
        SlpHours, MedianHR, MedianHRBefore, MedianHRAfter, Age, Gender,
        Height, Weight, BMI, FatFree, FatMass, PercFat, Vo2max
    ]
    for index, subject in enumerate(sleep_list):
        row = index + 1
        ws.write(row, 0, subject)
        _write_group(ws, row, 1, groupAct, subject)
        _write_group(ws, row, 2, groupDiet, subject)
        for offset, values in enumerate(value_columns):
            ws.write(row, offset + 3, values[index])

    _write_tf_sheet(
        workbookW.add_sheet('DietTF'),
        utilise.itemDict2list(dataGen4DietAct.genDietTypeDict()),
        utilise.normArray(dataGen4DietAct.genDietTypeTFArray()),
        'DietGroup', groupDiet)
    _write_tf_sheet(
        workbookW.add_sheet('ActTF'),
        utilise.itemDict2list(dataGen4DietAct.genActTypeDict()),
        utilise.normArray(dataGen4DietAct.genActTypeTFArray()),
        'ActGroup', groupAct)

    workbookW.save('SubAveInfo.xls')
def KM(domain, n_clusters):
    """Run K-means repeatedly, plot the best run on a PCA projection.

    The 4DC array selected by ``domain`` is passed through
    ``utilise.normArray`` and clustered 300 times; the run with the lowest
    inertia is kept.  The samples are then drawn on the two-component PCA
    projection, coloured per cluster and annotated with their row index.
    The figure is saved and the inertia and labels are printed.

    Args:
        domain: 'DietType' or 'ActType'; selects which array is loaded and
            is part of the output path.
        n_clusters: number of clusters handed to ``KMeans``.

    Raises:
        ValueError: if ``domain`` is neither 'DietType' nor 'ActType'.
    """
    if domain == 'DietType':
        X = validation4DC.getDietTypeTFArray4DC()
    elif domain == 'ActType':
        X = validation4DC.getActTypeTFArray4DC()
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('unsupported domain: %r' % (domain,))
    X = utilise.normArray(X)

    inertia = None
    labels = None
    for _ in range(300):
        kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        kmeans.fit(X)
        # "<=" keeps the most recent of several equally good runs.
        if inertia is None or kmeans.inertia_ <= inertia:
            inertia = kmeans.inertia_
            labels = kmeans.labels_

    plt.figure()
    reduced_data = PCA(n_components=2).fit_transform(X)
    # Styles are cycled so that clusters beyond the first four are drawn too.
    cluster_styles = ['go', 'ro', 'bo', 'yo', 'mo', 'co']
    for k in range(np.max(labels) + 1):
        members = reduced_data[labels == k]
        style = cluster_styles[k % len(cluster_styles)]
        plt.plot(members[:, 0], members[:, 1], style, markersize=5)
    # Annotate every sample with its row index.
    for i in range(reduced_data.shape[0]):
        plt.text(reduced_data[i, 0], reduced_data[i, 1], i)
    plt.title('K-means clustering (PCA-reduced data)')
    plt.savefig('visClustering' + domain + 'Pattern/KMeans_TF_' +
                str(n_clusters))

    # Single formatted string: same text as a space-separated print.
    print('%s %s %s %s' % (domain, n_clusters, inertia, labels))
def clusteringKmeansLabelsNewSubs():
    """Cluster the new-subject data per domain and report group profiles.

    For every entry of the module-level ``Domain`` list the matching
    columns are passed through ``utilise.normArray`` and clustered with
    K-means (2 clusters for 'DietType', 3 otherwise).  For each cluster the
    mean vector is written to the sheet and plotted.  Features whose mean
    equals one of the three largest mean values are written as extra rows,
    printed and annotated on the plot.  One figure per domain is saved and
    the workbook is stored as 'tempLabels.xls'.
    """
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0
    df = newDataProcess.newSubInfo()

    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            df_temp = df[['alcoholD','caffeineD','dairyP','eggP','fruitP','grainP','meatP','seafood','snack','starchyP','vegetables']]
            domain_clusters = 2
        else:
            df_temp = df[['leisure','social','sport','walk','car','bike','workStudy']]
            domain_clusters = 3
        row_labels = df_temp.columns
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent available in old and new pandas.
        X = utilise.normArray(df_temp.values)
        kmeans = KMeans(n_clusters=domain_clusters, n_init=3000)
        kmeans.fit(X)
        labels = kmeans.labels_

        # Header row with the feature names.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        plt.figure()
        n_clusters = np.max(labels) + 1
        x = range(X.shape[1])
        for k in range(n_clusters):
            group = X[labels == k]
            meanVec = np.mean(group, axis=0)
            stdVec = np.std(group, axis=0)
            print(stdVec)

            # One row holding the mean vector of this cluster.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # Three largest distinct mean values.  Every entry equal to the
            # current maximum is zeroed before the next maximum is taken.
            remaining = np.copy(meanVec)
            firstMax = np.max(remaining)
            remaining[remaining == firstMax] = 0
            secondMax = np.max(remaining)
            remaining[remaining == secondMax] = 0
            thirdMax = np.max(remaining)

            plt.plot(x, meanVec)
            for j in x:
                if (meanVec[j] == firstMax or meanVec[j] == secondMax
                        or meanVec[j] == thirdMax):
                    ws.write(rowW, 0, k)
                    ws.write(rowW, 1, domain)
                    ws.write(rowW, 2, row_labels[j])
                    ws.write(rowW, 3, meanVec[j])
                    rowW += 1
                    # Single formatted string: same text as a
                    # space-separated print.
                    print('%s %s %s %s %s' % (k, domain, n_clusters,
                                              meanVec[j], row_labels[j]))
                    plt.text(x[j], meanVec[j], row_labels[j])

        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_NewDataSubs_'+str(n_clusters)+'_groupFreq')

    workbookW.save('tempLabels.xls')
def bestLabel(labelsDietType,labelsActType):
    """Report the per-cluster mean profiles for the given label strings.

    For every entry of the module-level ``Domain`` list the label string
    is converted with ``utilise.string2array``, the TF array is passed
    through ``utilise.normArray``, and the mean vector of each cluster is
    written to the sheet and plotted.  Features whose mean equals one of
    the three largest mean values are printed and annotated on the plot.
    One figure per domain is saved; the workbook goes to 'tempLabels.xls'.
    """
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0

    for domain in Domain:
        if domain == 'DietType':
            labels = utilise.string2array(labelsDietType)
            row_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())
            X = dataGen4DietAct.genDietTypeTFArray()
        elif domain == 'ActType':
            labels = utilise.string2array(labelsActType)
            row_labels = utilise.itemDict2list(dataGen4DietAct.genActTypeDict())
            X = dataGen4DietAct.genActTypeTFArray()
        X = utilise.normArray(X)

        # Header row with the feature names.
        for col, name in enumerate(row_labels):
            ws.write(rowW, col, name)
        rowW += 1

        plt.figure()
        n_clusters = np.max(labels) + 1
        for k in range(n_clusters):
            group = np.array([sample for sample in X[labels == k]])
            meanVec = np.mean(group, axis=0)
            # Computed as before; the value is not used further.
            stdVec = np.std(group, axis=0)

            # One row holding the mean vector of this cluster.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # X is already normalised, so the mean vector is used as is.
            # Three largest distinct values: zero every entry equal to the
            # current maximum, then take the maximum again.
            firstMax = np.max(meanVec)
            tempVec = np.copy(meanVec)
            tempVec[tempVec == firstMax] = 0
            secondMax = np.max(tempVec)
            tempVec2 = np.copy(tempVec)
            tempVec2[tempVec2 == secondMax] = 0
            thirdMax = np.max(tempVec2)

            x = range(X.shape[1])
            plt.plot(x, meanVec)
            for j in range(X.shape[1]):
                value = meanVec[j]
                if value == firstMax or value == secondMax or value == thirdMax:
                    # Single formatted string: same text as a
                    # space-separated print.
                    print('%s %s %s %s %s' % (k, domain, n_clusters, value,
                                              row_labels[j]))
                    plt.text(x[j], value, row_labels[j])

        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_'+str(n_clusters)+'_groupFreq')

    workbookW.save('tempLabels.xls')
def HC(domain, para):
    """Hierarchically cluster rows and columns of X and export a heat map.

    ``para`` selects the input: a member of ``Metric`` ('TF' / 'TFIDF')
    loads the matching array for ``domain`` and passes it through
    ``utilise.normArray``; a member of ``Sim`` builds X from the similarity
    dictionary of ``domain``.  Rows and columns are then ordered by the
    leaves of a 'ward' linkage and the reordered array is exported as a
    dendrogram heat map PNG.

    Args:
        domain: 'DietItem', 'ActItem', 'DietType' or 'ActType'.
        para: a member of the module-level ``Metric`` or ``Sim`` collections.

    NOTE(review): if ``para`` is in neither ``Metric`` nor ``Sim``, or the
    domain matches no branch, X is never assigned and the function fails at
    its first use of X.
    """
    if para in Metric:
        if para == 'TF':
            if domain == 'DietItem':
                X = dataGen4DietAct.genDietItemTFArray()
            elif domain == 'ActItem':
                X = dataGen4DietAct.genActItemTFArray()
            elif domain == 'DietType':
                X = dataGen4DietAct.genDietTypeTFArray()
            elif domain == 'ActType':
                X = dataGen4DietAct.genActTypeTFArray()
        elif para == 'TFIDF':
            if domain == 'DietItem':
                X = dataGen4DietAct.DietItemTfidfArray()
            elif domain == 'ActItem':
                X = dataGen4DietAct.ActItemTfidfArray()
            elif domain == 'DietType':
                X = dataGen4DietAct.DietTypeTfidfArray()
            elif domain == 'ActType':
                X = dataGen4DietAct.ActTypeTfidfArray()
        X = utilise.normArray(X)
    if para in Sim:
        Similarity_dict = {}
        # All four domains call the same helper; unknown domains keep the
        # empty dictionary.
        if domain == 'DietItem':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        elif domain == 'ActItem':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        elif domain == 'DietType':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        elif domain == 'ActType':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    # method can be ward, complete, average
    method = 'ward'
    row_method = method
    row_metric = 'euclidean'
    column_method = method
    column_metric = 'euclidean'

    # Row ordering: pairwise distances between the rows of X.
    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.pdist.html
    # d1 = ssd.pdist(X,'cosine')
    d1 = ssd.pdist(X)
    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.squareform.html#scipy.spatial.distance.squareform
    D1 = ssd.squareform(d1)  # full matrix
    # NOTE(review): linkage is given the square distance matrix D1 rather
    # than the condensed vector d1.  scipy presumably treats a 2-D input as
    # observation vectors, so this may cluster the rows of the distance
    # matrix - confirm against the linkage documentation that this is
    # intended.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    Y1 = sch.linkage(D1, method=row_method, metric=row_metric)
    row_idxing = sch.leaves_list(Y1)

    # Column ordering: same procedure on the transposed array.
    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.pdist.html
    d2 = ssd.pdist(X.T)
    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.squareform.html#scipy.spatial.distance.squareform
    D2 = ssd.squareform(d2)
    # NOTE(review): same square-matrix input as for the rows above.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    Y2 = sch.linkage(D2, method=column_method, metric=column_metric)
    col_idxing = sch.leaves_list(Y2)

    # Reorder columns first, then rows, following the dendrogram leaves.
    heatmap_array = X[:, col_idxing][
        row_idxing, :]  # a numpy.ndarray or numpy.matrix, say of shape m x n
    top_dendrogram = Y2  # a (n-1) x 4 array
    side_dendrogram = Y1  # a (m-1) x 4 array

    # Rows are labelled by their index; columns by index (Sim) or by the
    # item names of the domain (Metric).
    row_labels = range(X.shape[0])
    if para in Sim:
        col_labels = range(X.shape[1])
    if para in Metric:
        if domain == 'DietItem':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genDietItemDict())
        elif domain == 'ActItem':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genActItemDict())
        elif domain == 'DietType':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genDietTypeDict())
        elif domain == 'ActType':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genActTypeDict())

    col_idxing = list(col_idxing)
    row_idxing = list(row_idxing)
    print col_idxing

    # Permute the labels with the same leaf order as the data.
    new_row_labels = []
    new_col_labels = []
    for i in range(len(row_idxing)):
        new_row_labels.append(str(row_labels[row_idxing[i]]))
    for j in range(len(col_idxing)):
        new_col_labels.append(str(col_labels[col_idxing[j]]))

    heatmap = pdh.DendroHeatMap(heat_map_data=heatmap_array,
                                left_dendrogram=side_dendrogram,
                                top_dendrogram=top_dendrogram)
    heatmap.title = 'HC_' + domain + '_' + para + '_' + method
    heatmap.row_labels = new_row_labels
    heatmap.col_labels = new_col_labels
    # heatmap.show()
    heatmap.export('VisClustering' + domain + 'Pattern/Hierarchy_' + para +
                   '_' + method + '.png')