def AP(domain, sim):
    """Cluster a domain's similarity matrix with affinity propagation and plot it.

    The similarity dictionary for ``domain``/``sim`` is loaded through
    ``utilise.SimilarityDict`` and converted to an array with
    ``visSimilarityMat.similarityDict2array``.  ``AffinityPropagation`` is
    fitted on that array as a precomputed affinity, the resulting labels are
    printed, and the array is reduced to two PCA components for plotting.
    Each cluster is drawn as a circle at its exemplar with a line, a dot and
    the cluster index for every member.

    Args:
        domain: One of 'DietItem', 'ActItem', 'DietType' or 'ActType'.  Any
            other value leaves the similarity dictionary empty, exactly as
            the original branch chain did.
        sim: Name of the similarity measure; forwarded to
            ``utilise.SimilarityDict`` and used in the title and file name.

    Returns:
        None.  The figure is saved to
        'VisClustering<domain>Pattern/AffinityPropagation_<sim>_<n_clusters>'.
    """
    # Every known domain is loaded the same way, so a single membership test
    # replaces the four identical branches.
    known_domains = ('DietItem', 'ActItem', 'DietType', 'ActType')
    if domain in known_domains:
        Similarity_dict = utilise.SimilarityDict(domain, sim)
    else:
        Similarity_dict = {}
    X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    af = AffinityPropagation(affinity="precomputed").fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('%s %s %s' % (domain, sim, labels))
    n_clusters_ = len(cluster_centers_indices)

    # Project to two dimensions purely for drawing; clustering is done above.
    X = PCA(n_components=2).fit_transform(X)

    plt.figure()
    # cycle() repeats forever, so the seven base colours are enough.
    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(cluster_center[0],
                 cluster_center[1],
                 'o',
                 markerfacecolor=col,
                 markeredgecolor='k',
                 markersize=8)
        for x in X[class_members]:
            plt.plot(x[0], x[1], col + '.')
            plt.text(x[0], x[1], k)
            # Spoke from the exemplar to the member.
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

    plt.title('AP_' + domain + '_' + sim + '_' + str(n_clusters_))
    plt.savefig('VisClustering' + domain + 'Pattern/AffinityPropagation_' +
                sim + '_' + str(n_clusters_))
def plotSimilarityMatrix(sim='TFIDFCosin'):
    """Save a matshow image of the similarity matrix of each domain.

    For 'ActItem', 'DietItem', 'ActType' and 'DietType' (in that order) the
    similarity dictionary is loaded with ``utilise.SimilarityDict``, turned
    into an array with ``similarityDict2array`` and drawn with a colour bar.
    Images are written to 'visSimilarityMatrix/<prefix>SimilarityMatrix_<sim>'.

    Args:
        sim: Name of the similarity measure; forwarded to
            ``utilise.SimilarityDict`` and appended to titles and file names.
    """
    # (domain passed to SimilarityDict, prefix used in title and file name)
    panels = (
        ('ActItem', 'act'),
        ('DietItem', 'diet'),
        ('ActType', 'actType'),
        ('DietType', 'dietType'),
    )
    for domain_name, prefix in panels:
        sim_dict = utilise.SimilarityDict(domain_name, sim)
        matrix = similarityDict2array(sim_dict, 0)
        figure_name = prefix + 'SimilarityMatrix_' + sim

        plt.figure()
        plt.matshow(matrix)
        if domain_name == 'ActItem':
            # NOTE(review): only the activity-item plot receives this extra
            # pcolor layer; kept as found -- confirm whether intentional.
            pl.pcolor(matrix)
        plt.colorbar()
        plt.title(figure_name)
        plt.savefig('visSimilarityMatrix/' + figure_name)


# plotSimilarityMatrix('TFIDFCosin')
def plotSimilarityDistribution(sim='TFIDFCosin'):
    """Save a histogram of the similarity values of each domain.

    For 'ActItem', 'DietItem', 'ActType' and 'DietType' (in that order) the
    similarity dictionary is loaded with ``utilise.SimilarityDict``,
    flattened with ``similarityDict2list`` and drawn as a histogram whose
    x axis is limited to [0.0, 1.0].  Images are written to
    'visSimilarityDistributionHist/<prefix>SimilarityDistribution_<sim>'.

    Args:
        sim: Name of the similarity measure; forwarded to
            ``utilise.SimilarityDict`` and appended to titles and file names.
    """
    # (domain passed to SimilarityDict, prefix used in title and file name)
    panels = (
        ('ActItem', 'act'),
        ('DietItem', 'diet'),
        ('ActType', 'actType'),
        ('DietType', 'dietType'),
    )
    for domain_name, prefix in panels:
        sim_dict = utilise.SimilarityDict(domain_name, sim)
        values = similarityDict2list(sim_dict)
        figure_name = prefix + 'SimilarityDistribution_' + sim

        plt.figure()
        plt.hist(values)
        plt.title(figure_name)
        plt.xlim(0.0, 1.0)
        plt.savefig('visSimilarityDistributionHist/' + figure_name)
# Example #4
# 0
def excelTable(domain, sim='TFIDFCosin'):
    """Write the pairwise similarity table of ``domain`` to an .xls workbook.

    The sheet has the subject ids from the hard-coded list as both header row
    and header column; cell (r, c) holds the value found at
    ``[r - 1][c - 1]`` of the object returned by
    ``utilise.SimilarityDict(domain, sim)``.

    Args:
        domain: Domain name forwarded to ``utilise.SimilarityDict``.
        sim: Name of the similarity measure; also part of the file name.

    Returns:
        None.  The workbook is saved to
        'SimilarityTableExcel/<prefix>SimilarityTable_<sim>.xls'.
    """
    available_list = ['039', '044', '045', '048', '049', '050', '051', '052',
                      '053', '054', '056', '057', '058', '059', '060', '061',
                      '063', '064', '065', '066', '067', '068', '069', '070',
                      '071', '072', '073', '074', '075']
    workbook = xlwt.Workbook()
    ws = workbook.add_sheet('sheet1')

    # Header row: subject ids start in column 1; column 0 holds row headers.
    for col_index, subject_id in enumerate(available_list, 1):
        ws.write(0, col_index, subject_id)

    similarity = utilise.SimilarityDict(domain, sim)
    for row_index, subject_id in enumerate(available_list, 1):
        ws.write(row_index, 0, subject_id)
        for col_index in range(1, len(available_list) + 1):
            ws.write(row_index, col_index,
                     similarity[row_index - 1][col_index - 1])

    # The file name used to be fixed to 'actType...' regardless of ``domain``,
    # so tables for different domains overwrote each other.  The prefixes
    # follow the ones used by the plotting helpers; 'ActType' still maps to
    # the original file name.
    prefixes = {
        'ActItem': 'act',
        'DietItem': 'diet',
        'ActType': 'actType',
        'DietType': 'dietType',
    }
    prefix = prefixes.get(domain, domain)
    workbook.save('SimilarityTableExcel/' + prefix + 'SimilarityTable_' +
                  sim + '.xls')
"""
Created on Fri Jan 08 17:34:11 2016

@author: wu34
"""

from sklearn.cluster import SpectralClustering
import visSimilarityMat
import utilise

# Domains whose similarity matrices are clustered by this script.
Domain = ['DietType', 'ActType']

# dist selects the similarity measure passed to utilise.SimilarityDict.
# Names listed by the original author (who describes TFIDFCosin as the
# default): jaccard, novelJaccard, TFIDFCosin, TFIDFEclud, TFCosin, TFEclud
dist = 'TFEclud'
for domain in Domain:
    # Every domain is loaded through the same call, so no per-domain branching
    # is needed.  (The former 'dietSimilarity_dict = {}' initialiser was never
    # read; the branches assigned 'Similarity_dict' instead.)
    Similarity_dict = utilise.SimilarityDict(domain, dist)
    X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    # The array is handed over as a precomputed affinity matrix.
    af = SpectralClustering(affinity="precomputed").fit(X)
    labels = af.labels_
    # Parenthesised single argument: same output on Python 2 and Python 3.
    print(labels)
def _hc_input_matrix(domain, para):
    """Return the matrix that HC clusters for ``domain`` under ``para``."""
    if para not in Metric and para not in Sim:
        raise ValueError('unsupported para: %r' % (para,))

    X = None
    if para in Metric:
        # Names of the dataGen4DietAct builders, looked up lazily so only the
        # requested one has to exist.
        if para == 'TF':
            builders = {
                'DietItem': 'genDietItemTFArray',
                'ActItem': 'genActItemTFArray',
                'DietType': 'genDietTypeTFArray',
                'ActType': 'genActTypeTFArray',
            }
        elif para == 'TFIDF':
            builders = {
                'DietItem': 'DietItemTfidfArray',
                'ActItem': 'ActItemTfidfArray',
                'DietType': 'DietTypeTfidfArray',
                'ActType': 'ActTypeTfidfArray',
            }
        else:
            raise ValueError('unsupported metric para: %r' % (para,))
        if domain not in builders:
            raise ValueError('unsupported domain: %r' % (domain,))
        X = utilise.normArray(getattr(dataGen4DietAct, builders[domain])())

    if para in Sim:
        # Checked after Metric so that, as before, a similarity matrix wins
        # when ``para`` appears in both collections.
        if domain in ('DietItem', 'ActItem', 'DietType', 'ActType'):
            Similarity_dict = utilise.SimilarityDict(domain, para)
        else:
            Similarity_dict = {}
        X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    return X


def _hc_column_labels(domain, para, n_cols):
    """Return the column labels HC uses for ``domain`` under ``para``."""
    col_labels = None
    if para in Sim:
        col_labels = range(n_cols)
    if para in Metric:
        dict_builders = {
            'DietItem': 'genDietItemDict',
            'ActItem': 'genActItemDict',
            'DietType': 'genDietTypeDict',
            'ActType': 'genActTypeDict',
        }
        col_labels = utilise.itemDict2list(
            getattr(dataGen4DietAct, dict_builders[domain])())
    return col_labels


def HC(domain, para):
    """Hierarchically cluster rows and columns of a matrix and export a heat map.

    When ``para`` is in ``Metric`` ('TF' or 'TFIDF') the matrix comes from the
    matching ``dataGen4DietAct`` builder and is passed through
    ``utilise.normArray``.  When ``para`` is in ``Sim`` the matrix is the
    similarity array built from ``utilise.SimilarityDict``.  Rows and columns
    are each clustered with Ward linkage on Euclidean distances, the matrix
    is reordered by the dendrogram leaf order and handed to
    ``pdh.DendroHeatMap``.

    Args:
        domain: 'DietItem', 'ActItem', 'DietType' or 'ActType'.
        para: A member of the module-level ``Metric`` or ``Sim`` collections.

    Returns:
        None.  The heat map is exported to
        'VisClustering<domain>Pattern/Hierarchy_<para>_<method>.png'.

    Raises:
        ValueError: If ``para`` is in neither ``Metric`` nor ``Sim``, or a
            ``Metric`` value is combined with an unknown ``para``/``domain``.
            These cases previously failed on an unbound ``X``.
    """
    X = _hc_input_matrix(domain, para)

    # method can be ward, complete, average
    method = 'ward'

    # linkage() must receive the condensed distance vector returned by pdist.
    # A square matrix (the former squareform output) is interpreted as a set
    # of observation vectors, which clusters the rows of the distance matrix
    # rather than the data.  pdist defaults to Euclidean distance, which is
    # what Ward linkage requires.
    Y1 = sch.linkage(ssd.pdist(X), method=method)
    row_idxing = sch.leaves_list(Y1)

    Y2 = sch.linkage(ssd.pdist(X.T), method=method)
    col_idxing = sch.leaves_list(Y2)

    # Reorder columns, then rows, to follow the dendrogram leaf order.
    heatmap_array = X[:, col_idxing][row_idxing, :]
    top_dendrogram = Y2  # linkage of the columns
    side_dendrogram = Y1  # linkage of the rows

    row_labels = range(X.shape[0])
    col_labels = _hc_column_labels(domain, para, X.shape[1])

    col_idxing = list(col_idxing)
    row_idxing = list(row_idxing)
    # Parenthesised single argument: same output on Python 2 and Python 3.
    print(col_idxing)

    new_row_labels = [str(row_labels[i]) for i in row_idxing]
    new_col_labels = [str(col_labels[j]) for j in col_idxing]

    heatmap = pdh.DendroHeatMap(heat_map_data=heatmap_array,
                                left_dendrogram=side_dendrogram,
                                top_dendrogram=top_dendrogram)
    heatmap.title = 'HC_' + domain + '_' + para + '_' + method
    heatmap.row_labels = new_row_labels
    heatmap.col_labels = new_col_labels

    heatmap.export('VisClustering' + domain + 'Pattern/Hierarchy_' + para +
                   '_' + method + '.png')