def sihouetteScore(metric):
    '''
    Print the average silhouette score of K-Means clusterings per domain.

    For every entry of the module-level ``Domain`` collection that has a
    feature generator for ``metric``, the feature array is normalised with
    ``utilise.normArray``, projected onto two PCA components and clustered
    with K-Means for 2 to 6 clusters. One line is printed per
    (domain, n_clusters) pair.

    metric -- 'TF' or 'TFIDF'.

    Raises ValueError when ``metric`` is neither 'TF' nor 'TFIDF'.
    '''
    # Names of the dataGen4DietAct generators, keyed by metric then domain.
    # They are resolved lazily so only the generator that is needed is touched.
    generators = {
        'TF': {
            'DietType': 'genDietTypeTFArray',
            'ActType': 'genActTypeTFArray',
        },
        'TFIDF': {
            'DietType': 'DietTypeTfidfArray',
            'ActType': 'ActTypeTfidfArray',
        },
    }
    if metric not in generators:
        raise ValueError("unsupported metric: %r (expected 'TF' or 'TFIDF')"
                         % (metric,))

    range_n_clusters = (2, 3, 4, 5, 6)

    for domain in Domain:
        generator_name = generators[metric].get(domain)
        if generator_name is None:
            # No feature array exists for this domain. Skip it instead of
            # silently reusing the array left over from the previous domain.
            continue

        X = utilise.normArray(getattr(dataGen4DietAct, generator_name)())
        reduced_data = PCA(n_components=2).fit_transform(X)

        for n_clusters in range_n_clusters:
            clusterer = KMeans(n_clusters=n_clusters, n_init=300)
            clusterer.fit(reduced_data)
            cluster_labels = clusterer.labels_

            # The labels come from the 2-D PCA projection, but the score is
            # evaluated against the full normalised feature array X.
            # silhouette_score returns the average over all samples.
            silhouette_avg = silhouette_score(X, cluster_labels)
            print(metric, domain, 'For n_clusters =', n_clusters,
                  'The average silhouette_score is :', silhouette_avg)
Esempio n. 2
0
def getMeanVec(domain):
    '''
    Get the intergroup mean TF vectors, one row per cluster label.

    domain -- 'DietType' or 'ActType'; selects the module-level label array
              and the matching TF array from dataGen4DietAct.

    Returns an array of shape (max(labels) + 1, X.shape[1]). Row k is the
    mean TF vector of the samples labelled k, scaled so that its largest
    entry is 1. A label with no samples, or whose mean vector is all zeros,
    yields a row of zeros instead of NaN.

    Raises ValueError for any other domain.
    '''
    if domain == 'DietType':
        labels = labelsDietType
        X = dataGen4DietAct.genDietTypeTFArray()
    elif domain == 'ActType':
        labels = labelsActType
        X = dataGen4DietAct.genActTypeTFArray()
    else:
        raise ValueError("unsupported domain: %r (expected 'DietType' or "
                         "'ActType')" % (domain,))

    N = np.max(labels) + 1
    MeanVec = np.zeros((N, X.shape[1]))

    for k in range(N):
        members = X[labels == k]
        if len(members) == 0:
            # Empty cluster: keep the zero row rather than dividing by zero.
            continue

        # Accumulate in float so integer counts are not floor-divided.
        meanVec = np.mean(members, axis=0, dtype=float)

        peak = np.max(meanVec)
        if peak != 0:
            # Scale so the most frequent term of the group has weight 1.
            meanVec = meanVec / peak

        MeanVec[k] = meanVec

    return MeanVec
Esempio n. 3
0
def getMeanVec(domain, groupID):
    '''
    Get the intragroup mean TF vector of one group.

    domain  -- 'ActItem', 'DietItem', 'DietType' or 'ActType'; selects the
               module-level label array and the matching TF array from
               dataGen4DietAct.
    groupID -- label value of the group to average.

    Returns a 1-D array of length X.shape[1] holding the (unscaled) mean of
    the rows whose label equals ``groupID``. When no row carries that label
    a zero vector is returned instead of NaN.

    Raises ValueError for any other domain.
    '''
    if domain == 'ActItem':
        labels = labelsActItem
        X = dataGen4DietAct.genActItemTFArray()
    elif domain == 'DietItem':
        labels = labelsDietItem
        X = dataGen4DietAct.genDietItemTFArray()
    elif domain == 'DietType':
        labels = labelsDietType
        X = dataGen4DietAct.genDietTypeTFArray()
    elif domain == 'ActType':
        labels = labelsActType
        X = dataGen4DietAct.genActTypeTFArray()
    else:
        raise ValueError("unsupported domain: %r (expected 'ActItem', "
                         "'DietItem', 'DietType' or 'ActType')" % (domain,))

    members = X[labels == groupID]
    if len(members) == 0:
        # Nothing to average: avoid the 0/0 division.
        return np.zeros(X.shape[1])

    # Accumulate in float so integer counts are not floor-divided.
    return np.mean(members, axis=0, dtype=float)
def _saveTFMatrixFigure(matrix, name):
    '''Draw ``matrix`` with matshow and save it as visTForTFIDFMatrix/<name>.'''
    # plt.matshow opens its own figure when no fignum is given, so a
    # preceding plt.figure() would only leave a blank figure behind.
    plt.matshow(matrix)
    plt.colorbar()
    plt.title(name)
    plt.savefig('visTForTFIDFMatrix/' + name)


def visTFMatrix():
    '''
    Save heat maps of the normalised TF matrices.

    Six images are written below ``visTForTFIDFMatrix/``: activity items,
    diet items, their combination, diet types, activity types, and the
    combination of the two type matrices. Each image is titled with its own
    file name. The figures are left open for the caller.
    '''
    tf_ActItem = utilise.normArray(dataGen4DietAct.genActItemTFArray())
    _saveTFMatrixFigure(tf_ActItem, 'actTFMatrix')

    tf_DietItem = utilise.normArray(dataGen4DietAct.genDietItemTFArray())
    _saveTFMatrixFigure(tf_DietItem, 'dietTFMatrix')

    _saveTFMatrixFigure(utilise.genCombiArray(tf_ActItem, tf_DietItem),
                        'actDietTFMatrix')

    tf_DietType = utilise.normArray(dataGen4DietAct.genDietTypeTFArray())
    _saveTFMatrixFigure(tf_DietType, 'dietTypeTFMatrix')

    tf_ActType = utilise.normArray(dataGen4DietAct.genActTypeTFArray())
    _saveTFMatrixFigure(tf_ActType, 'actTypeTFMatrix')

    _saveTFMatrixFigure(utilise.genCombiArray(tf_ActType, tf_DietType),
                        'actDietTypeTFMatrix')
Esempio n. 5
0
def TFEclud(domain):
    '''
    Build the pairwise similarity table of the normalised TF rows.

    domain -- 'ActItem', 'DietItem', 'DietType' or 'ActType'; selects the TF
              array generator of dataGen4DietAct.

    Returns a dict of dicts where ``result[i][j]`` is
    ``simEclud(tf[i], tf[j])`` for every ordered pair of row indices,
    including i == j.

    Raises ValueError for any other domain.
    '''
    if domain == 'ActItem':
        tf = dataGen4DietAct.genActItemTFArray()
    elif domain == 'DietItem':
        tf = dataGen4DietAct.genDietItemTFArray()
    elif domain == 'DietType':
        tf = dataGen4DietAct.genDietTypeTFArray()
    elif domain == 'ActType':
        tf = dataGen4DietAct.genActTypeTFArray()
    else:
        raise ValueError("unsupported domain: %r (expected 'ActItem', "
                         "'DietItem', 'DietType' or 'ActType')" % (domain,))

    tf = normArray(tf)
    count = tf.shape[0]

    similarity_dict = {}
    for i in range(count):
        # Every ordered pair is evaluated; simEclud is not assumed symmetric.
        similarity_dict[i] = {j: simEclud(tf[i], tf[j]) for j in range(count)}
    return similarity_dict
def _writeGroupKey(sheet, row, col, groups, subject):
    '''Write the key of the first group containing ``subject``, if any.'''
    for key in groups:
        if subject in groups[key]:
            sheet.write(row, col, key)
            return


def _writeTFSheet(sheet, groupTitle, groups, columnNames, matrix):
    '''Fill ``sheet`` with one TF row per subject of ``available_list``.'''
    sheet.write(0, 0, 'SubjId')
    sheet.write(0, 1, groupTitle)
    for offset, name in enumerate(columnNames):
        sheet.write(0, offset + 2, name)

    for index, subject in enumerate(available_list):
        row = index + 1  # row 0 holds the header
        sheet.write(row, 0, subject)
        _writeGroupKey(sheet, row, 1, groups, subject)
        for offset in range(len(columnNames)):
            sheet.write(row, offset + 2, matrix[index][offset])


def buildSubAveInfo():
    '''
    Export per-subject information to the workbook ``SubAveInfo.xls``.

    Three sheets are written:
      AveInfo -- one row per entry of ``sleep_list`` with the activity and
                 diet group keys, sleep hours, median heart-rate figures and
                 the values returned by ``slpInfoRetrv.getDemoGInfo``.
      DietTF  -- one row per entry of ``available_list`` with its diet group
                 key and its row of the normalised diet-type TF array.
      ActTF   -- the same layout for the activity-type TF array.
    '''
    book = xlwt.Workbook()
    summary = book.add_sheet('AveInfo')

    groupAct = dietActInfoRetrv.getGroups(labelsActType)
    groupDiet = dietActInfoRetrv.getGroups(labelsDietType)

    (Age, Gender, Height, Weight, BMI, FatFree, FatMass, PercFat,
     Vo2max) = slpInfoRetrv.getDemoGInfo()
    SlpHours = slpInfoRetrv.getSlpHours()
    MedianHR = slpInfoRetrv.getMedianHR()
    MedianHRBefore = slpInfoRetrv.getMedianHRBefore()
    MedianHRAfter = slpInfoRetrv.getMedianHRAfter()

    header = [
        'SubjId', 'ActGroup', 'DietGroup', 'HoursSleep', 'MedianHR',
        'MedianHRBefore', 'MedianHRAfter', 'age', 'gender', 'height', 'weight',
        'BMI', 'FatFreeMass', 'FatMass', 'PercFat', 'vo2max'
    ]
    for col, title in enumerate(header):
        summary.write(0, col, title)

    # Per-subject series in the column order of header[3:].
    series = [
        SlpHours, MedianHR, MedianHRBefore, MedianHRAfter, Age, Gender,
        Height, Weight, BMI, FatFree, FatMass, PercFat, Vo2max
    ]
    for index, subject in enumerate(sleep_list):
        row = index + 1  # row 0 holds the header
        summary.write(row, 0, subject)
        _writeGroupKey(summary, row, 1, groupAct, subject)
        _writeGroupKey(summary, row, 2, groupDiet, subject)
        for offset, values in enumerate(series):
            summary.write(row, offset + 3, values[index])

    dietSheet = book.add_sheet('DietTF')
    dietNames = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())
    dietTF = utilise.normArray(dataGen4DietAct.genDietTypeTFArray())
    _writeTFSheet(dietSheet, 'DietGroup', groupDiet, dietNames, dietTF)

    actSheet = book.add_sheet('ActTF')
    actNames = utilise.itemDict2list(dataGen4DietAct.genActTypeDict())
    actTF = utilise.normArray(dataGen4DietAct.genActTypeTFArray())
    _writeTFSheet(actSheet, 'ActGroup', groupAct, actNames, actTF)

    book.save('SubAveInfo.xls')
Esempio n. 7
0
    i = 0
    for subjectID in available_list:
        duration = dietActInfoRetrv.getDuration(subjectID)
        
        for n in range(1,duration+1):
            dictWithTime = buildTypeIndex.build_daily_single_diet_index_with_time4DC(subjectID,n)
            for time in dictWithTime:
                for key in type_dict:
                    if type_dict[key] in dictWithTime[time]:
                        array[i,key] += dictWithTime[time][type_dict[key]] 
        i += 1 
    return array 

# NOTE(review): both calls run at import time and their results (`aa`, `a`)
# are not referenced anywhere in the visible code -- presumably leftovers
# from comparing the two diet-type TF arrays by hand; confirm before removing.
aa = getDietTypeTFArray4DC()
a = dataGen4DietAct.genDietTypeTFArray()


def sihouetteScore(domain):
    '''
    Fit K-Means models (2 to 6 clusters) on the PCA-reduced TF array.

    domain -- 'DietType' or 'ActType'; selects which ``*TFArray4DC`` helper
              supplies the data.

    The array is normalised with ``utilise.normArray`` and projected onto
    two PCA components before clustering.

    NOTE(review): as visible here, the fitted models are discarded and
    nothing is printed or returned -- the scoring step appears to be
    missing; confirm against the original source.

    Raises ValueError for any other domain.
    '''
    if domain == 'DietType':
        X = getDietTypeTFArray4DC()
    elif domain == 'ActType':
        X = getActTypeTFArray4DC()
    else:
        raise ValueError("unsupported domain: %r (expected 'DietType' or "
                         "'ActType')" % (domain,))

    X = utilise.normArray(X)
    reduced_data = PCA(n_components=2).fit_transform(X)

    for n_clusters in (2, 3, 4, 5, 6):
        clusterer = KMeans(n_clusters=n_clusters, n_init=300)
        clusterer.fit(reduced_data)
def _topThreeValues(vec):
    '''
    Return the three largest values of ``vec`` as a tuple.

    After each maximum is taken, every entry equal to it is reset to 0
    before the next maximum is searched.
    '''
    remaining = np.copy(vec)
    tops = []
    for _ in range(3):
        top = np.max(remaining)
        tops.append(top)
        remaining[remaining == top] = 0
    return tuple(tops)


def bestLabel(labelsDietType, labelsActType):
    '''
    Plot and export the mean TF vector of every cluster of a labelling.

    labelsDietType -- cluster labels for the diet types, in the form
                      accepted by ``utilise.string2array``.
    labelsActType  -- same for the activity types.

    For each supported entry of the module-level ``Domain`` collection
    ('DietType', 'ActType') the normalised TF array is split by label. The
    column names and then one mean vector per cluster are appended as rows
    to ``tempLabels.xls``. The mean vectors are drawn into one figure per
    domain, with the entries matching the three largest values of each
    vector annotated and printed. The figure is saved below
    ``visClustering<domain>Pattern/``.

    Domains without a branch are skipped. Clusters without members are
    skipped as well, so they add no row to the sheet.
    '''
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0

    for domain in Domain:
        if domain == 'DietType':
            labels = utilise.string2array(labelsDietType)
            row_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())
            X = dataGen4DietAct.genDietTypeTFArray()
        elif domain == 'ActType':
            labels = utilise.string2array(labelsActType)
            row_labels = utilise.itemDict2list(dataGen4DietAct.genActTypeDict())
            X = dataGen4DietAct.genActTypeTFArray()
        else:
            # No labels/data for this domain: skip it instead of reusing the
            # values left over from the previous iteration.
            continue
        X = utilise.normArray(X)

        # Write the column names to the excel file.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        plt.figure()

        n_clusters = np.max(labels) + 1
        positions = range(X.shape[1])

        for k in range(n_clusters):
            group = X[labels == k]
            if len(group) == 0:
                # The mean of an empty cluster is undefined.
                continue

            # X is already normalised, so the mean needs no rescaling.
            meanVec = np.mean(group, axis=0)

            # Write the mean vector of the group to the excel file.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            topValues = _topThreeValues(meanVec)

            plt.plot(positions, meanVec)
            for j in positions:
                if meanVec[j] in topValues:
                    # A single pre-formatted string prints identically under
                    # Python 2 and Python 3.
                    print('%s %s %s %s %s' % (k, domain, n_clusters,
                                              meanVec[j], row_labels[j]))
                    plt.text(positions[j], meanVec[j], row_labels[j])

        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_'+str(n_clusters)+'_groupFreq')

    workbookW.save('tempLabels.xls')
def HC(domain, para):
    '''
    Hierarchically cluster rows and columns and export a dendrogram heat map.

    domain -- 'DietItem', 'ActItem', 'DietType' or 'ActType'.
    para   -- a member of the module-level ``Metric`` collection ('TF' or
              'TFIDF': the normalised feature array is clustered) or of
              ``Sim`` (the array built from ``utilise.SimilarityDict`` is
              clustered). When ``para`` is in both, the similarity array is
              used for the data and the item names for the column labels.

    Rows and columns are linked with Ward's method on Euclidean distances.
    The reordered matrix is exported to
    ``VisClustering<domain>Pattern/Hierarchy_<para>_ward.png``.

    Raises ValueError for an unknown domain, or for a ``para`` that selects
    no data.
    '''
    # Names of the dataGen4DietAct generators, keyed by domain. They are
    # resolved lazily so only the generator that is needed is touched.
    tfGenerators = {
        'DietItem': 'genDietItemTFArray',
        'ActItem': 'genActItemTFArray',
        'DietType': 'genDietTypeTFArray',
        'ActType': 'genActTypeTFArray',
    }
    tfidfGenerators = {
        'DietItem': 'DietItemTfidfArray',
        'ActItem': 'ActItemTfidfArray',
        'DietType': 'DietTypeTfidfArray',
        'ActType': 'ActTypeTfidfArray',
    }
    labelGenerators = {
        'DietItem': 'genDietItemDict',
        'ActItem': 'genActItemDict',
        'DietType': 'genDietTypeDict',
        'ActType': 'genActTypeDict',
    }
    if domain not in labelGenerators:
        raise ValueError("unsupported domain: %r (expected 'DietItem', "
                         "'ActItem', 'DietType' or 'ActType')" % (domain,))

    usesMetric = para in Metric
    usesSim = para in Sim
    if not (usesMetric or usesSim):
        raise ValueError('unsupported para: %r' % (para,))

    if usesMetric:
        if para == 'TF':
            generatorName = tfGenerators[domain]
        elif para == 'TFIDF':
            generatorName = tfidfGenerators[domain]
        else:
            raise ValueError('no feature array for metric: %r' % (para,))
        X = utilise.normArray(getattr(dataGen4DietAct, generatorName)())

    if usesSim:
        Similarity_dict = utilise.SimilarityDict(domain, para)
        X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    # method can be ward, complete, average
    method = 'ward'
    row_metric = 'euclidean'
    column_metric = 'euclidean'

    # linkage expects either raw observations or a CONDENSED distance
    # matrix. A square-form matrix would be read as a set of observation
    # vectors, so the pdist output is passed directly.
    Y1 = sch.linkage(ssd.pdist(X, metric=row_metric), method=method)
    row_idxing = sch.leaves_list(Y1)

    Y2 = sch.linkage(ssd.pdist(X.T, metric=column_metric), method=method)
    col_idxing = sch.leaves_list(Y2)

    # Reorder columns, then rows, to follow the dendrogram leaves.
    heatmap_array = X[:, col_idxing][row_idxing, :]
    top_dendrogram = Y2  # (n-1) x 4 linkage of the columns
    side_dendrogram = Y1  # (m-1) x 4 linkage of the rows

    row_labels = range(X.shape[0])
    if usesSim:
        col_labels = range(X.shape[1])
    if usesMetric:
        col_labels = utilise.itemDict2list(
            getattr(dataGen4DietAct, labelGenerators[domain])())

    col_idxing = list(col_idxing)
    row_idxing = list(row_idxing)
    print(col_idxing)

    new_row_labels = [str(row_labels[i]) for i in row_idxing]
    new_col_labels = [str(col_labels[j]) for j in col_idxing]

    heatmap = pdh.DendroHeatMap(heat_map_data=heatmap_array,
                                left_dendrogram=side_dendrogram,
                                top_dendrogram=top_dendrogram)
    heatmap.title = 'HC_' + domain + '_' + para + '_' + method
    heatmap.row_labels = new_row_labels
    heatmap.col_labels = new_col_labels

    heatmap.export('VisClustering' + domain + 'Pattern/Hierarchy_' + para +
                   '_' + method + '.png')