def originalDataSubSupport():
    """Average, over subjects, the share of each feature that falls on each label.

    The frame from ``artificialDataGenerator.originalData()`` is reduced to a
    fixed set of columns and renamed to short feature names. For every subject
    ID in the hard-coded ``sleep_list`` and every feature column (everything
    except 'label' and 'ID'), the sum of the feature's positive values on rows
    with label 0, and on rows with label 1, is divided by the subject's total
    for that feature. Subjects whose total for a feature is zero contribute
    nothing to that feature. The per-subject shares are then averaged.

    Returns:
        tuple: ``(dd_low, dd_high)`` -- two dicts mapping feature name to the
        mean share on label 0 and on label 1 respectively. A feature for which
        no subject had a non-zero total maps to 0.0 in both dicts.
    """
    source_columns = [
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
        'seafood', 'snack', 'starchyP', 'vegetables', 'entertainmentRelax',
        'social', 'sport', 'transportation1', 'transportation2',
        'transportation3', 'workStudy', 'gender', 'label', 'ID'
    ]
    short_names = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat', 'seafood',
        'snack', 'starchy', 'vegetables', 'leisure', 'social', 'sport', 'walk',
        'car', 'bike', 'workStudy', 'gender', 'label', 'ID'
    ]
    sleep_list = [
        '044', '045', '048', '050', '051', '052', '053', '056', '058', '059',
        '060', '061', '063', '064', '065', '066', '067', '068', '069', '070',
        '071', '072', '073', '074', '075'
    ]

    df = artificialDataGenerator.originalData()
    print(df.columns)

    # Explicit copy: a column is assigned below, which pandas may otherwise
    # flag as a write to a slice of the generator's frame.
    df = df[source_columns].copy()
    df.columns = short_names

    # Convert the whole column at once; this does not depend on the index
    # being 0..n-1 the way a per-row lookup by position-as-label does.
    df['ID'] = df['ID'].astype(int)

    features = [name for name in df.columns if name not in ('label', 'ID')]
    low_shares = {name: [] for name in features}
    high_shares = {name: [] for name in features}

    for sub in sleep_list:
        df_sub = df[df['ID'] == int(sub)]

        for name in features:
            # float() forces true division under Python 2 even when the
            # column holds integers. Series.sum() skips NaN by default.
            total = float(df_sub[name].sum())
            if total == 0:
                # Subject never shows this feature: leave it out of the mean.
                continue

            positive = df_sub[df_sub[name] > 0]
            low_shares[name].append(
                positive[positive['label'] == 0][name].sum() / total)
            high_shares[name].append(
                positive[positive['label'] == 1][name].sum() / total)

    dd_low = {}
    dd_high = {}
    for name in features:
        count = len(low_shares[name])
        # Both lists always grow together, so one length check covers both.
        if count == 0:
            dd_low[name] = 0.0
            dd_high[name] = 0.0
        else:
            dd_low[name] = sum(low_shares[name]) / count
            dd_high[name] = sum(high_shares[name]) / count

    return dd_low, dd_high
def originalData():
    """Compute, per feature, the share of its column total carried by each label.

    The frame from ``artificialDataGenerator.originalData()`` is reduced to a
    fixed set of columns and renamed to short feature names. For every feature
    column (everything except 'label') the sum of its positive values on rows
    with label 0, and on rows with label 1, is divided by the column total.

    Returns:
        tuple: ``(dd, dd_low, dd_high, dd_diff)`` where, keyed by feature name,
            dd      -- ``[share_on_label_0, share_on_label_1]``
            dd_low  -- the label-0 share
            dd_high -- the label-1 share
            dd_diff -- larger share minus smaller share
        A feature whose column total is zero gets ``[0.0, 0.0]`` and a
        difference of 0.0.
    """
    df = artificialDataGenerator.originalData()
    print(df.columns)
    df = df[[
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
        'seafood', 'snack', 'starchyP', 'vegetables', 'entertainmentRelax',
        'social', 'sport', 'transportation1', 'transportation2',
        'transportation3', 'workStudy', 'gender', 'label'
    ]]
    df.columns = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat', 'seafood',
        'snack', 'starchy', 'vegetables', 'leisure', 'social', 'sport', 'walk',
        'car', 'bike', 'workStudy', 'gender', 'label'
    ]

    dd = {}
    dd_low = {}
    dd_high = {}
    dd_diff = {}
    for name in df.columns:
        if name == 'label':
            continue

        # float() forces true division under Python 2 even when the column
        # holds integers. Series.sum() skips NaN by default.
        total = float(df[name].sum())
        if total == 0:
            # Nothing to apportion; avoid dividing by zero.
            shares = [0.0, 0.0]
        else:
            positive = df[df[name] > 0]
            shares = [
                positive[positive['label'] == 0][name].sum() / total,
                positive[positive['label'] == 1][name].sum() / total,
            ]

        dd[name] = shares
        dd_low[name] = shares[0]
        dd_high[name] = shares[1]
        dd_diff[name] = max(shares) - min(shares)

    return dd, dd_low, dd_high, dd_diff
# Example #3
# 0
def genOriginalActDietTypeDataSet():
    """Turn each row of the original data into a tuple of its active columns.

    The frame from ``artificialDataGenerator.originalData()`` is re-indexed
    to 0..n-1. For every row, the names of the columns whose value is greater
    than zero are collected in column order, skipping 'compositeP', 'label',
    'sleepTime', 'gender', 'ID' and 'others'.

    Prints the number of rows and returns a tuple holding one tuple of column
    names per row.
    """
    skipped = ('compositeP', 'label', 'sleepTime', 'gender', 'ID', 'others')

    frame = artificialDataGenerator.originalData()
    row_count = frame.shape[0]
    # Positional labels so that frame[column][row] below addresses row `row`.
    frame.index = range(row_count)

    dataset = tuple(
        tuple(
            column for column in frame.columns
            if column not in skipped and frame[column][row] > 0
        )
        for row in range(row_count)
    )
    print(len(dataset))
    return dataset
def visdiff():
    """Save KDE plots comparing new, original and surrogate feature columns.

    For each column of the frame returned by ``newFeatureFrame()`` a fresh
    figure is drawn with three kernel-density curves: the new feature, the
    matching column of the original data and the matching column of the
    surrogate data. Four short names used by the new frame are translated to
    the column names of the other two frames; every other name is used as is.
    Each figure is saved as ``distribution/<source column name>_diff`` and
    then closed. The ``distribution/`` directory is not created here.

    Returns nothing.
    """
    # Short name in the new feature frame -> column name in the original and
    # surrogate frames.
    source_names = {
        'walk': 'transportation1',
        'car': 'transportation2',
        'bike': 'transportation3',
        'leisure': 'entertainmentRelax',
    }

    surrogateDF, _labels = artificialDataGenerator.artificialData()
    # NOTE(review): other call sites in this file use originalData() as a
    # single DataFrame; confirm this two-value unpacking matches the
    # generator version in use.
    df, _cols = artificialDataGenerator.originalData()
    newDF = newFeatureFrame()

    for new_name in newDF.columns:
        source_name = source_names.get(new_name, new_name)

        fig = plt.figure()
        try:
            newDF[new_name].plot.kde(label='new')
            df[source_name].plot.kde(label='original')
            surrogateDF[source_name].plot.kde(label='surrogate')

            plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
            plt.title(source_name)
            plt.savefig('distribution/' + source_name + '_diff')
        finally:
            # pyplot keeps every figure alive until it is closed; release it
            # so one figure per column does not accumulate.
            plt.close(fig)
#            accuracy = accuracy_score(labelTest,pre_labels)
#
#            scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5)
#            accuracy = scores.mean()
#
#            if accuracy > bestAcc:
#                bestAcc = accuracy
#                #p,r,f,s = precision_recall_fscore_support(labelTest,pre_labels)
#
#            # p,r,f,s = precision_recall_fscore_support(labelTest,pre_labels)
#            # cross_val_score(clf, dataset, labels)
#        print bestAcc#,p,r,f,s
'''
original data test 
'''
# Mean 5-fold cross-validation score of an L1-penalised logistic regression
# trained on the selected columns of the original data, with 'label' as target.
df = artificialDataGenerator.originalData()
temp_df = df[[
    'compositeP', 'others', 'alcoholD', 'caffeineD', 'dairyP', 'eggP',
    'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables',
    'entertainmentRelax', 'social', 'sport', 'transportation1',
    'transportation2', 'transportation3', 'workStudy', 'gender'
]]
# .values yields the same ndarray as the deprecated DataFrame.as_matrix().
dataset = temp_df.values
labels = list(df['label'])
clf = LogisticRegression(penalty='l1', C=0.5)
scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5)
accuracy = scores.mean()
print(accuracy)
'''
original data test pattern features
'''
def clusteringKmeansLabelsOriginalDays():
    """Cluster the original data per domain with k-means and export the profiles.

    For every entry of the module-level ``Domain`` iterable a column subset is
    chosen ('DietType' -> the diet columns with 2 clusters, anything else ->
    the activity columns with 3 clusters), passed through
    ``utilise.normArray`` and clustered with ``KMeans``.

    Written to sheet 'sheet1', per domain:
      * one header row with the selected column names;
      * per cluster, one row with the cluster's mean vector, followed by one
        row ``(cluster index, domain, column name, mean)`` for every column
        whose mean equals one of the cluster's three largest means.

    Per domain one figure is drawn with a line per cluster mean vector and a
    text label at every point, saved under ``visClustering<domain>Pattern/``
    and then closed. The workbook is finally saved as ``tempLabels.xls``.

    Returns nothing.
    """
    diet_columns = ['alcoholD','caffeineD','dairyP','eggP','fruitP','grainP','meatP','seafood','snack','starchyP','vegetables']
    activity_columns = ['entertainmentRelax','social','sport','transportation1','transportation2','transportation3','workStudy']
    # Column name -> text shown in the plot; other columns show their own name.
    plot_names = {
        'transportation1': 'walk',
        'transportation2': 'car',
        'transportation3': 'bike',
    }

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0  # next free row in the sheet, shared across all domains

    # NOTE(review): other call sites in this file use originalData() as a
    # single DataFrame; confirm this two-value unpacking matches the
    # generator version in use.
    df, _cols = artificialDataGenerator.originalData()
    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            columns, cluster_count = diet_columns, 2
        else:
            columns, cluster_count = activity_columns, 3

        df_temp = df[columns]
        row_labels = df_temp.columns
        # .values yields the same ndarray as the deprecated as_matrix().
        X = utilise.normArray(df_temp.values)
        kmeans = KMeans(n_clusters=cluster_count, n_init=3000)
        kmeans.fit(X)
        labels = kmeans.labels_

        # Header row: the column names of this domain.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        n_clusters = np.max(labels) + 1
        x = range(X.shape[1])

        fig = plt.figure()
        try:
            for k in range(n_clusters):
                group = np.asarray(X[labels == k])
                meanVec = np.mean(group, axis=0)
                stdVec = np.std(group, axis=0)
                print(stdVec)

                # One row holding the mean vector of this cluster.
                for col, value in enumerate(meanVec):
                    ws.write(rowW, col, value)
                rowW += 1

                # Three largest distinct means: take the maximum, zero every
                # entry equal to it, and repeat on the remainder.
                firstMax = np.max(meanVec)
                tempVec = np.copy(meanVec)
                tempVec[tempVec == firstMax] = 0
                secondMax = np.max(tempVec)
                tempVec2 = np.copy(tempVec)
                tempVec2[tempVec2 == secondMax] = 0
                thirdMax = np.max(tempVec2)

                plt.plot(x, meanVec)
                for j in range(X.shape[1]):
                    if meanVec[j] == firstMax or meanVec[j] == secondMax or meanVec[j] == thirdMax:
                        ws.write(rowW, 0, k)
                        ws.write(rowW, 1, domain)
                        ws.write(rowW, 2, row_labels[j])
                        ws.write(rowW, 3, meanVec[j])
                        rowW += 1
                        print('%s %s %s %s %s' % (
                            k, domain, n_clusters, meanVec[j], row_labels[j]))
                    plt.text(x[j], meanVec[j],
                             plot_names.get(row_labels[j], row_labels[j]))

            plt.title(domain+'_TF_KMeans_'+str(n_clusters))
            plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_OriginalDays_'+str(n_clusters)+'_groupFreq')
        finally:
            # pyplot keeps every figure alive until it is closed.
            plt.close(fig)

    workbookW.save('tempLabels.xls')