def newData():
    """Return, per feature, the share of its total that falls in each label class.

    Loads the frame from ``newDataProcess.newFeatureFrame()``, keeps the
    diet / activity / gender columns plus ``label`` and renames them to short
    names.  For every feature ``f`` and label class ``c`` in ``(0, 1)``::

        share = sum(f over rows with f > 0 and label == c) / sum(f over all rows)

    The division is always a true (floating point) division, so integer
    columns are not truncated to 0 or 1 under Python 2.  A feature whose
    column total is 0 gets the share ``0.0`` for both classes instead of an
    undefined quotient.

    Returns:
        tuple: ``(dd, dd_low, dd_high, dd_diff)``, four dicts keyed by the
        short feature name:

        * ``dd[f]``      -- ``[share for label 0, share for label 1]``
        * ``dd_low[f]``  -- ``dd[f][0]``
        * ``dd_high[f]`` -- ``dd[f][1]``
        * ``dd_diff[f]`` -- ``max(dd[f]) - min(dd[f])``
    """
    df = newDataProcess.newFeatureFrame()
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(df.columns)
    df = df[[
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
        'seafood', 'snack', 'starchyP', 'vegetables', 'leisure', 'social',
        'sport', 'walk', 'car', 'bike', 'workStudy', 'gender', 'label'
    ]]
    # Positional rename: same order as the selection above, suffixes dropped.
    df.columns = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat', 'seafood',
        'snack', 'starchy', 'vegetables', 'leisure', 'social', 'sport', 'walk',
        'car', 'bike', 'workStudy', 'gender', 'label'
    ]

    dd = {}
    dd_low = {}
    dd_high = {}
    dd_diff = {}
    for feature in df.columns:
        if feature == 'label':
            continue

        column = df[feature]
        total = sum(column)
        if total == 0:
            # Nothing recorded for this feature: no share for either class.
            shares = [0.0, 0.0]
        else:
            total = float(total)  # force true division on integer columns
            positive = df[column > 0]
            shares = [
                sum(positive[positive['label'] == label_class][feature]) / total
                for label_class in (0, 1)
            ]

        dd[feature] = shares
        dd_low[feature] = shares[0]
        dd_high[feature] = shares[1]
        dd_diff[feature] = max(shares) - min(shares)

    return dd, dd_low, dd_high, dd_diff
def newDataSubSupport(n_subjects=30):
    """Average, over subjects, each feature's per-subject share in label 0 / label 1.

    Loads the frame from ``newDataProcess.newFeatureFrame()``, keeps the
    diet / activity / gender columns plus ``label`` and ``ID`` and renames
    them to short names.  For every subject ID in ``range(n_subjects)`` and
    every feature ``f`` whose total for that subject is non-zero, the share

        sum(f over the subject's rows with f > 0 and label == c) / sum(f over the subject's rows)

    is computed for ``c = 0`` and ``c = 1``.  Subjects whose total for a
    feature is zero (including subjects with no rows) contribute nothing for
    that feature.  The per-subject shares are then averaged per feature.

    All divisions are true (floating point) divisions.  A feature for which
    no subject contributed a share is reported as ``0.0`` rather than raising
    ``ZeroDivisionError``.

    Args:
        n_subjects: Number of consecutive subject IDs, starting at 0, to
            examine.  Defaults to 30.

    Returns:
        tuple: ``(dd_low, dd_high)``, two dicts keyed by the short feature
        name holding the averaged label-0 and label-1 shares.
    """
    df = newDataProcess.newFeatureFrame()
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(df.columns)
    df = df[[
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
        'seafood', 'snack', 'starchyP', 'vegetables', 'leisure', 'social',
        'sport', 'walk', 'car', 'bike', 'workStudy', 'gender', 'label', 'ID'
    ]]
    # Positional rename: same order as the selection above, suffixes dropped.
    df.columns = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat', 'seafood',
        'snack', 'starchy', 'vegetables', 'leisure', 'social', 'sport', 'walk',
        'car', 'bike', 'workStudy', 'gender', 'label', 'ID'
    ]

    # Integer view of the IDs, aligned with df's index.  Replaces the
    # row-by-row set_value() loop, which needed a 0..n-1 index and a
    # deprecated API to achieve the same comparison result.
    subject_ids = df['ID'].astype(int)

    features = [name for name in df.columns if name not in ('label', 'ID')]
    dd_low_list = {name: [] for name in features}
    dd_high_list = {name: [] for name in features}

    for subject in range(n_subjects):
        df_sub = df[subject_ids == subject]

        for feature in features:
            total = sum(df_sub[feature])
            if total == 0:
                # The share is undefined for this subject; leave it out of
                # the average instead of counting it as zero.
                continue
            total = float(total)  # force true division on integer columns
            positive = df_sub[df_sub[feature] > 0]
            dd_low_list[feature].append(
                sum(positive[positive['label'] == 0][feature]) / total)
            dd_high_list[feature].append(
                sum(positive[positive['label'] == 1][feature]) / total)

    dd_low = {}
    dd_high = {}
    for feature in features:
        low_shares = dd_low_list[feature]
        high_shares = dd_high_list[feature]
        # Both lists always have the same length; guard the empty case.
        dd_low[feature] = (
            sum(low_shares) / float(len(low_shares)) if low_shares else 0.0)
        dd_high[feature] = (
            sum(high_shares) / float(len(high_shares)) if high_shares else 0.0)

    return dd_low, dd_high
# Example no. 3
# 0
def genNewActDietTypeDataSet():
    """Build one transaction per row listing the retained features that are positive.

    Loads the frame from ``newDataProcess.newFeatureFrame()``.  Every column
    except the excluded ones below is retained; for each row, the names of
    the retained columns whose value is ``> 0`` are collected in column
    order.

    Rows are visited by position, so the result does not depend on the
    frame's index labels (the previous ``.ix[i, j]`` lookup was label-based
    on integer indexes and uses an accessor that newer pandas removed).

    Prints the number of transactions as a side effect.

    Returns:
        tuple: One inner tuple of column names per row of the frame.
    """
    excluded = frozenset([
        'grainP', 'ID', 'gender', 'age', 'label', 'vegetables', 'meatP',
        'dairyP', 'caffeineD', 'snack',
    ])

    newDF = newDataProcess.newFeatureFrame()
    # Decide once which columns take part, rather than once per cell.
    kept = [name for name in newDF.columns if name not in excluded]

    dataset = tuple(
        tuple(name for name, value in zip(kept, row) if value > 0)
        for row in newDF[kept].values
    )

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(len(dataset))
    return dataset
# Example no. 4
# 0
def genNewActDietTypeDataSetForMoreSleep():
    """Build per-row transactions of positive features for rows labelled 1.

    Loads the frame from ``newDataProcess.newFeatureFrame()``, keeps only the
    rows whose ``label`` equals 1 (presumably the "more sleep" class, going
    by the function name -- confirm against the labelling code) and
    renumbers them 0..n-1.  For each remaining row, the names of all
    non-skipped columns whose value is ``> 0`` are collected in column order.

    Prints the number of transactions as a side effect.

    Returns:
        tuple: One inner tuple of column names per retained row.
    """
    # Columns that never appear in a transaction.
    skipped = (
        'grainP', 'ID', 'gender', 'age', 'label', 'vegetables', 'meatP',
        'dairyP', 'caffeineD', 'snack',
    )

    df = newDataProcess.newFeatureFrame()
    df = df[df['label'] == 1]
    # Renumber so that the label lookups df[name][row] below line up with
    # range(row_count).
    row_count = df.shape[0]
    df.index = range(row_count)

    candidates = [name for name in df.columns if name not in skipped]
    dataset = tuple(
        tuple(name for name in candidates if df[name][row] > 0)
        for row in range(row_count)
    )

    print(len(dataset))
    return dataset
# Logistic regression on a four-feature subset with every value capped at 1.
# NOTE(review): `df` is not assigned above this point in the part of the file
# visible here -- this fragment presumably relies on an earlier load; confirm.
#
# clip(upper=1) replaces the former cell-by-cell loop that called the
# deprecated DataFrame.set_value() and depended on a 0..n-1 index: values
# greater than 1 become 1, everything else is left untouched.
temp_df = df[['alcoholD', 'eggP', 'seafood', 'gender']].clip(upper=1)
# .values is the supported spelling of the deprecated as_matrix().
dataset = temp_df.values
labels = list(df['label'])
# L1-penalised logistic regression, scored with 5-fold cross-validation.
clf = LogisticRegression(penalty='l1', C=0.5)
scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5)
accuracy = scores.mean()
print(accuracy)
# new data test
#
# Same L1-penalised logistic regression as a 5-fold cross-validation run, this
# time on the full diet / activity / gender feature set without any capping.
df = newDataProcess.newFeatureFrame()
temp_df = df[[
    'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
    'seafood', 'snack', 'starchyP', 'vegetables', 'leisure', 'social', 'sport',
    'walk', 'car', 'bike', 'workStudy', 'gender'
]]
# .values is the supported spelling of the deprecated as_matrix().
dataset = temp_df.values
labels = list(df['label'])
clf = LogisticRegression(penalty='l1', C=0.5)
scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5)
accuracy = scores.mean()
print(accuracy)

# new data test
#
# Reload the feature frame at module level.
df = newDataProcess.newFeatureFrame()
def clusteringKmeansLabelsNewDays():
    """Cluster the feature frame per domain with KMeans; export and plot the profiles.

    For every entry of the module-level ``Domain`` sequence the function
    selects the diet-type columns (entry equal to ``'DietType'``) or the
    activity columns (any other entry), normalises the matrix with
    ``utilise.normArray`` and fits ``KMeans`` with 2 or 3 clusters
    respectively (``n_init=3000`` in both cases).

    Side effects:
      * prints the frame's columns once per domain, the standard-deviation
        vector of every cluster, and one line per highlighted feature;
      * fills a single worksheet with, per domain, a header row of column
        names and, per cluster, a row holding the mean vector followed by
        one ``(cluster, domain, feature, mean)`` row for each highlighted
        feature;
      * opens one figure per domain with one line per cluster and a text
        label at every point, and saves it under
        ``visClustering<domain>Pattern/``;
      * saves the workbook as ``tempLabels.xls``.

    A feature is highlighted when its cluster mean equals the largest mean,
    the largest value left after zeroing that one, or the largest value left
    after zeroing the second one as well.

    Returns:
        None.
    """
    diet_columns = ['alcoholD','caffeineD','dairyP','eggP','fruitP','grainP','meatP','seafood','snack','starchyP','vegetables']
    activity_columns = ['leisure','social','sport','walk','car','bike','workStudy']

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0  # next free worksheet row, shared by all domains

    df = newDataProcess.newFeatureFrame()
    for domain in Domain:
        print(df.columns)

        # Only the column set and the cluster count differ between domains.
        if domain == 'DietType':
            columns, cluster_count = diet_columns, 2
        else:
            columns, cluster_count = activity_columns, 3

        df_temp = df[columns]
        row_labels = df_temp.columns
        X = utilise.normArray(df_temp.as_matrix())
        kmeans = KMeans(n_clusters=cluster_count, n_init=3000)
        kmeans.fit(X)
        labels = kmeans.labels_

        # Header row: the feature names of this domain.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        plt.figure()

        n_clusters = np.max(labels) + 1
        positions = range(X.shape[1])

        for k in range(n_clusters):
            group = np.array([sample for sample in X[labels == k]])
            meanVec = np.mean(group, axis=0)
            stdVec = np.std(group, axis=0)
            print(stdVec)

            # One worksheet row with the mean vector of this cluster.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # Peel off the largest value twice to obtain the top three.
            firstMax = np.max(meanVec)
            remaining = np.copy(meanVec)
            remaining[remaining == firstMax] = 0
            secondMax = np.max(remaining)
            remaining[remaining == secondMax] = 0
            thirdMax = np.max(remaining)

            plt.plot(positions, meanVec)
            for j in positions:
                value = meanVec[j]
                feature = row_labels[j]
                if value == firstMax or value == secondMax or value == thirdMax:
                    for col, cell in enumerate((k, domain, feature, value)):
                        ws.write(rowW, col, cell)
                    rowW += 1
                    # Same space-separated output as a multi-argument
                    # Python 2 print statement.
                    print('%s %s %s %s %s' % (k, domain, n_clusters, value, feature))
                plt.text(j, value, feature)

        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_NewDataDays_'+str(n_clusters)+'_groupFreq')

    workbookW.save('tempLabels.xls')