def newData():
    """Compute, per feature, the share of its total value held by each label.

    Loads the feature frame from ``newDataProcess.newFeatureFrame()``, keeps
    the diet/activity/gender columns plus ``label`` and renames them to short
    names.  For every feature column the rows with a positive value are split
    by ``label`` (0 and 1) and the summed feature value of each split is
    divided by the column's overall sum.

    Returns:
        A 4-tuple ``(dd, dd_low, dd_high, dd_diff)`` of dicts keyed by the
        short feature name:

        * ``dd[name]``      -- ``[share for label 0, share for label 1]``
        * ``dd_low[name]``  -- share for label 0
        * ``dd_high[name]`` -- share for label 1
        * ``dd_diff[name]`` -- ``max(dd[name]) - min(dd[name])``

        A feature whose column sums to zero gets shares of ``0.0``.
    """
    df = newDataProcess.newFeatureFrame()
    print(df.columns)
    df = df[[
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP',
        'meatP', 'seafood', 'snack', 'starchyP', 'vegetables', 'leisure',
        'social', 'sport', 'walk', 'car', 'bike', 'workStudy', 'gender',
        'label'
    ]]
    df.columns = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat',
        'seafood', 'snack', 'starchy', 'vegetables', 'leisure', 'social',
        'sport', 'walk', 'car', 'bike', 'workStudy', 'gender', 'label'
    ]

    dd = {}
    dd_low = {}
    dd_high = {}
    dd_diff = {}
    for feature in df.columns:
        if feature == 'label':
            continue

        # Force a float denominator: with integer-typed columns the plain
        # ``/`` would floor the share to 0 or 1 under Python 2.
        total = float(sum(df[feature]))
        if total == 0:
            # Nothing to distribute; mirror the zero-total guard used by
            # newDataSubSupport instead of dividing by zero.
            shares = [0.0, 0.0]
        else:
            positive = df[df[feature] > 0]
            shares = [
                sum(positive[positive['label'] == 0][feature]) / total,
                sum(positive[positive['label'] == 1][feature]) / total,
            ]

        dd[feature] = shares
        dd_low[feature] = shares[0]
        dd_high[feature] = shares[1]
        dd_diff[feature] = max(shares) - min(shares)
    return dd, dd_low, dd_high, dd_diff
def newDataSubSupport(n_subjects=30):
    """Average, over subjects, each feature's per-label share of its total.

    Loads the feature frame from ``newDataProcess.newFeatureFrame()``, keeps
    the diet/activity/gender columns plus ``label`` and ``ID`` and renames
    them to short names.  For each subject ID in ``range(n_subjects)`` and
    each feature, the summed feature value of the positive rows with
    ``label == 0`` (and ``label == 1``) is divided by the subject's total for
    that feature.  Subjects whose total for a feature is zero contribute
    nothing to that feature.  The per-subject shares are then averaged.

    Args:
        n_subjects: Number of subject IDs to scan, starting at 0.  Defaults
            to 30, the value that was previously hard-coded.

    Returns:
        A 2-tuple ``(dd_low, dd_high)`` of dicts keyed by the short feature
        name, holding the mean label-0 share and the mean label-1 share.  A
        feature with no contributing subject maps to ``0.0``.
    """
    df = newDataProcess.newFeatureFrame()
    print(df.columns)
    # Copy so the column rename and ID conversion below act on our own frame
    # rather than on a view of the loaded one.
    df = df[[
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP',
        'meatP', 'seafood', 'snack', 'starchyP', 'vegetables', 'leisure',
        'social', 'sport', 'walk', 'car', 'bike', 'workStudy', 'gender',
        'label', 'ID'
    ]].copy()
    df.columns = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat',
        'seafood', 'snack', 'starchy', 'vegetables', 'leisure', 'social',
        'sport', 'walk', 'car', 'bike', 'workStudy', 'gender', 'label', 'ID'
    ]
    # One vectorised cast replaces the per-row set_value loop, which relied
    # on a default 0..n-1 index and on an API newer pandas no longer ships.
    df['ID'] = df['ID'].astype(int)

    features = [name for name in df.columns if name not in ('label', 'ID')]
    dd_low_list = {name: [] for name in features}
    dd_high_list = {name: [] for name in features}

    for subject_id in range(n_subjects):
        df_sub = df[df['ID'] == subject_id]
        for feature in features:
            # Float denominator avoids Python 2 integer floor division.
            total = float(sum(df_sub[feature]))
            if total == 0:
                continue
            positive = df_sub[df_sub[feature] > 0]
            dd_low_list[feature].append(
                sum(positive[positive['label'] == 0][feature]) / total)
            dd_high_list[feature].append(
                sum(positive[positive['label'] == 1][feature]) / total)

    dd_low = {}
    dd_high = {}
    for feature in features:
        lows = dd_low_list[feature]
        highs = dd_high_list[feature]
        # Both lists are filled together, so one emptiness check per list is
        # enough to avoid dividing by a zero length.
        dd_low[feature] = sum(lows) / len(lows) if lows else 0.0
        dd_high[feature] = sum(highs) / len(highs) if highs else 0.0
    return dd_low, dd_high
def genNewActDietTypeDataSet():
    """Build one tuple of "present" feature names per row of the feature frame.

    Loads the frame from ``newDataProcess.newFeatureFrame()``.  For every row,
    collects the names of the columns whose value is greater than zero,
    ignoring the columns listed in ``excluded`` below.  Prints the number of
    rows processed.

    Returns:
        A tuple with one entry per row, in row order; each entry is a tuple
        of column names (in frame column order) whose value in that row is
        positive.
    """
    excluded = frozenset([
        'grainP', 'ID', 'gender', 'age', 'label', 'vegetables', 'meatP',
        'dairyP', 'caffeineD', 'snack',
    ])
    newDF = newDataProcess.newFeatureFrame()
    kept = [column for column in newDF.columns if column not in excluded]

    # Evaluate the "> 0" test for the whole frame at once and walk the result
    # positionally.  This avoids per-cell ``.ix`` lookups, which treat the row
    # number as an index label and therefore only work on a 0..n-1 index.
    present = (newDF[kept] > 0).values

    dataset = tuple(
        tuple(column for column, hit in zip(kept, row) if hit)
        for row in present
    )
    print(len(dataset))
    return dataset
def genNewActDietTypeDataSetForMoreSleep():
    """Build per-row tuples of "present" feature names for rows with label 1.

    Loads the frame from ``newDataProcess.newFeatureFrame()``, keeps only the
    rows whose ``label`` equals 1 and renumbers them 0..n-1.  For each
    remaining row, collects the names of the columns whose value is greater
    than zero, ignoring the columns listed in ``ignored`` below.  Prints the
    number of rows processed.

    Returns:
        A tuple with one entry per kept row; each entry is a tuple of column
        names (in frame column order) whose value in that row is positive.
    """
    ignored = (
        'grainP', 'ID', 'gender', 'age', 'label', 'vegetables', 'meatP',
        'dairyP', 'caffeineD', 'snack',
    )

    frame = newDataProcess.newFeatureFrame()
    frame = frame[frame['label'] == 1]
    row_count = frame.shape[0]
    # Renumber so the row counter below can be used as the index label.
    frame.index = range(row_count)

    usable = [name for name in frame.columns if name not in ignored]
    dataset = tuple(
        tuple(name for name in usable if frame[name][row] > 0)
        for row in range(row_count)
    )
    print(len(dataset))
    return dataset
temp_df = df[['alcoholD', 'eggP', 'seafood', 'gender']] #temp_df = df[['alcoholD','eggP','seafood','gender']] for i in temp_df.columns: for j in range(temp_df.shape[0]): if temp_df[i][j] > 1: temp_df.set_value(j, i, 1) dataset = temp_df.as_matrix() labels = list(df['label']) clf = LogisticRegression(penalty='l1', C=0.5) scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5) accuracy = scores.mean() print accuracy ''' new data test ''' df = newDataProcess.newFeatureFrame() temp_df = df[[ 'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables', 'leisure', 'social', 'sport', 'walk', 'car', 'bike', 'workStudy', 'gender' ]] dataset = temp_df.as_matrix() labels = list(df['label']) clf = LogisticRegression(penalty='l1', C=0.5) scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5) accuracy = scores.mean() print accuracy ''' new data test ''' df = newDataProcess.newFeatureFrame()
def clusteringKmeansLabelsNewDays():
    """Cluster the feature frame per domain with K-means and export the result.

    For every entry of the module-level ``Domain`` sequence:

    * ``'DietType'`` selects the diet columns and uses 2 clusters; any other
      domain selects the activity columns and uses 3 clusters.
    * The selected columns are normalised with ``utilise.normArray`` and
      clustered with ``KMeans`` (``n_init=3000``).
    * The column names are written as one worksheet row.  Then, per cluster,
      the mean vector is written as one row, followed by one row
      ``(cluster, domain, column name, mean value)`` for every column whose
      mean equals one of the cluster's three largest distinct mean values.
    * Each cluster's mean vector is plotted, the selected columns are
      annotated on the plot, and the figure is saved under
      ``visClustering<domain>Pattern/``.

    The workbook is saved to ``tempLabels.xls``.  Returns ``None``.

    NOTE(review): the "top three" are found by zeroing entries equal to the
    current maximum, so ties select more than three columns and the logic
    presumes non-negative means -- confirm against ``utilise.normArray``.
    """
    diet_columns = [
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP',
        'meatP', 'seafood', 'snack', 'starchyP', 'vegetables'
    ]
    activity_columns = [
        'leisure', 'social', 'sport', 'walk', 'car', 'bike', 'workStudy'
    ]

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0  # next free worksheet row, shared across all domains
    df = newDataProcess.newFeatureFrame()

    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            columns, cluster_count = diet_columns, 2
        else:
            columns, cluster_count = activity_columns, 3

        df_temp = df[columns]
        row_labels = df_temp.columns
        # ``.values`` yields the same array as the removed ``as_matrix()``.
        X = utilise.normArray(df_temp.values)
        kmeans = KMeans(n_clusters=cluster_count, n_init=3000)
        kmeans.fit(X)
        labels = kmeans.labels_

        # Header row: the column names of this domain.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        plt.figure()
        n_clusters = np.max(labels) + 1
        n_features = X.shape[1]
        x = list(range(n_features))

        for k in range(n_clusters):
            group = np.asarray(X[labels == k])
            meanVec = np.mean(group, axis=0)
            stdVec = np.std(group, axis=0)
            print(stdVec)

            # One row holding the mean vector of this cluster.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # Three largest distinct mean values: repeatedly take the maximum
            # and zero every entry equal to it.
            firstMax = np.max(meanVec)
            remaining = np.copy(meanVec)
            remaining[remaining == firstMax] = 0
            secondMax = np.max(remaining)
            remaining[remaining == secondMax] = 0
            thirdMax = np.max(remaining)

            plt.plot(x, meanVec)
            for j in range(n_features):
                value = meanVec[j]
                if (value == firstMax or value == secondMax
                        or value == thirdMax):
                    ws.write(rowW, 0, k)
                    ws.write(rowW, 1, domain)
                    ws.write(rowW, 2, row_labels[j])
                    ws.write(rowW, 3, value)
                    rowW += 1
                    print('%s %s %s %s %s' % (
                        k, domain, n_clusters, value, row_labels[j]))
                    plt.text(x[j], value, row_labels[j])

        plt.title(domain + '_TF_KMeans_' + str(n_clusters))
        plt.savefig('visClustering' + domain
                    + 'Pattern/KMeans__TF_NewDataDays_'
                    + str(n_clusters) + '_groupFreq')
        # Release the figure once it is on disk so figures do not pile up
        # across domains.
        plt.close()

    workbookW.save('tempLabels.xls')