def genderAnalysis():
    """Print bike totals per gender, then work/study totals per biking habit.

    Two reports are printed, each made of two groups.  For every group the
    function prints a caption, the sum of one column over the group, a second
    caption, and the number of rows in the group.

    The second report used to be a separate ``bikeAnalysis`` function whose
    ``def`` line was commented out, so its body runs as part of this function
    and fetches the data a second time.
    """

    def _report(group, column, total_caption, count_caption):
        # Caption, column total, caption, row count -- in that order.
        print(total_caption)
        print(sum(group[column]))
        print(count_caption)
        print(group.shape[0])

    # Report 1: bike usage ('transportation3') for gender == 1 and gender == 0.
    df, labels = artificialDataGenerator.artificialData()
    _report(df[df['gender'] == 1], 'transportation3',
            'the total appearance of bike in men is ',
            'the number of men in the dataset is ')
    _report(df[df['gender'] == 0], 'transportation3',
            'the total appearance of bike in women is ',
            'the number of women in the dataset is ')

    # Report 2 (formerly bikeAnalysis): work/study totals for rows with and
    # without bike usage.  The data is requested again, as before.
    df, labels = artificialDataGenerator.artificialData()
    _report(df[df['transportation3'] >= 1], 'workStudy',
            'the total appearance of work/study in people who bike is ',
            'the number of people who bike in the dataset is ')
    _report(df[df['transportation3'] == 0], 'workStudy',
            "the total appearance of work/study in people who not bike is ",
            'the number of people who not bike in the dataset is ')
def sihouetteScoreArtificialData(metric):
    """Print the average silhouette score of K-means for 2 to 6 clusters.

    For every entry of ``Domain`` the matching feature columns are selected
    from the artificial data set, normalised with ``utilise.normArray`` and
    clustered with K-means for each cluster count.  One line per
    (domain, cluster count) pair is printed.

    Args:
        metric: Value echoed at the start of every printed line.  It is only
            used for labelling the output; it is not passed to
            ``silhouette_score``.
    """
    diet_columns = [
        'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP', 'fruitP',
        'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables'
    ]
    activity_columns = [
        'entertainmentRelax', 'others', 'social', 'sport', 'transportation1',
        'transportation2', 'transportation3', 'workStudy'
    ]

    df, cols = artificialDataGenerator.artificialData()
    for domain in Domain:
        print(df.columns)
        columns = diet_columns if domain == 'DietType' else activity_columns
        # `.values` yields the same ndarray as the removed `as_matrix()`.
        X = utilise.normArray(df[columns].values)

        for n_clusters in (2, 3, 4, 5, 6):
            clusterer = KMeans(n_clusters=n_clusters, n_init=300)
            clusterer.fit(X)
            # silhouette_score averages the silhouette value over all samples,
            # summarising density and separation of the formed clusters.
            silhouette_avg = silhouette_score(X, clusterer.labels_)
            # Format explicitly: with the Python 2 print statement a
            # parenthesised argument list is printed as a tuple repr.
            print('%s %s For n_clusters = %s The average silhouette_score is : %s'
                  % (metric, domain, n_clusters, silhouette_avg))
def artificialData():
    """Compute, per feature, how its total splits between label 0 and label 1.

    The artificial data set is reduced to a fixed list of columns, which are
    renamed to short names.  For every column except ``'label'`` the sum of
    its positive values among rows with label 0 (respectively label 1) is
    divided by the sum of the whole column.

    Returns:
        A 4-tuple of dicts keyed by the short column name:
        ``dd`` (``[share_label_0, share_label_1]``), ``dd_low`` (the label-0
        share), ``dd_high`` (the label-1 share) and ``dd_diff`` (largest share
        minus smallest share).
    """
    df = artificialDataGenerator.artificialData()
    print(df.columns)

    # (column in the generated data, short name used in the result)
    renames = [
        ('alcoholD', 'alcohol'),
        ('caffeineD', 'cafe'),
        ('dairyP', 'dairy'),
        ('eggP', 'egg'),
        ('fruitP', 'fruit'),
        ('grainP', 'grain'),
        ('meatP', 'meat'),
        ('seafood', 'seafood'),
        ('snack', 'snack'),
        ('starchyP', 'starchy'),
        ('vegetables', 'vegetables'),
        ('entertainmentRelax', 'leisure'),
        ('social', 'social'),
        ('sport', 'sport'),
        ('transportation1', 'walk'),
        ('transportation2', 'car'),
        ('transportation3', 'bike'),
        ('workStudy', 'workStudy'),
        ('gender', 'gender'),
        ('label', 'label'),
    ]
    df = df[[pair[0] for pair in renames]]
    df.columns = [pair[1] for pair in renames]

    dd, dd_low, dd_high, dd_diff = {}, {}, {}, {}
    for name in df.columns:
        if name == 'label':
            continue
        positive_rows = df[df[name] > 0]
        shares = []
        for label_value in (0, 1):
            in_group = positive_rows[positive_rows['label'] == label_value]
            shares.append(float(sum(in_group[name])) / sum(df[name]))
        dd[name] = shares
        dd_low[name] = shares[0]
        dd_high[name] = shares[1]
        dd_diff[name] = max(shares) - min(shares)
    return dd, dd_low, dd_high, dd_diff
def walkAnalysis():
    """Print caffeine totals for rows with and without walking.

    Rows are split on ``'transportation1'`` (>= 1 versus == 0).  For each
    group the function prints a caption, the sum of ``'caffeineD'`` over the
    group, a second caption, and the number of rows in the group.
    """
    df, labels = artificialDataGenerator.artificialData()
    groups = (
        (df['transportation1'] >= 1,
         'the total appearance of cafe in people who walk is ',
         'the number of people who walk in the dataset is '),
        (df['transportation1'] == 0,
         "the total appearance of cafe in people who not walk is ",
         'the number of people who not walk in the dataset is '),
    )
    for mask, total_caption, count_caption in groups:
        subset = df[mask]
        print(total_caption)
        print(sum(subset['caffeineD']))
        print(count_caption)
        print(subset.shape[0])
def KM_AtificialData():
    """Cluster the artificial data with K-means and save PCA scatter plots.

    For every entry of ``Domain`` the matching feature columns are normalised
    with ``utilise.normArray`` and clustered with K-means for 2 to 6 clusters.
    Each clustering is drawn on a 2-component PCA projection, one marker
    colour per cluster, and saved under
    ``visClustering<domain>Pattern/KMeans_TF_artificial_<n_clusters>``.
    The domain, cluster count, inertia and label vector are printed after
    each run.
    """
    diet_columns = [
        'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP', 'fruitP',
        'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables'
    ]
    activity_columns = [
        'entertainmentRelax', 'others', 'social', 'sport', 'transportation1',
        'transportation2', 'transportation3', 'workStudy'
    ]
    # One marker style per cluster index.  Previously only indices 0-3 had a
    # style, so with 5 or 6 clusters the extra clusters were never drawn.
    marker_styles = ['go', 'ro', 'bo', 'yo', 'co', 'mo']

    df, cols = artificialDataGenerator.artificialData()
    for domain in Domain:
        print(df.columns)
        columns = diet_columns if domain == 'DietType' else activity_columns
        # `.values` yields the same ndarray as the removed `as_matrix()`.
        X = utilise.normArray(df[columns].values)
        # The projection depends only on X, so compute it once per domain
        # instead of once per cluster count.
        reduced_data = PCA(n_components=2).fit_transform(X)

        for n_clusters in (2, 3, 4, 5, 6):
            kmeans = KMeans(n_clusters=n_clusters, n_init=3000)
            kmeans.fit(X)
            labels = kmeans.labels_
            inertia = kmeans.inertia_

            fig = plt.figure()
            for k in range(np.max(labels) + 1):
                members = reduced_data[labels == k]
                # Marker-only format string: one call draws every point of
                # the cluster without connecting lines.
                plt.plot(members[:, 0], members[:, 1],
                         marker_styles[k % len(marker_styles)], markersize=5)
            plt.title('K-means clustering (PCA-reduced data)')
            plt.savefig('visClustering' + domain +
                        'Pattern/KMeans_TF_artificial_' + str(n_clusters))
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)
            print('%s %s %s %s' % (domain, n_clusters, inertia, labels))
def genArtificialActDietTypeDataSet():
    """Turn every row of the artificial data into a tuple of active items.

    A column counts as an item of a row when its value in that row is greater
    than zero.  The columns ``'compositeP'``, ``'sleepTime'``, ``'label'``,
    ``'gender'`` and ``'others'`` are never reported as items.  The number of
    rows is printed before returning.

    Rows are walked positionally, so the result does not depend on the frame
    having a 0..n-1 index (the previous ``.ix[i, j]`` lookup did, and ``.ix``
    no longer exists in current pandas).

    Returns:
        tuple of tuples of column names, one inner tuple per row, in row and
        column order.
    """
    excluded = ('compositeP', 'sleepTime', 'label', 'gender', 'others')

    # NOTE(review): other functions in this file unpack
    # `artificialDataGenerator.artificialData()` into two values; confirm
    # which return shape is current.
    newDF = artificialDataGenerator.artificialData()

    item_columns = [name for name in newDF.columns if name not in excluded]
    # Boolean matrix: True where the row has a positive value for the column.
    is_positive = (newDF[item_columns] > 0).values
    dataset = tuple(
        tuple(name for name, flag in zip(item_columns, row) if flag)
        for row in is_positive
    )
    print(len(dataset))
    return dataset
def genArtificialActDietTypeDataSetForMoreSleep():
    """Turn every label-1 row of the artificial data into a tuple of items.

    Only rows whose ``'label'`` equals 1 are kept.  A column counts as an
    item of a row when its value in that row is greater than zero.  The
    columns ``'compositeP'``, ``'sleepTime'``, ``'label'``, ``'gender'`` and
    ``'others'`` are never reported as items.  The number of kept rows is
    printed before returning.

    Rows are walked positionally, so the filtered frame does not need its
    index rebuilt (the previous ``.ix[i, j]`` lookup required that, and
    ``.ix`` no longer exists in current pandas).

    Returns:
        tuple of tuples of column names, one inner tuple per kept row, in row
        and column order.
    """
    excluded = ('compositeP', 'sleepTime', 'label', 'gender', 'others')

    # NOTE(review): other functions in this file unpack
    # `artificialDataGenerator.artificialData()` into two values; confirm
    # which return shape is current.
    df = artificialDataGenerator.artificialData()
    df = df[df['label'] == 1]

    item_columns = [name for name in df.columns if name not in excluded]
    # Boolean matrix: True where the row has a positive value for the column.
    is_positive = (df[item_columns] > 0).values
    dataset = tuple(
        tuple(name for name, flag in zip(item_columns, row) if flag)
        for row in is_positive
    )
    print(len(dataset))
    return dataset
def visdiff():
    """Save one density plot per new feature comparing three data sets.

    For every column of ``newFeatureFrame()`` a kernel-density estimate of
    that column is drawn together with the estimates of the corresponding
    column in the original data and in the artificial (surrogate) data.  Four
    new-feature names map to differently named source columns; all others are
    looked up under the same name.  Each plot is titled with the source
    column name and saved as ``distribution/<source column>_diff``.
    """
    # New-feature name -> column name in the original / surrogate frames.
    source_names = {
        'walk': 'transportation1',
        'car': 'transportation2',
        'bike': 'transportation3',
        'leisure': 'entertainmentRelax',
    }

    surrogateDF, labels = artificialDataGenerator.artificialData()
    df, cols = artificialDataGenerator.originalData()
    newDF = newFeatureFrame()

    for new_name in newDF.columns:
        fig = plt.figure()
        newDF[new_name].plot.kde(label='new')
        source_name = source_names.get(new_name, new_name)
        df[source_name].plot.kde(label='original')
        surrogateDF[source_name].plot.kde(label='surrogate')
        # Put the legend outside the axes so it does not cover the curves.
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
        plt.title(source_name)
        plt.savefig('distribution/' + source_name + '_diff')
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this one figure per column accumulates.
        plt.close(fig)
temp_df = df[['alcoholD', 'eggP', 'seafood', 'gender', 'bikeWork', 'walkCar']] #temp_df = df[['alcoholD','eggP','seafood','gender']] for i in temp_df.columns: for j in range(temp_df.shape[0]): if temp_df[i][j] > 1: temp_df.set_value(j, i, 1) dataset = temp_df.as_matrix() labels = list(df['label']) clf = LogisticRegression(penalty='l1', C=0.5) scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5) accuracy = scores.mean() print accuracy ''' artificial data test ''' df = artificialDataGenerator.artificialData() temp_df = df[[ 'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP', 'vegetables', 'entertainmentRelax', 'social', 'sport', 'transportation1', 'transportation2', 'transportation3', 'workStudy', 'gender' ]] dataset = temp_df.as_matrix() labels = list(df['label']) clf = LogisticRegression(penalty='l1', C=0.5) scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5) accuracy = scores.mean() print accuracy ''' artificial data test pattern features '''
def clusteringKmeansLabelsArtificialDays():
    """Cluster the artificial data per domain and export cluster profiles.

    For every entry of ``Domain`` the matching feature columns are normalised
    with ``utilise.normArray`` and clustered with K-means (2 clusters for
    ``'DietType'``, 3 otherwise).  Per domain the function

    * writes the feature names as a header row to the worksheet,
    * for each cluster prints the per-feature standard deviation, writes the
      per-feature mean vector as a row and draws it as a line,
    * for each feature whose mean equals one of the three largest distinct
      mean values of the cluster, writes a (cluster, domain, feature, mean)
      row, prints the same information and annotates the plot,
    * saves the figure under
      ``visClustering<domain>Pattern/KMeans__TF_ArtificialDays_<n>_groupFreq``.

    The workbook is saved as ``tempLabels.xls`` at the end.
    """
    diet_columns = ['alcoholD','caffeineD','dairyP','eggP','fruitP','grainP',
                    'meatP','seafood','snack','starchyP','vegetables']
    activity_columns = ['entertainmentRelax','social','sport','transportation1',
                        'transportation2','transportation3','workStudy']
    # Display names used when annotating the plot.
    plot_names = {'transportation1': 'walk',
                  'transportation2': 'car',
                  'transportation3': 'bike'}

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0
    df = artificialDataGenerator.artificialData()

    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            selected, requested_clusters = diet_columns, 2
        else:
            selected, requested_clusters = activity_columns, 3
        df_temp = df[selected]
        row_labels = df_temp.columns
        X = utilise.normArray(df_temp.as_matrix())
        kmeans = KMeans(n_clusters=requested_clusters, n_init=3000)
        kmeans.fit(X)
        labels = kmeans.labels_

        # Header row: one feature name per worksheet column.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        plt.figure()
        n_clusters = np.max(labels) + 1
        n_features = X.shape[1]
        for k in range(n_clusters):
            group = np.array(list(X[labels == k]))
            meanVec = np.mean(group, axis=0)
            stdVec = np.std(group, axis=0)
            print(stdVec)

            # Mean vector of the cluster, one value per worksheet column.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # Three largest distinct mean values: after each maximum is
            # found, every entry equal to it is zeroed before the next search.
            firstMax = np.max(meanVec)
            remaining = np.copy(meanVec)
            remaining[remaining == firstMax] = 0
            secondMax = np.max(remaining)
            remaining = np.copy(remaining)
            remaining[remaining == secondMax] = 0
            thirdMax = np.max(remaining)

            x = range(n_features)
            plt.plot(x, meanVec)

            for j in range(n_features):
                value = meanVec[j]
                if not (value == firstMax or value == secondMax
                        or value == thirdMax):
                    continue
                ws.write(rowW, 0, k)
                ws.write(rowW, 1, domain)
                ws.write(rowW, 2, row_labels[j])
                ws.write(rowW, 3, meanVec[j])
                rowW += 1
                print('%s %s %s %s %s'
                      % (k, domain, n_clusters, meanVec[j], row_labels[j]))
                plt.text(x[j], meanVec[j],
                         plot_names.get(row_labels[j], row_labels[j]))

        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_ArtificialDays_'
                    +str(n_clusters)+'_groupFreq')

    workbookW.save('tempLabels.xls')