-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaning.py
316 lines (217 loc) · 8.24 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats as ss
import seaborn as sns
from scipy.stats import mstats
import os
import statsmodels.api as sm
from sklearn.decomposition import PCA
def createImagesFolder(directory):
    """Create *directory* (used for saving figures) if it does not exist.

    Uses ``exist_ok=True`` instead of the original ``os.path.exists`` guard,
    which had a check-then-create race condition.
    """
    os.makedirs(directory, exist_ok=True)
def applyPCA(df, components):
    """Fit a PCA with *components* components on the feature columns of *df*
    and print the explained-variance ratio of each component.

    The last column is assumed to be the target (e.g. "quality", consistent
    with the ``h[:-1]`` convention used elsewhere in this file) and is
    excluded from the fit.

    Fixes: the original ``pca.fit(df[:-1])`` dropped the last *row* rather
    than the target column; a dead, no-op ``PCA(...)`` constructor call was
    removed; the Python 2 print statement was converted to the function form.
    """
    pca = PCA(n_components=components)
    # fit on all rows of the feature columns (everything but the target)
    pca.fit(df[df.columns[:-1]])
    print("PCA levels :", pca.explained_variance_ratio_)
def multipleRegression(df):
    """Run OLS regressions against the target (last) column and write all
    statsmodels summaries to figures/RegressionMultipleValues.txt.

    Two passes:
      1. one regression per pair of feature columns as predictors,
      2. one regression using every feature column at once.

    Fixes: the original read ``y`` after the loop, working only because
    Python leaks loop bindings, and closed then reopened the output file in
    append mode mid-function; a single ``with`` block now owns the file.
    """
    cols = df.columns.tolist()
    y = df[cols[-1]]  # target column (e.g. "quality")
    with open("figures/RegressionMultipleValues.txt", "w") as text_file:
        # regression for every pair of feature columns
        for i in range(len(cols)):
            for j in range(i + 1, len(cols) - 1):
                X = sm.add_constant(df[[cols[i], cols[j]]])
                est = sm.OLS(y, X).fit()
                text_file.write(est.summary().as_text())
        # regression using all feature columns together
        Z = sm.add_constant(df[cols[:-1]])
        est2 = sm.OLS(y, Z).fit()
        text_file.write(est2.summary().as_text())
def kruskalWallis(df, alpha):
    """Kruskal-Wallis H-test of each feature column against the target.

    Parameters
    ----------
    df : DataFrame whose last column is the target (e.g. "quality").
    alpha : significance level for rejecting the null hypothesis.

    Prints the H statistic, the p-value and the hypothesis decision for each
    column.  Python 2 print statements converted; the target is now the last
    column rather than a hardcoded "quality" (backward compatible, since the
    caller's last column is "quality").
    """
    print(" Kruskal Wallis H-test test:")
    h = list(df.columns.values)
    target = df[h[-1]].tolist()
    for column in h[:-1]:
        # get the H statistic and the p-value
        H, pval = mstats.kruskalwallis(df[column].tolist(), target)
        print(" H-statistic:", H)
        print(" P-Value:", pval)
        # check the p-value against alpha
        if pval < alpha:
            print("Reject NULL hypothesis - Significant differences exist between ", column, " and quality \n\n")
        else:
            print("Accept NULL hypothesis - No significant difference between ", column, " and quality \n\n")
def isNormalDistribution(df, alpha, shapiro=True):
    """Check column-by-column whether *df* looks normally distributed.

    For each feature column run either the Shapiro-Wilk test (shapiro=True)
    or the d'Agostino & Pearson test (shapiro=False) and print whether the
    normality null hypothesis is rejected at level *alpha*.

    Fixes: the original included the target column despite its own comment
    saying it should be skipped; it always computed both tests even though
    only one was used; the summary message said "Any column follows" when
    every column was rejected ("No column follows" is what the count check
    establishes); Python 2 prints converted.
    """
    print("\nChecking if the columns follow a normal distribution by d'Agostino & Pearson or Shapiro test...\n")
    # feature columns only (skip the target, per the original intent)
    h = list(df.columns.values)[:-1]
    count = 0
    for i in h:
        # compute only the requested test
        if shapiro:
            _, p = ss.shapiro(df[i])
        else:
            _, p = mstats.normaltest(df[i])
        if p < alpha:
            print(" The null hypothesis can be rejected; Column: ", i, "\n")
            count += 1
        else:
            print(" The null hypothesis can not be rejected; Column: ", i, "\n")
    if count == len(h):
        print("\n\n No column follows a normal distribution\n")
def isHomogeneous(df, alpha, levene=True):
    """Check variance homogeneity across all feature columns.

    Runs Levene's test (levene=True) or the Fligner-Killeen test
    (levene=False) over every column except the last (target) one, and
    prints whether the dataset is homogeneous at level *alpha*.

    Fixes: the original hardcoded exactly eleven columns (col1..col11) and
    always computed both tests; this version works for any number of
    feature columns and only runs the requested test.  The printed messages
    are kept byte-identical to the original.
    """
    print("\nChecking if all the columns are homogeneous by Levene or Fligner-Killeen test...\n")
    # one sample per feature column, however many there are
    samples = [df[c].tolist() for c in list(df.columns.values)[:-1]]
    if levene:
        _, p_val = ss.levene(*samples)
        if p_val < alpha:
            print("\n It is not an homegeneous dataset (Levene)\n")
        else:
            print("\n It is an homogeneneous dataset (Levene)\n")
    else:
        _, p = ss.fligner(*samples)
        if p < alpha:
            print("\n It is not an homegeneous dataset (Fligner-Killeen) \n")
        else:
            print("\n It is an homogeneneous dataset (Fligner-Killen)\n")
def regression(df, r):
    """Fit simple linear regressions for every pair of columns.

    For each unordered pair of columns, fit a line with
    ``scipy.stats.linregress``; when the coefficient of determination r**2
    exceeds the threshold *r*, print the line equation, then plot each
    qualifying pair (lmplot on a random sample) and save it under
    figures/lineN.png.

    Fixes: the condition ``rvalue**2 < -r`` was dead code (r**2 is never
    negative) and was removed; the sample size is clamped to the dataset
    length so plotting no longer crashes on datasets with fewer than 30
    rows; Python 2 prints converted.
    """
    h = list(df.columns.values)
    val = []
    # loop over the upper triangle of the (symmetric) pair matrix
    for i in range(len(h)):
        for j in range(i + 1, len(h)):
            # compute line parameters
            slope, intercept, rvalue, p, error = ss.linregress(df[h[i]], df[h[j]])
            # if the correlation is good enough ...
            if rvalue ** 2 > r:
                print("\n*** The line equation is: ", h[j], " =", slope, "*", h[i], " + (", intercept, ") with r=", rvalue ** 2, " ***\n")
                val.append((h[i], h[j]))
    count = 0
    # plotting the lines which are good enough
    for x in val:
        sns.lmplot(x=x[0], y=x[1], data=df.sample(n=min(30, len(df))), scatter=True, fit_reg=True)
        title = "line" + str(count) + ".png"
        plt.savefig('figures/' + title)
        count += 1
        plt.show(block=False)
        plt.clf()
def checkOutliers(df, maxQ, minQ, applyFunction=True, removeOutliers=True):
    """Detect, report and optionally remove/impute outliers per feature column.

    For each feature column (all but the last), compute IQR fences from the
    *maxQ*/*minQ* percentiles (fence = percentile +/- 1.5*IQR), print the
    outlier counts, draw a boxplot to figures/<col>_BoxPlot, and mark
    outliers as NaN.  Rows with any NaN are then dropped (removeOutliers=True)
    or imputed with the column mean (removeOutliers=False).

    Returns the cleaned DataFrame, or *df* unchanged when applyFunction is
    False.

    Fixes: the original mutated the caller's DataFrame in place while
    flagging outliers — we now operate on a copy; repeated ``tolist()``
    conversions were hoisted; Python 2 prints converted.
    """
    h = list(df.columns.values)[:-1]
    if not applyFunction:
        return df
    df = df.copy()  # do not clobber the caller's data
    print("Counting the outliers...")
    print("Column---Number---Outliers Up---Outliers Down")
    for i in h:
        # compute iqr and the upper/lower fences
        values = df[i].tolist()
        iqr = np.percentile(values, maxQ) - np.percentile(values, minQ)
        maxQuantil = np.percentile(values, maxQ) + float(iqr * 1.5)
        minQuantil = np.percentile(values, minQ) - float(iqr * 1.5)
        supOutliers = df[i][df[i] > maxQuantil].count()
        infOutliers = df[i][df[i] < minQuantil].count()
        print(i, "-->", supOutliers + infOutliers, ",", supOutliers, ",", infOutliers)
        # plotting the outliers
        flierprops = dict(markerfacecolor='1.75', markersize=5, linestyle='none')
        sns.boxplot(df[i], flierprops=flierprops)
        plt.savefig('figures/' + str(i) + "_BoxPlot")
        plt.show(block=False)
        plt.clf()
        # converting the outliers to NaN values
        df[i] = df[i][df[i] < maxQuantil]
        df[i] = df[i][df[i] > minQuantil]
    # remove the rows with NaN values
    dp = df.dropna(axis=0, how='any')
    totalOutliers = df[h[0]].count() - dp[h[0]].count()
    print("\n*** The total outliers in the dataset are :" + str(totalOutliers) + " ***\n ")
    # either drop the NaN rows or replace NaN by the column mean
    if removeOutliers:
        return dp
    return df.fillna(df.mean())
def normalizedData(df):
    """Return a new DataFrame with every column min-max scaled to [0, 1]."""
    columns = list(df.columns.values)
    scaled = pd.DataFrame(columns=columns)
    for col in columns:
        # rescale each value: (x - min) / (max - min)
        lo = float(df[col].min())
        span = float(df[col].max()) - lo
        scaled[col] = df[col].map(lambda x, lo=lo, span=span: (x - lo) / span)
    return scaled
def drawNormal(df):
    """Plot, for every feature column, its histogram with a fitted normal
    curve (saved as figures/<col>.png) and a Q-Q plot of the standardized
    values (saved as figures/<col>_Q-Q_plot.png).

    Fix: ``plt.hist(..., normed='True')`` used the *normed* keyword, removed
    from matplotlib 3.x, and passed a string instead of a bool; replaced
    with ``density=True``.
    """
    h = list(df.columns.values)[:-1]
    dp = pd.DataFrame(columns=h)
    # we plot the histograms and the fitted normal curve
    for i in h:
        v = sorted(df[i].tolist())
        fit = ss.norm.pdf(v, np.mean(v), np.std(v))
        plt.plot(v, fit)
        plt.hist(v, density=True, label=i)
        s = i + ".png"
        plt.savefig('figures/' + s)
        plt.show(block=False)
        plt.clf()
        # we also plot the Q-Q graphic of the standardized column
        s = i + "_Q-Q_plot.png"
        mean = np.mean(df[i])
        std = np.std(df[i], ddof=1)
        dp[i] = df[i].map(lambda x: (x - mean) / std)
        ss.probplot(dp[i], plot=plt)
        plt.savefig('figures/' + s)
        plt.show(block=False)
        plt.clf()
if __name__ == "__main__":
    # make sure the output folder for all generated figures exists
    createImagesFolder('figures')
    # load the raw dataset and export its basic descriptive statistics
    df = pd.read_csv("wine.csv", sep=';')
    df.describe().to_csv("wineStatistics.csv")
    # flag and remove outliers using IQR fences at the 75th/25th percentiles
    dc = checkOutliers(df, 75, 25, True, True)
    # min-max scale every column to [0, 1]
    dn = normalizedData(dc)
    # test each column for normality
    isNormalDistribution(dn, 0.05)
    # test the columns for homogeneity of variance
    isHomogeneous(dn, 0.05)
    # Kruskal-Wallis hypothesis test of each feature against quality
    kruskalWallis(dn, 0.05)
    # find and plot pairwise regression lines above the given r**2 threshold
    regression(dn, 0.675)
    # histogram + normal curve + Q-Q plot for every field
    drawNormal(dn)
    # multiple regression for each pair of variables and for all of them
    multipleRegression(dn)
    # export the treated dataset and its statistics
    dn.to_csv("wineTreated.csv")
    temp = dn.describe()
    temp.to_csv('wineTreatedStatistics.csv')
    # uncomment if you want to apply PCA:
    # applyPCA(dn, 2)