/
Wine_data.py
215 lines (165 loc) · 7.06 KB
/
Wine_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, linear_model, svm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn import cluster
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import manifold
from sklearn.neighbors import NearestNeighbors
#convert csv of data set to pandas dataframe`
df = pd.read_csv("winequality-white.csv", sep=';')
df.head()
df.describe()
corr = df.corr()
#plot correlation heatmap between all columns in dataframe
plt.subplots(figsize=(15,10))
ax = plt.axes()
ax.set_title("Wine Characteristic Correlation Heatmap")
sns.heatmap(corr,
xticklabels=corr.columns,
yticklabels=corr.columns,
annot=True,
cmap=sns.diverging_palette(220, 20, as_cmap=True))
#histogram of quality
fig = plt.figure(figsize = (8,8))
plt.hist(df.quality,bins=6,alpha=0.5,histtype='bar',ec='black')
plt.title('Distribution of the Quality')
plt.xlabel('Quality')
plt.ylabel('Count')
#boxplot of quality and alcohol
plt.figure(figsize=(8,5))
sns.boxplot(x='quality',y='alcohol',data=df,palette='GnBu_d')
plt.title("Boxplot of Quality and Alcohol")
plt.show()
cols = ["total sulfur dioxide","free sulfur dioxide","residual sugar","fixed acidity","volatile acidity","alcohol","sulphates","pH","density", "citric acid", "chlorides"]
#separate data into training and test sets and choose state so that results are repeatable
X_train, X_test, y_train, y_test = train_test_split(df[cols], df["quality"], test_size=0.2, random_state=4)
#declare and fit svm with rbf kernel to training set
clf = svm.SVC(kernel="rbf", gamma=1, C=1,
decision_function_shape="ovo")
clf.fit(X_train,y_train)
ypred = clf.predict(X_test)
#get scores for how well the svm performs on test set
print("Accuracy rbf kernel: %.2f"
% clf.score(X_test, y_test))
print("nmi rbf kernel: %.2f"
% normalized_mutual_info_score(y_test, ypred))
print(classification_report(y_test, ypred))
X = StandardScaler().fit_transform(df[cols])
#finding ideal eps for DBSCAN by calculating the distance to the nearest n points for each point,
#sorting and plotting the results. Then we look to see where the change is most pronounce
#and select that as epsilon.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
plt.title("finding optimal eps value for DBSCAN using elbow method")
params = {'n_clusters': 2, 'eps': 2, 'min_samples': 5}
spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
eigen_solver='arpack',
affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'],
min_samples=params['min_samples'])
clustering_algorithms = (
('SpectralClustering', spectral),
('DBSCAN', dbscan),
)
y_pred = {}
for name, algorithm in clustering_algorithms:
y_pred[name] = algorithm.fit_predict(X)
#calculate number of true labels between 1-9 in each cluster and then choose the label which appears
#most frequently as the label of that cluster
avg_quality_spec = [[0, 0, 0, 0, 0, 0, 0, 0 ,0], [0, 0, 0, 0, 0, 0, 0]]
avg_quality_dbscan = [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]
for i, pred in enumerate(y_pred['DBSCAN']):
if pred == 0:
avg_quality_dbscan[0][df['quality'].iloc[i] - 3] += 1
elif pred == -1:
avg_quality_dbscan[1][df['quality'].iloc[i] - 3] += 1
for i, pred in enumerate(y_pred['SpectralClustering']):
if pred == 0:
avg_quality_spec[0][df['quality'].iloc[i] - 3] += 1
elif pred == 1:
avg_quality_spec[1][df['quality'].iloc[i] - 3] += 1
# in all the clusters, the quality of wine that appears the most in 6 so all elements
#in dataframe are labeled as 6
y_pred_ = [6 for i in range(4898)]
print("f1_score: %0.3f"
% f1_score(df['quality'], y_pred_, average='weighted'))
print("accuracy_score: %.03f"
% accuracy_score(df['quality'], y_pred_))
print("Mutual Information: %0.3f"
% metrics.normalized_mutual_info_score(df['quality'], y_pred_))
#for labels in y_pred.values():
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
# % metrics.adjusted_rand_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f"
# % metrics.silhouette_score(X, labels))
#dimension reduction with pca
pca = PCA(n_components=3).fit(X)
new_cor = pca.transform(X)
#plot projection of 3d pca onto two components
fig = plt.figure(figsize=(15,10))
fig.add_subplot(1,3,1)
plt.scatter(new_cor[:,0], new_cor[:,1], marker='x', cmap='jet')
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
fig.add_subplot(1,3,2)
plt.scatter(new_cor[:,0], new_cor[:,2], marker='x', cmap='jet')
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 3")
fig.add_subplot(1,3,3)
plt.scatter(new_cor[:,1], new_cor[:,2], marker='x', cmap='jet')
plt.xlabel("PCA Component 2")
plt.ylabel("PCA Component 3")
#pearson correlation coefficient for all the pca components with quality
corr1 = scipy.stats.pearsonr(new_cor[:,0], df['quality'])
corr2 = scipy.stats.pearsonr(new_cor[:,1], df['quality'])
corr3 = scipy.stats.pearsonr(new_cor[:,2], df['quality'])
print(corr1)
print(corr2)
print(corr3)
#plot information content of each pca component
fig = plt.figure(figsize=(12,6))
plt.bar(np.arange(pca.n_components_), 100*pca.explained_variance_ratio_)
plt.title('Relative information content of PCA components')
plt.xlabel("PCA component number")
plt.ylabel("PCA component variance % ")
#dimension reduction with Isomap
iso = manifold.Isomap(n_neighbors=6, n_components=3).fit(X)
manifold_3Da = iso.transform(X)
#plot projection of 3d isomap onto two components
fig = plt.figure(figsize=(15,10))
fig.add_subplot(1,3,1)
plt.scatter(manifold_3Da[:,0], manifold_3Da[:,1], marker='x', cmap='jet')
plt.xlabel("Isomap Component 1")
plt.ylabel("Isomap Component 2")
fig.add_subplot(1,3,2)
plt.scatter(manifold_3Da[:,0], manifold_3Da[:,2], marker='x', cmap='jet')
plt.xlabel("Isomap Component 1")
plt.ylabel("PCA Component 3")
fig.add_subplot(1,3,3)
plt.scatter(manifold_3Da[:,1], manifold_3Da[:,2], marker='x', cmap='jet')
plt.xlabel("Isomap Component 2")
plt.ylabel("Isomap Component 3")
plt.show()
#pearson correlation coefficient for all the Isomap components with quality
corr1 = scipy.stats.pearsonr(manifold_3Da[:,0], df['quality'])
corr2 = scipy.stats.pearsonr(manifold_3Da[:,1], df['quality'])
corr3 = scipy.stats.pearsonr(manifold_3Da[:,2], df['quality'])
print(corr1)
print(corr2)
print(corr3)