#coding:utf-8
def hehe():
    print "hehe"
def Test01():
    from sklearn import datasets
    iris = datasets.load_iris()         # load the iris data
    digits = datasets.load_digits()
    print digits.images[0]              # this used to be a Chinese comment!!
def Test02():
    from sklearn import datasets
    iris = datasets.load_iris()         # the iris (flower) dataset
    data = iris.data
    #print data.shape
    #print iris.DESCR
    digits = datasets.load_digits()     # the handwritten-digits dataset
    #print digits.images.shape
    import pylab as pl
    pl.imshow(digits.images[-1], cmap = pl.cm.gray_r)
    #pl.show()
    data = digits.images.reshape(digits.images.shape[0], -1)   # each row of data is a 64-dimensional feature vector
    print len(data)
    print len(data[-1])
def supervisedTest01():
    import numpy as np
    from sklearn import datasets
    iris = datasets.load_iris()
    iris_X = iris.data      # iris_X is a 150x4 feature matrix
    iris_Y = iris.target    # iris_Y is a 150x1 label vector
    #print len(iris_X)
    #print len(iris_Y)
    #print np.unique(iris_Y)    # the distinct label values; there are 3 classes here
    np.random.seed(0)
    indices = np.random.permutation(len(iris_X))    # a random permutation of 0-149
    #print indices
    iris_x_train = iris_X[indices[:-10]]    # everything up to the last 10 samples
    iris_y_train = iris_Y[indices[:-10]]    # the labels matching iris_x_train
    iris_x_test = iris_X[indices[-10:]]     # the last 10 samples, used as test data
    iris_y_test = iris_Y[indices[-10:]]     # the labels matching iris_x_test
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               metric_params=None, n_neighbors=3, p=2, weights='uniform')
    knn.fit(iris_x_train, iris_y_train)     # fit simply takes train_x and train_y
    # (the algorithm itself still to be digested -- see the 1-NN sketch below for the core idea)
    print knn.predict(iris_x_test)
    print iris_y_test
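
# Illustrative sketch (not part of the original file): the core idea behind
# KNeighborsClassifier, reduced to a minimal 1-nearest-neighbor predictor written
# directly with numpy. The function name nearest_neighbor_sketch is made up here.
def nearest_neighbor_sketch(train_x, train_y, test_x):
    import numpy as np
    preds = []
    for x in test_x:
        dists = np.sqrt(((train_x - x) ** 2).sum(axis=1))   # Euclidean distance to every training sample
        preds.append(train_y[np.argmin(dists)])             # copy the label of the closest training sample
    return np.array(preds)
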
def supervisedTest02():
    import numpy as np
    from sklearn import datasets
    diabetes = datasets.load_diabetes()
    diabetes_X_train = diabetes.data[:-20]
    diabetes_X_test = diabetes.data[-20:]
    diabetes_Y_train = diabetes.target[:-20]
    diabetes_Y_test = diabetes.target[-20:]
    from sklearn import linear_model
    regr = linear_model.LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
    regr.fit(diabetes_X_train, diabetes_Y_train)
    #print regr.coef_   # diabetes has 10 features, so coef_ has 10 entries; the intercept is stored separately in intercept_
    mean_err = np.mean((regr.predict(diabetes_X_test) - diabetes_Y_test) ** 2)
    score = regr.score(diabetes_X_test, diabetes_Y_test)    # how well the test data is predicted (R^2 score)
    print mean_err
    print score
    print len(diabetes.data)        # number of samples
    print len(diabetes.data[0])     # number of features
def supervisedTest03():
    import numpy as np
    from sklearn import linear_model
    from sklearn import datasets
    diabetes = datasets.load_diabetes()
    diabetes_X_train = diabetes.data[:-20]
    diabetes_X_test = diabetes.data[-20:]
    diabetes_Y_train = diabetes.target[:-20]
    diabetes_Y_test = diabetes.target[-20:]
    X = np.c_[.5, 1].T
    y = [0.5, 1]
    test = np.c_[0, 2].T
    regr = linear_model.LinearRegression()      # plain LinearRegression here
    import pylab as pl
    pl.figure(1)
    np.random.seed(0)
    for _ in range(6):
        this_X = .1 * np.random.normal(size=(2, 1)) + X
        regr.fit(this_X, y)
        pl.plot(test, regr.predict(test))
        pl.scatter(this_X, y, s=3)
    #pl.show()
    regr = linear_model.Ridge(alpha=.1)     # Ridge this time; both live in linear_model
    pl.figure(4)
    np.random.seed(0)
    for _ in range(6):
        this_X = .1 * np.random.normal(size=(2, 1)) + X
        regr.fit(this_X, y)     # calling the estimator's fit function
        pl.plot(test, regr.predict(test))
        pl.scatter(this_X, y, s=3)
    pl.show()
    #alphas = np.logspace(-4, -1, 6)    # 6 values spaced from 1e-4 to 1e-1
    #from __future__ import print_function
    # a standalone version of this alpha sweep is sketched right after this function
    #aaa = [regr.set_params(alpha = alpha).fit(diabetes_X_train, diabetes_Y_train).score(diabetes_X_test, diabetes_Y_test) for alpha in alphas]
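
# Illustrative sketch (not part of the original file): the alpha sweep hinted at by the
# commented-out lines above, written as a standalone function. The name
# ridgeAlphaSweepSketch is made up for illustration.
def ridgeAlphaSweepSketch():
    import numpy as np
    from sklearn import datasets, linear_model
    diabetes = datasets.load_diabetes()
    X_train, X_test = diabetes.data[:-20], diabetes.data[-20:]
    Y_train, Y_test = diabetes.target[:-20], diabetes.target[-20:]
    regr = linear_model.Ridge()
    alphas = np.logspace(-4, -1, 6)     # 6 candidate regularization strengths
    scores = [regr.set_params(alpha=alpha).fit(X_train, Y_train).score(X_test, Y_test)
              for alpha in alphas]
    print zip(alphas, scores)           # the alpha with the highest test score wins
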
def plotTest01():
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from sklearn import datasets
    from sklearn.decomposition import PCA
    iris = datasets.load_iris()
    X = iris.data[:, :2]    # only the first two features
    Y = iris.target
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    plt.figure(2, figsize = (8, 6))
    plt.clf()
    plt.scatter(X[:, 0], X[:, 1], c = Y, cmap = plt.cm.Paired)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    #plt.show()
    fig = plt.figure(1, figsize = (8, 6))
    ax = Axes3D(fig, elev = -150, azim = 110)
    X_reduced = PCA(n_components = 3).fit_transform(iris.data)
    # scatter plot of the first three principal components
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c = Y, cmap = plt.cm.Paired)
    ax.set_title('First three PCA directions')
    ax.set_xlabel('1st eigenvector')
    ax.w_xaxis.set_ticklabels([])
    ax.set_ylabel('2nd eigenvector')
    ax.w_yaxis.set_ticklabels([])
    ax.set_zlabel('3rd eigenvector')
    ax.w_zaxis.set_ticklabels([])
    plt.show()
def DigitClassificationTest01():
    from sklearn import datasets, neighbors, linear_model
    import numpy as np
    digits = datasets.load_digits()
    X_digits = digits.data
    Y_digits = digits.target
    n_samples = len(X_digits)
    n_train = int(.9 * n_samples)   # slice indices must be integers
    X_train = X_digits[:n_train]
    Y_train = Y_digits[:n_train]
    X_test = X_digits[n_train:]
    Y_test = Y_digits[n_train:]
    knn = neighbors.KNeighborsClassifier()
    logistic = linear_model.LogisticRegression()
    knn_score = knn.fit(X_train, Y_train).score(X_test, Y_test)
    logistic_score = logistic.fit(X_train, Y_train).score(X_test, Y_test)
    #print knn_score
    #print logistic_score
    # the accuracy reaches 90%+
    #print len(X_train)     # X_train is a 1617 x 64 matrix
    #print Y_train[1]       # y is the corresponding digit 0-9; a multi-class problem, also trained with a plain fit call
    iris = datasets.load_iris()
    iris_X = iris.data      # iris_X is a 150x4 feature matrix
    iris_Y = iris.target    # iris_Y is a 150x1 label vector
    np.random.seed(0)
    indices = np.random.permutation(len(iris_X))    # a random permutation of 0-149
    iris_x_train = iris_X[indices[:-50]]
    iris_y_train = iris_Y[indices[:-50]]
    iris_x_test = iris_X[indices[-50:]]
    iris_y_test = iris_Y[indices[-50:]]
    from sklearn import svm
    #svc = svm.SVC(kernel = 'linear')   # SVR -- Support Vector Regression, SVC -- Support Vector Classification
    svc = svm.SVC(kernel = 'poly', degree = 3)
    #svc = svm.SVC(kernel = 'rbf')
    svc.fit(iris_x_train, iris_y_train)
    score = svc.score(iris_x_test, iris_y_test)
    print score
def ModelSelectionTest01():
    from sklearn import datasets, svm
    import numpy as np
    digits = datasets.load_digits()
    X_digits = digits.data
    Y_digits = digits.target
    svc = svm.SVC(C = 1, kernel = 'linear')
    score = svc.fit(X_digits[:-100], Y_digits[:-100]).score(X_digits[-100:], Y_digits[-100:])
    #print score
    X_folds = np.array_split(X_digits, 3)
    Y_folds = np.array_split(Y_digits, 3)
    #print len(X_folds[0])
    scores = list()
    for k in range(3):
        X_train = list(X_folds)     # X_folds is a list with 3 elements
        X_test = X_train.pop(k)     # the k-th fold becomes the test set
        X_train = np.concatenate(X_train)   # the remaining folds (train minus test) become the training set
        #print len(X_train)
        Y_train = list(Y_folds)
        Y_test = Y_train.pop(k)
        Y_train = np.concatenate(Y_train)
        scores.append(svc.fit(X_train, Y_train).score(X_test, Y_test))
    #print scores
    from sklearn import cross_validation
    k_fold = cross_validation.KFold(n = 6, n_folds = 3)
    for train_indices, test_indices in k_fold:
        print train_indices, test_indices
    k_fold = cross_validation.KFold(len(X_digits), n_folds = 3)
    scores = [svc.fit(X_digits[train], Y_digits[train]).score(X_digits[test], Y_digits[test]) for train, test in k_fold]
    #print scores
    scores = cross_validation.cross_val_score(svc, X_digits, Y_digits, cv = k_fold, n_jobs = 1)
    #print scores
    from sklearn.grid_search import GridSearchCV
    gammas = np.logspace(-6, -1, 10)
    clf = GridSearchCV(estimator = svc, param_grid = dict(gamma = gammas), n_jobs = 1)
    clf.fit(X_digits[:1000], Y_digits[:1000])
    print clf.best_score_
    print clf.best_estimator_.gamma
    from sklearn import linear_model, datasets
    lasso = linear_model.LassoCV()      # LassoCV picks alpha by cross-validation; plain Lasso uses a fixed alpha (see the sketch after this function)
    diabetes = datasets.load_diabetes()
    X_diabetes = diabetes.data
    Y_diabetes = diabetes.target
    lasso.fit(X_diabetes, Y_diabetes)
    print lasso.alpha_
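
# Illustrative sketch (not part of the original file): the difference the comment above
# asks about. Lasso takes an alpha you fix by hand; LassoCV tries a grid of alphas with
# internal cross-validation and stores the winner in alpha_. The function name
# lassoVsLassoCVSketch is made up for illustration.
def lassoVsLassoCVSketch():
    from sklearn import datasets, linear_model
    diabetes = datasets.load_diabetes()
    X, Y = diabetes.data, diabetes.target
    fixed = linear_model.Lasso(alpha = 0.1).fit(X, Y)   # alpha chosen manually
    tuned = linear_model.LassoCV().fit(X, Y)            # alpha chosen by cross-validation
    print fixed.alpha, tuned.alpha_
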
def unsupervisedLearningTest01():
    from sklearn import cluster, datasets
    iris = datasets.load_iris()
    X_iris = iris.data
    Y_iris = iris.target
    k_means = cluster.KMeans(n_clusters = 3)    # number of k-means centers
    k_means.fit(X_iris)     # fit the data
    print k_means.labels_[::10]
def unsupervisedLearningTest02():
    from sklearn import cluster
    import scipy as sp
    import numpy as np
    try:
        lena = sp.lena()
    except AttributeError:
        from scipy import misc
        lena = misc.lena()
    X = lena.reshape((-1, 1))
    k_means = cluster.KMeans(n_clusters = 5, n_init = 1)
    k_means.fit(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    lena_compressed = np.choose(labels, values)
    lena_compressed.shape = lena.shape
    print lena_compressed
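
# Illustrative sketch (not part of the original file): what np.choose does in the
# compression step above -- each pixel's cluster label indexes into the array of
# cluster-center values. The function name chooseSketch is made up for illustration.
def chooseSketch():
    import numpy as np
    values = np.array([10.0, 200.0, 128.0])     # e.g. three cluster centers
    labels = np.array([0, 2, 1, 0])             # a cluster label per pixel
    print np.choose(labels, values)             # -> [ 10.  128.  200.   10.]
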
def plotTest02():
    import numpy as np
    import scipy as sp
    import matplotlib.pyplot as plt
    from sklearn import cluster
    n_clusters = 5
    np.random.seed(0)
    try:
        lena = sp.lena()
    except AttributeError:
        from scipy import misc
        lena = misc.lena()
    X = lena.reshape((-1, 1))
    k_means = cluster.KMeans(n_clusters = n_clusters, n_init = 4)
    k_means.fit(X)
    values = k_means.cluster_centers_.squeeze()     # the cluster centers
    labels = k_means.labels_
    lena_compressed = np.choose(labels, values)
    lena_compressed.shape = lena.shape
    vmin = lena.min()
    vmax = lena.max()
    # original
    plt.figure(1, figsize = (3, 2.2))
    plt.imshow(lena, cmap = plt.cm.gray, vmin = vmin, vmax = vmax)
    # compressed data
    plt.figure(2, figsize = (3, 2.2))
    plt.imshow(lena_compressed, cmap = plt.cm.gray, vmin = vmin, vmax = vmax)
    # (a few of these functions still deserve a closer look)
    # equal bins lena
    regular_values = np.linspace(0, 256, n_clusters + 1)
    regular_labels = np.searchsorted(regular_values, lena) - 1
    regular_values = 0.5 * (regular_values[1:] + regular_values[:-1])   # bin midpoints
    regular_lena = np.choose(regular_labels.ravel(), regular_values)
    regular_lena.shape = lena.shape
    plt.figure(3, figsize = (3, 2.2))
    plt.imshow(regular_lena, cmap = plt.cm.gray, vmin = vmin, vmax = vmax)
    # histogram
    plt.figure(4, figsize = (3, 2.2))
    plt.clf()
    plt.axes([0.01, 0.01, 0.98, 0.98])
    plt.hist(X, bins = 256, color = '0.5', edgecolor = '.5')
    plt.yticks(())
    plt.xticks(regular_values)
    values = np.sort(values)
    for center_1, center_2 in zip(values[:-1], values[1:]):
        plt.axvline(0.5 * (center_1 + center_2), color = 'b')
    for center_1, center_2 in zip(regular_values[:-1], regular_values[1:]):
        plt.axvline(0.5 * (center_1 + center_2), color = 'b', linestyle = '--')
    plt.show()
def unsupervisedLearningTest03():
    # Connectivity-constrained clustering
    import numpy as np
    import scipy as sp
    import matplotlib.pyplot as plt
    import time
    from sklearn.feature_extraction.image import grid_to_graph
    from sklearn.cluster import AgglomerativeClustering
    from sklearn import cluster, datasets
    lena = sp.misc.lena()
    # Downsample the image by a factor of 4
    lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
    X = np.reshape(lena, (-1, 1))
    # Define the structure A of the data: pixels connected to their neighbors.
    # In other words, turn the image into a graph and work with its connectivity.
    connectivity = grid_to_graph(*lena.shape)
    print "Compute structured hierarchical clustering..."
    st = time.time()
    n_clusters = 15     # number of regions
    ward = AgglomerativeClustering(n_clusters = n_clusters, linkage = 'ward', connectivity = connectivity).fit(X)
    label = np.reshape(ward.labels_, lena.shape)
    print "Elapsed time: " + str(time.time() - st)
    print "Number of pixels: " + str(label.size)
    print "Number of clusters: " + str(np.unique(label).size)
    # Feature agglomeration
    digits = datasets.load_digits()
    images = digits.images
    X = np.reshape(images, (len(images), -1))
    connectivity = grid_to_graph(*images[0].shape)
    agglo = cluster.FeatureAgglomeration(connectivity = connectivity, n_clusters = 32)
    agglo.fit(X)
    X_reduced = agglo.transform(X)
    X_approx = agglo.inverse_transform(X_reduced)
    images_approx = np.reshape(X_approx, images.shape)
def unsupervisedLearningTest04():
    # Principal component analysis
    import numpy as np
    x1 = np.random.normal(size = 100)
    x2 = np.random.normal(size = 100)
    x3 = x1 + x2
    X = np.c_[x1, x2, x3]
    from sklearn import decomposition
    pca = decomposition.PCA()
    pca.fit(X)
    print pca.explained_variance_
    pca.n_components = 2
    X_reduced = pca.fit_transform(X)
    print X_reduced.shape
    # Independent component analysis
    time = np.linspace(0, 10, 2000)
    s1 = np.sin(2 * time)
    s2 = np.sign(np.sin(3 * time))
    S = np.c_[s1, s2]
    S += 0.2 * np.random.normal(size = S.shape)     # note the shape here
    S /= S.std(axis = 0)
    A = np.array([[1, 1], [0.5, 2]])
    X = np.dot(S, A.T)
    ica = decomposition.FastICA()
    S_ = ica.fit_transform(X)
    A_ = ica.mixing_.T
    np.allclose(X, np.dot(S_, A_) + ica.mean_)
def CombinationTest01():
    from sklearn import linear_model, decomposition, datasets
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    import matplotlib.pyplot as plt
    import numpy as np
    logistic = linear_model.LogisticRegression()
    pca = decomposition.PCA()
    pipe = Pipeline(steps = [('pca', pca), ('logistic', logistic)])     # chained fit/transform steps; see the sketch after this function
    digits = datasets.load_digits()
    X_digits = digits.data
    Y_digits = digits.target
    pca.fit(X_digits)
    plt.figure(1, figsize = (4, 3))
    plt.clf()
    plt.axes([0.2, 0.2, 0.7, 0.7])
    plt.plot(pca.explained_variance_, linewidth = 2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_')
    #print len(pca.explained_variance_)     # pca.explained_variance_ is a 64x1 vector (one entry per feature dimension), sorted in decreasing order
    #plt.show()
    # Prediction
    n_components = [20, 40, 64]
    Cs = np.logspace(-4, 4, 3)
    estimator = GridSearchCV(pipe, dict(pca__n_components = n_components, logistic__C = Cs))
    estimator.fit(X_digits, Y_digits)
    plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components, linestyle = ':', label = 'n_components chosen')
    plt.legend(prop = dict(size = 12))
    plt.show()
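
# Illustrative sketch (not part of the original file): roughly what the Pipeline above
# does -- the intermediate step is fit and used to transform the data, and the final
# estimator is fit on the transformed result. The function name
# pipelineEquivalenceSketch is made up for illustration.
def pipelineEquivalenceSketch():
    from sklearn import linear_model, decomposition, datasets
    digits = datasets.load_digits()
    X, Y = digits.data, digits.target
    pca = decomposition.PCA(n_components = 20)
    logistic = linear_model.LogisticRegression()
    X_pca = pca.fit_transform(X)        # step 1: fit the transformer and transform the data
    logistic.fit(X_pca, Y)              # step 2: fit the final estimator on the transformed data
    print logistic.score(pca.transform(X), Y)   # prediction goes through the same transform
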
def lfwTest01():
    from sklearn.datasets import fetch_lfw_people
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    for name in lfw_people.target_names:
        print(name)
def lfwTest02():
    #from __future__ import print_function
    # study the interfaces between the modules involved and their data formats
    from time import time
    import logging
    import matplotlib.pyplot as plt
    from sklearn.cross_validation import train_test_split
    from sklearn.datasets import fetch_lfw_people
    from sklearn.grid_search import GridSearchCV
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.decomposition import RandomizedPCA
    from sklearn.svm import SVC
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)  # min_faces_per_person limits which people are loaded
    #lfw_people = fetch_lfw_people(min_faces_per_person=5, resize=0.4)
    # introspect the images arrays to find the shapes (for plotting)
    n_samples, h, w = lfw_people.images.shape
    X = lfw_people.data
    n_features = X.shape[1]
    # the label to predict is the id of the person
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]
    #print target_names.shape
    #print("Total dataset size:")
    #print("n_samples: %d" % n_samples)
    #print("n_features: %d" % n_features)
    #print("n_classes: %d" % n_classes)
    #print("h: %d" % h)
    #print("w: %d" % w)
    #print lfw_people
    #print target_names
    ###############################################################################
    # Split into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    # X_train is a 966 x 1850 matrix
    print X_train
    print len(X_train)
    print len(X_train[0])
    #singleImage = X[1].reshape(h, w)
    #
    ## display the image
    #plt.imshow(singleImage, cmap = plt.cm.gray_r)
    #plt.show()
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150
    print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)   # PCA finds the principal components
    print("done in %0.3fs" % (time() - t0))
    print pca.components_   # pca.components_ is a 150 x 1850 matrix -- 150 directions, each a linear combination of the original samples (so the feature dimension does not change)
    print len(pca.components_)
    print len(pca.components_[0])
    eigenfaces = pca.components_.reshape((n_components, h, w))     # turn each component back into an image
    # each eigenface can be viewed as a linear superposition of the original faces
    print("Projecting the input data on the eigenfaces orthonormal basis")
    t0 = time()
    X_train_pca = pca.transform(X_train)    # project the original images into the eigenface space (see the sketch after this function)
    X_test_pca = pca.transform(X_test)
    print("done in %0.3fs" % (time() - t0))
    #print len(eigenfaces[0])
    print X_train_pca       # X_train_pca is a 966 x 150 matrix; each of the 150 entries is the projection of a training vector onto the corresponding eigenface
    print len(X_train_pca)  # so the projected values are floats, not ints
    print len(X_train_pca[0])
    #print y_train          # y_train is the label, 0-6 for the class id
    # display an eigenface
    #plt.imshow(eigenfaces[-1], cmap = plt.cm.gray_r)
    #plt.show()
    # Train a SVM classification model
    print("Fitting the classifier to the training set")
    t0 = time()
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)  # classification here, hence SVC rather than SVR
    clf = clf.fit(X_train_pca, y_train)     # train a multi-class SVM on the PCA-transformed data and y_train
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)
    # Quantitative evaluation of the model quality on the test set
    print("Predicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)    # the test data is projected into the same eigenface space
    print("done in %0.3fs" % (time() - t0))
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
    ###############################################################################
    # Qualitative evaluation of the predictions using matplotlib
    def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
        """Helper function to plot a gallery of portraits"""
        plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
        plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
        for i in range(n_row * n_col):
            plt.subplot(n_row, n_col, i + 1)
            plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
            plt.title(titles[i], size=12)
            plt.xticks(())
            plt.yticks(())
    # plot the result of the prediction on a portion of the test set
    def title(y_pred, y_test, target_names, i):
        pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
        true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
        return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
    prediction_titles = [title(y_pred, y_test, target_names, i)
                         for i in range(y_pred.shape[0])]
    plot_gallery(X_test, prediction_titles, h, w)
    # plot the gallery of the most significative eigenfaces
    eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
    plot_gallery(eigenfaces, eigenface_titles, h, w)
    plt.show()
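
# Illustrative sketch (not part of the original file): the eigenface projection
# described above in miniature -- pca.transform gives the coordinates of a face in
# eigenface space, and pca.inverse_transform rebuilds an approximation of the face
# from those coordinates. The function name eigenfaceProjectionSketch is made up here.
def eigenfaceProjectionSketch():
    from sklearn.datasets import fetch_lfw_people
    from sklearn.decomposition import RandomizedPCA
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    X = lfw_people.data
    pca = RandomizedPCA(n_components=150, whiten=True).fit(X)
    coords = pca.transform(X[:1])               # 150 coordinates for the first face
    approx = pca.inverse_transform(coords)      # approximate face rebuilt from the 150 eigenfaces
    print coords.shape, approx.shape            # (1, 150) and (1, n_features)
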
def face_completion_Test01():
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import fetch_olivetti_faces
    from sklearn.utils.validation import check_random_state
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import RidgeCV
    # load the faces dataset
    data = fetch_olivetti_faces()
    targets = data.target
    #print len(data.data)
    #print len(data.data[0])    # data.data is 400 x 4096; each row is a ravelled 64x64 image, so it looks different from the original picture
    #face = data.data[1].reshape(64, 64)    # note the difference between data and images
    #face = data.images[1]
    #face_ccw_90 = zip(*face)[::-1]
    #face_cw_90 = zip(*face[::-1])
    #plt.imshow(face_cw_90, cmap = plt.cm.gray_r)
    #plt.show()
    # to predict left half from right half instead, rotate the images 90 degrees first:
    #for i in range(len(data.images)):
    #    face = data.images[i]
    #    data.images[i] = face_cw_90 = zip(*face[::-1])
    #print data.images[0]
    data = data.images.reshape((len(data.images), -1))      # equivalent to data.data: each image flattened into a row vector
    #print len(data[0])
    train = data[targets < 30]
    test = data[targets >= 30]      # the test faces come from people not seen during training
    n_faces = 5
    rng = check_random_state(4)
    # test.shape == (100, 4096)
    face_ids = rng.randint(test.shape[0], size = (n_faces, ))      # pick 5 random indices from 0-99
    test = test[face_ids, :]
    #print face_ids
    n_pixels = data.shape[1]
    half = int(np.ceil(0.5 * n_pixels))     # slice indices must be integers
    X_train = train[:, :half]       # upper half of the faces
    Y_train = train[:, half:]       # lower half of the faces
    X_test = test[:, :half]         # predict the lower half from the upper half -- a multi-output problem; train and test have the same dimensions
    Y_test = test[:, half:]
    # completion is a regression problem, not classification
    # ESTIMATORS is a dict of name -> estimator
    ESTIMATORS = {
        "Extra trees": ExtraTreesRegressor(n_estimators = 10, max_features = 32, random_state = 0),
        "k-nn": KNeighborsRegressor(),
        "Linear regression": LinearRegression(),
        "Ridge": RidgeCV(),
    }
    # fit and predict with each estimator
    print "start fitting and predicting"
    y_test_predict = dict()
    for name, estimator in ESTIMATORS.items():
        estimator.fit(X_train, Y_train)
        y_test_predict[name] = estimator.predict(X_test)
    print "start plotting"
    # plot the completed faces
    image_shape = (64, 64)
    n_cols = 1 + len(ESTIMATORS)
    plt.figure(figsize = (2.0 * n_cols, 2.26 * n_faces))
    plt.suptitle("Face completion with multi-output estimators GoGoGo", size = 16)
    for i in range(n_faces):
        true_face = np.hstack((X_test[i], Y_test[i]))
        if i:
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
        else:
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, title = "true faces")
        sub.axis("off")
        sub.imshow(true_face.reshape(image_shape), cmap = plt.cm.gray, interpolation = "nearest")
        #a = true_face.reshape(image_shape)
        #sub.imshow(zip(*a)[::-1], cmap = plt.cm.gray, interpolation = "nearest")
        for j, est in enumerate(sorted(ESTIMATORS)):
            completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
            if i:
                sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)
            else:
                sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j, title = est)
            sub.axis("off")
            sub.imshow(completed_face.reshape(image_shape), cmap = plt.cm.gray, interpolation = "nearest")
            #b = completed_face.reshape(image_shape)
            #sub.imshow(zip(*b)[::-1], cmap = plt.cm.gray, interpolation = "nearest")
    plt.show()
def rotateTest():
    m = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    print zip(*m)[::-1]     # counter clock-wise
    print zip(*m[::-1])     # clock-wise
def OnlineLearningTest01():
    import time
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn import datasets
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.image import extract_patches_2d
    faces = datasets.fetch_olivetti_faces()
    print "Learning the dictionary..."
    rng = np.random.RandomState(0)
    kmeans = MiniBatchKMeans(n_clusters = 81, random_state = rng, verbose = True)
    patch_size = (20, 20)
    buffer = []
    index = 1
    t0 = time.time()
    # Online learning
    index = 0
    for _ in range(6):
        for img in faces.images:
            data = extract_patches_2d(img, patch_size, max_patches = 50, random_state = rng)
            data = np.reshape(data, (len(data), -1))
            buffer.append(data)
            index += 1
            if index % 10 == 0:
                data = np.concatenate(buffer, axis = 0)     # stack the buffered patch arrays into one matrix
                # standardize before fitting
                data -= np.mean(data, axis = 0)
                data /= np.std(data, axis = 0)
                kmeans.partial_fit(data)    # each chunk is learned with partial_fit
                buffer = []
            if index % 100 == 0:
                print "Partial fit of %4i out of %i" % (index, 6 * len(faces.images))
    dt = time.time() - t0
    print "done in %.2fs. " % dt
    # plot the learned patches
    plt.figure(figsize = (4.2, 4))
    for i, patch in enumerate(kmeans.cluster_centers_):
        plt.subplot(9, 9, i + 1)
        plt.imshow(patch.reshape(patch_size), cmap = plt.cm.gray, interpolation = "nearest")
        plt.xticks(())
        plt.yticks(())
    plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches' % (dt, 6 * len(faces.images)), fontsize = 16)
    plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
    plt.show()
def Test03():
    import numpy as np
    a = [3, 4]
    b = [1, 2]
    data = []
    data.append(a)
    data.append(b)
    print data
    print np.concatenate(data, axis = 0)
    a = np.array([[1], [2], [3]])
    b = np.array([[2], [3], [4]])
    print a
    print b
    print np.hstack((a, b, a))
    print np.vstack((a, b, a))
def RBMtest01():
    # use an RBM for non-linear feature extraction;
    # compared with plain logistic regression, RBM features can improve classification accuracy
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.ndimage import convolve
    from sklearn import linear_model, datasets, metrics
    from sklearn.cross_validation import train_test_split
    from sklearn.neural_network import BernoulliRBM
    from sklearn.pipeline import Pipeline
    def nudge_dataset(X, Y):
        direction_vectors = [
            [[0, 1, 0],
             [0, 0, 0],
             [0, 0, 0]],
            [[0, 0, 0],
             [1, 0, 0],
             [0, 0, 0]],
            [[0, 0, 0],
             [0, 0, 1],
             [0, 0, 0]],
            [[0, 0, 0],
             [0, 0, 0],
             [0, 1, 0]]
        ]
        shift = lambda x, w: convolve(x.reshape((8, 8)), mode = 'constant', weights = w).ravel()
        X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors])
        Y = np.concatenate([Y for _ in range(5)], axis = 0)
        return X, Y
    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')      # just a dtype conversion, list to float32 array
    X, Y = nudge_dataset(X, digits.target)      # enlarge the dataset to 5x by shifting each digit in 4 directions
    #print np.max(X, 0)
    #print np.min(X, 0)
    X = (X - np.min(X, 0)) / (np.max(X, 0) - np.min(X, 0) + 0.0001)     # 0-1 scaling, done per feature
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
    print set(Y_train)
    #'''
    # build the models
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state = 0, verbose = True)
    # the pipeline chains successive fit/transform steps;
    # the RBM's transform outputs latent representations of the data
    classifier = Pipeline(steps = [('rbm', rbm), ('logistic', logistic)])
    # Training
    # these hyper-parameters were chosen by cross-validation (GridSearchCV)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100      # the RBM learns 100 hidden features
    logistic.C = 6000
    #rbm.fit(X_train, Y_train)
    rbm.fit(X_train)