-
Notifications
You must be signed in to change notification settings - Fork 0
/
MC_procedure.py
128 lines (100 loc) · 5.61 KB
/
MC_procedure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''Functions to run an object-scale supervised SVM classification with a customized kernel function, using a Monte Carlo procedure.
'''
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#from joblib import Parallel, delayed
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from confusion_matrix import ConfusionMatrix
class MC_SVM:
    """Monte Carlo procedure for supervised classification using SVM and customized kernel function.

    The kernel estimator and an SVC are chained in a pipeline; the pipeline
    parameters are selected by a grid search over a stratified k-fold CV.
    """

    def __init__(self, kernel, n_folds=5):
        """
        Input:
            - kernel (class): estimator of the kernel function
            - n_folds (int): number of folds for the stratified k-folds CV
        """
        self.n_folds = n_folds
        # The step names 'Kernel' and 'svm' identify the two stages of the pipeline.
        self.pipe = Pipeline([('Kernel', kernel), ('svm', SVC()), ])

    def fit_predict(self, Gtr, Ytr, Gt, Yt, grid_search, acc_param="F1Mean", RS=0, VERBOSE=False):
        """Learn the model on training dataset and predict on the testing dataset.

        Input:
            - Gtr (array): training subset
            - Ytr (array): true labels of training subset
            - Gt (array): testing subset
            - Yt (array): true labels of testing subset
            - grid_search (dict): grid search for the CV
            - acc_param (str): accuracy parameter for the cross-validation
              ("F1Mean" selects the 'f1_macro' scoring, any other value selects 'accuracy')
            - RS (int): random seed passed to the stratified k-fold CV
            - VERBOSE (bool): verbose (default: False); also forwarded to the grid search
        Return:
            - confMatrix (object ConfusionMatrix): confusion matrix issued from the classification
            - yp (array): vector of predicted labels of the testing subset
            - grid.best_params_ (dict): combination of parameters that gave the best results during the CV
        TODO: implement own scoring parameter
        """
        score = 'f1_macro' if acc_param == "F1Mean" else 'accuracy'

        # Initialization of the stratified k-fold CV.
        # NOTE(review): random_state is presumably only honoured when shuffle=True;
        # shuffle is not requested here, so RS may have no effect - confirm intent.
        cv = StratifiedKFold(Ytr, n_folds=self.n_folds, random_state=RS)

        # Grid search over the pipeline parameters.
        grid = GridSearchCV(self.pipe, param_grid=grid_search, scoring=score,
                            verbose=VERBOSE, cv=cv, n_jobs=3)
        grid.fit(Gtr, Ytr)
        model = grid.best_estimator_
        if VERBOSE:
            # Call form prints the same text on Python 2 and is valid on Python 3.
            print(grid.best_score_)

        # Learn the selected model on the whole training subset.
        # NOTE(review): GridSearchCV with its default refit setting may already have
        # done this; the explicit fit is kept to preserve the original behaviour.
        model.fit(Gtr, Ytr)

        # Predict the labels of the testing subset.
        yp = model.predict(Gt)

        # Compute the confusion matrix (predicted labels first, true labels second).
        confMatrix = ConfusionMatrix()
        confMatrix.compute_confusion_matrix(yp, Yt)
        return confMatrix, yp, grid.best_params_

    def split_dataset(self, i, G, Y, grid_search, acc, test_prop, RS, VERBOSE):
        """One iteration of the MC procedure (split dataset into training and testing subsets, fit and predict).

        Input:
            - i (int): number of the repetition; used as random seed of the train/test split
            - G (array or list): list of objects
            - Y (array): vector of true labels (also used to stratify the split)
            - grid_search (dict): grid search for the CV
            - acc (str): accuracy parameter for the cross-validation (see fit_predict's acc_param)
            - test_prop (float): proportion of the dataset to use for the testing subset
              (passed as test_size to train_test_split)
            - RS (int): random seed passed to the stratified k-fold CV
            - VERBOSE (bool): verbose
        Return:
            - confMatrix (object ConfusionMatrix): confusion matrix issued from the classification
            - yp (array): vector of predicted labels of the testing subset
            - best_params (dict): combination of parameters that gave the best results during the CV
        """
        # Divide the dataset into training and testing subsets.
        Gtr, Gt, Ytr, Yt = train_test_split(G, Y, test_size=test_prop, random_state=i, stratify=Y)

        # Predict the labels of the testing subset.
        confMatrix, yp, best_params = self.fit_predict(Gtr, Ytr, Gt, Yt, grid_search, acc,
                                                       RS=RS, VERBOSE=VERBOSE)
        return confMatrix, yp, best_params
def mc_procedure(i, G, Y, kernel, grid_search, acc, test_prop, n_folds=3, RS=0, VERBOSE=False):
    """Run a single iteration of the Monte Carlo procedure for supervised classification.

    (Kept as a standalone function so that the iterations of the MC procedure
    can be run in parallel.)

    Input:
        - i (int): number of the repetition for random split of the dataset
        - G (array or list): list of objects
        - Y (array): vector of true labels
        - kernel (class): estimator of the kernel function
        - grid_search (dict): grid search for the CV
        - acc (str): accuracy parameter for the cross-validation
        - test_prop (float): proportion of the dataset to use for the testing subset
        - n_folds (int): number of folds for the stratified k-folds CV (default: 3)
        - RS (int): random seed used for the stratified k-fold CV (default: 0)
        - VERBOSE (bool): verbose (default: False)
    Return:
        - results (tuple): results of the classification (confusion matrix, predicted labels and best parameters)
    """
    # Build the classifier for this repetition, then split, fit and predict in one call.
    return MC_SVM(kernel, n_folds).split_dataset(
        i, G, Y, grid_search, acc, test_prop, RS, VERBOSE
    )