"""Search over specified parameter in parallel on IPython
"""
import os

import numpy as np
from IPython.parallel import Client
from sklearn.externals import joblib
from sklearn.cross_validation import check_cv
from sklearn.base import is_classifier, clone
from sklearn.utils.validation import _num_samples, check_arrays
from sklearn.grid_search import ParameterGrid, BaseSearchCV, _check_param_grid

DATA_FILENAME_TEMPLATE = 'data_%03d.npy'


class OnProgressError(Exception):
    """Raised when results are requested while tasks are still running."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


def get_fullpath(filename):
    return os.path.abspath(os.path.join(os.path.dirname(__file__), filename))


def evaluate(estimator, data_filename, params):
    """Fit `estimator` with `params` on one CV split and return the test score.

    This runs on remote IPython engines, so joblib is re-imported locally;
    mmap_mode='c' memory-maps the arrays in the split copy-on-write.
    """
    from sklearn.externals import joblib
    X_train, y_train, X_test, y_test = joblib.load(data_filename, mmap_mode='c')
    estimator.set_params(**params)
    estimator.fit(X_train, y_train)
    return estimator.score(X_test, y_test)


class IPyGridSearchCV(BaseSearchCV):
    """Grid search over specified parameter values, with each
    (parameter setting, CV split) evaluation dispatched to an IPython
    load-balanced view.
    """

    def __init__(self, estimator, param_grid, dataset_filenames=None,
                 sync=True, scoring=None, loss_func=None, score_func=None,
                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs'):
        super(IPyGridSearchCV, self).__init__(
            estimator, scoring, loss_func, score_func, fit_params, n_jobs,
            iid, refit, cv, verbose, pre_dispatch)
        self.param_grid = param_grid
        self.dataset_filenames = dataset_filenames
        self.sync = sync
        _check_param_grid(param_grid)

    def fit(self, X, y=None, **params):
        return self._fit(X, y, ParameterGrid(self.param_grid))

    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if not self.dataset_filenames:
            self.save_dataset_filename(X, y, cv)
        dataset_filenames = self.dataset_filenames

        client = Client()
        lb_view = client.load_balanced_view()
        if self.verbose > 0:
            # Client.ids is a property, not a method.
            print("Number of engines: %d" % len(client.ids))

        # One asynchronous task per (parameter setting, CV split); each entry
        # pairs the task group with its parameter dict.
        self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename,
                                      params)
                        for dataset_filename in dataset_filenames], params)
                      for params in parameter_iterable]

        if self.sync:
            self.wait()
            self.set_grid_scores()
            self.set_best_score_params()
            if self.refit:
                self.set_best_estimator(estimator, X, y)
        return self

    def save_dataset_filename(self, X, y, cv):
        """Dump each CV split to disk so that engines can load it lazily."""
        dataset_filenames = []
        for i, (train, test) in enumerate(cv):
            cv_fold = ([X[k] for k in train], y[train],
                       [X[k] for k in test], y[test])
            cv_split_filename = get_fullpath(DATA_FILENAME_TEMPLATE % i)
            joblib.dump(cv_fold, cv_split_filename)
            dataset_filenames.append(cv_split_filename)
        self.dataset_filenames = dataset_filenames

    def wait(self):
        """Block until every scheduled task has returned its score."""
        return [task.get() for task_group in self.tasks
                for task in task_group[0]]

    def get_progress(self):
        """Return the fraction of scheduled tasks that have finished."""
        return np.mean([task.ready() for task_group in self.tasks
                        for task in task_group[0]])

    def set_grid_scores(self):
        if self.get_progress() != 1:
            raise OnProgressError('Tasks are still in progress')
        grid_scores = []
        for task_group, params in self.tasks:
            grid_scores.append((np.mean([t.get() for t in task_group]),
                                params))
        self.grid_scores_ = grid_scores

    def set_best_score_params(self):
        # Select on the mean score alone; sorting the (score, params) tuples
        # would also try to compare the parameter dicts on tied scores.
        best = max(self.grid_scores_, key=lambda score_params: score_params[0])
        self.best_score_ = best[0]
        self.best_params_ = best[1]

    def set_best_estimator(self, estimator, X, y):
        # X and y are passed in explicitly because they are local to _fit.
        best_estimator = clone(estimator).set_params(**self.best_params_)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
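

# The CV split files written by save_dataset_filename stay on disk after the
# search. A minimal cleanup sketch; `cleanup_dataset_files` is a hypothetical
# helper, not part of the class above:
def cleanup_dataset_files(search):
    """Remove the CV split files a search dumped next to this script."""
    for filename in (search.dataset_filenames or []):
        if os.path.exists(filename):
            os.remove(filename)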


def create_clf():
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    clf = Pipeline([
        ('vect', TfidfVectorizer(
            token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
        )),
        ('svc', SVC()),
    ])
    return clf


if __name__ == '__main__':
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.cross_validation import KFold

    news = fetch_20newsgroups(subset='all')
    n_samples = 3000
    X, y = news.data[:n_samples], news.target[:n_samples]
    cv = KFold(len(X), 4, shuffle=True, random_state=0)
    params = {'svc__gamma': np.logspace(-2, 1, 4),
              'svc__C': np.logspace(-1, 1, 3)}

    estimator = create_clf()
    ipy_gridsearchcv = IPyGridSearchCV(estimator, params, cv=cv)
    ipy_gridsearchcv.fit(X, y)
    print ipy_gridsearchcv.best_params_, ipy_gridsearchcv.best_score_
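
# Running this script requires IPython engines to be up; with IPython.parallel
# that usually means starting a local cluster first (the engine count below is
# just an example):
#
#     ipcluster start -n 4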