# parameter_learn2.py
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause
######################################################
# PRUNING HYPERPARAMETERS FOR ONE VS REST CLASSIFIER #
######################################################
from __future__ import print_function
from pprint import pprint
from time import time
import logging
from scipy.stats import randint as sp_randint
import numpy as np
import pickle
import util2, classifier2, movie_genre, os, sys
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
# Show progress logs while the search runs: INFO level and above, each
# record prefixed with a timestamp and its level name.  No stream is passed,
# so basicConfig falls back to its default stream handler (stderr).
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
# Candidate values for every hyperparameter explored by the search below.
# Keys follow scikit-learn's nested "<component>__<parameter>" naming.
# Enabling more of the commented-out entries widens the exploration, but the
# number of combinations (and the processing time) grows combinatorially.
parameters = {
    # 'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # 'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'union__use_idf': (True, False),
    # 'union__norm': ('l1', 'l2'),

    # Relative weights of the 'colors' and 'subs' feature branches.
    'estimator__union__transformer_weights': tuple(
        {'colors': colors_weight, 'subs': subs_weight}
        for colors_weight, subs_weight in (
            (0.35, 0.65),
            (0.5, 0.5),
            (0.2, 0.8),
        )
    ),
    'estimator__union__subs__tfidf__max_features': (3, 5, 10, 15),
    'estimator__sgd__alpha': (1e-3, 1e-4, 1e-5, 1e-6),
    'estimator__sgd__loss': (
        'hinge',
        'log',
        'modified_huber',
        'squared_hinge',
        'perceptron',
    ),
    'estimator__sgd__penalty': ('l2', 'elasticnet'),
    'estimator__sgd__n_iter': (10, 20, 50, 80, 100, 150),
}
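# Disabled alternative: the same search space, but with some entries drawn
# from scipy distributions (sp_randint) instead of fixed candidate tuples.
# parameters = {
# #'vect__max_df': (0.5, 0.75, 1.0),
# #'vect__max_features': (None, 5000, 10000, 50000),
# # 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
# #'union__use_idf': (True, False),
# #'union__norm': ('l1', 'l2'),
# 'estimator__union__transformer_weights':({'colors': 0.35,'subs': 0.65}, {'colors': 0.5,'subs': 0.5},
# {'colors': 0.2,'subs': 0.8}),
# 'estimator__union__subs__tfidf__max_features': sp_randint(3, 10),
# # NOTE(review): sp_randint is a discrete integer distribution and expects
# # low < high, so the alpha entry below looks unusable as written; a
# # continuous distribution would be needed here -- confirm before enabling.
# 'estimator__sgd__alpha': sp_randint(0.00001, 0.000001),
# 'estimator__sgd__loss': ('hinge', 'modified_huber', 'squared_hinge', 'perceptron'),
# 'estimator__sgd__penalty': ('l2', 'elasticnet'),
# 'estimator__sgd__n_iter': sp_randint(10, 80),
# }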
if __name__ == "__main__":
    # The search is launched with n_jobs=-1; multiprocessing requires the
    # fork to happen inside a __main__-protected block, hence this guard.

    # Repeat the experiment 20 times.  Each round builds a fresh predictor
    # and looks for the best parameters of both the feature extraction and
    # the classifier.
    for _ in range(20):
        predictor = classifier2.predictor()
        print(predictor.trainExamples)

        # Randomized search over the module-level `parameters` candidates.
        search = RandomizedSearchCV(
            predictor.classif,
            parameters,
            error_score=0,
            verbose=1,
            n_jobs=-1,
        )

        print("Performing grid search...")
        print("parameters:")
        pprint(parameters)

        # Building the label matrix is part of the timed section, exactly as
        # it is when it is computed inline in the fit() call.
        started = time()
        genres = util2.getCorrectGenres(predictor.trainExamples)
        targets = predictor.mlb.fit_transform(genres)
        search.fit(predictor.trainExamples, targets)
        elapsed = time() - started
        print("done in %0.3fs" % elapsed)
        print()

        # Report the winning score, then only the parameters that were
        # actually part of the search, in alphabetical order.
        print("Best score: %0.3f" % search.best_score_)
        print("Best parameters set:")
        chosen = search.best_estimator_.get_params()
        for name in sorted(parameters):
            print("\t%s: %r" % (name, chosen[name]))
        print("###########################################################################")