Example #1
def test_grid_search_sparse_score_func():
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
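For comparison, here is a minimal sketch of the same dense-versus-sparse check against the current scikit-learn API. This is an assumption on my part, not part of the original snippet: it presumes scikit-learn >= 1.0, where score_func= became scoring=, the fitted attribute is best_estimator_, and LinearSVC accepts CSR input directly, so the separate SparseLinearSVC class is no longer needed.

import numpy as np
import scipy.sparse as sp
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=100, random_state=0)

# Dense fit
cv = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]}, scoring='f1')
cv.fit(X[:180], y[:180])
y_pred = cv.predict(X[180:])
C = cv.best_estimator_.C

# Sparse fit: the same estimator class handles CSR matrices directly
X_sp = sp.csr_matrix(X)
cv = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]}, scoring='f1')
cv.fit(X_sp[:180], y[:180])
y_pred2 = cv.predict(X_sp[180:])
C2 = cv.best_estimator_.C

# As in Example #2 below, a high agreement rate is a safer check than exact equality
assert np.mean(y_pred == y_pred2) >= .9
assert C == C2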
Example #2
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C':[0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C':[0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
Example #4
def test_grid_search_sparse_score_func():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.set_params(refit=False).fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.set_params(refit=False).fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
Example #5
def test_grid_search_sparse_score_func():
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
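Examples #4 and #5 both work around an old refit bug by passing refit=False. A sketch of how that option behaves in the current API (an assumption: scikit-learn >= 1.0): with refit=False the search exposes best_params_ but no predict() or best_estimator_, so you refit a fresh estimator yourself.

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=100, random_state=0)

cv = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]}, scoring='f1', refit=False)
cv.fit(X[:180], y[:180])

best_C = cv.best_params_['C']  # available without refitting (single-metric scoring)
clf = LinearSVC(C=best_C).fit(X[:180], y[:180])
y_pred = clf.predict(X[180:])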
Example #6
def train_svm_crossvalidated(X, y,
                             tuned_parameters={'kernel': ['rbf'],
                                               'gamma': 2.0 ** np.arange(-15, 3),
                                               'C': 2.0 ** np.arange(-5, 15)}):
    """
    Perform grid search with stratified K-fold cross-validation on
    observations X with true labels y, and return the tuned classifier clf.
    """

    k_fold = _size_dependent_k_split(np.size(X,0))

    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=recall_score)
    clf.fit(X, y, cv=StratifiedKFold(y, k_fold))

    y_true, y_pred = y, clf.predict(X)

    #print "Classification report for the best estimator: "
    #print clf.best_estimator
    print "Tuned with optimal value: %0.3f" % recall_score(y_true, y_pred)
    
    return clf
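A rough modern equivalent of train_svm_crossvalidated (assumptions on my part: scikit-learn >= 0.18, and the unshown _size_dependent_k_split helper replaced by a fixed k for illustration). The CV splitter is now passed to the GridSearchCV constructor rather than to fit(), and recall is requested with scoring='recall'.

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

def train_svm_crossvalidated(X, y, k=5):
    tuned_parameters = {'kernel': ['rbf'],
                        'gamma': 2.0 ** np.arange(-15, 3),
                        'C': 2.0 ** np.arange(-5, 15)}
    # the splitter no longer takes y at construction; it receives it in fit()
    clf = GridSearchCV(SVC(), tuned_parameters, scoring='recall',
                       cv=StratifiedKFold(n_splits=k))
    clf.fit(X, y)
    return clf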
Example #7
X_test_pca = pca.transform(X_test)


# Train a SVM classification model

print "Fitting the classifier to the training set"
param_grid = {"C": [1, 5, 10, 100], "gamma": [0.0001, 0.001, 0.01, 0.1]}
clf = GridSearchCV(SVC(kernel="rbf"), param_grid, fit_params={"class_weight": "auto"}, n_jobs=-1)
clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator


# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target, target_names=target_names[selected_target])

print confusion_matrix(y_test, y_pred, labels=selected_target)


# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit("_", 1)[-1]
    true_name = target_names[y_test[i]].rsplit("_", 1)[-1]
    return "predicted: %s\ntrue:      %s" % (pred_name, true_name)
Example #8
param_grid = {  # opening line reconstructed; earlier grid entries truncated in the original snippet
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator

################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
print confusion_matrix(y_test, y_pred, labels=range(n_classes))

################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return "predicted: %s\ntrue:      %s" % (pred_name, true_name)
Example #9
# split the dataset into two equal parts, respecting label proportions
train, test = iter(StratifiedKFold(y, 2)).next()

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note: the problem is too easy; the hyperparameter plateau is so flat that the
# same model is selected whether it is tuned for precision or for recall, with
# ties in quality.
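The per-metric loop in Examples #9 and #12 maps onto the current API via scoring strings and cv_results_. A sketch under stated assumptions: scikit-learn >= 0.18, the digits dataset used here only to make it self-contained, and macro-averaged metrics because digits is multiclass.

from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=0)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

for score_name in ['precision_macro', 'recall_macro']:
    clf = GridSearchCV(SVC(), tuned_parameters, scoring=score_name,
                       cv=StratifiedKFold(n_splits=5))
    clf.fit(X_train, y_train)
    print("Tuned for %s; best parameters: %s" % (score_name, clf.best_params_))
    print(classification_report(y_test, clf.predict(X_test)))
    # per-candidate scores now live in clf.cv_results_ instead of grid_scores_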
Example #10
param_grid = {  # opening line reconstructed; earlier grid entries truncated in the original snippet
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})

#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')

clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels))

print "Best estimator found by grid search:"
#print clf.best_estimator

###############################################################################
# Evaluation

moto_vq_eval, plane_vq_eval = [
    np.load(file) for file in ['moto_vq_eval.npy', 'plane_vq_eval.npy']
]

y_name = ['moto'] * moto_vq_eval.shape[0] + ['plane'] * plane_vq_eval.shape[0]
y_test = [0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0]
y_test = np.array(y_test)

y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval]))

print classification_report(y_test, y_pred, labels=labels, class_names=y_name)
print confusion_matrix(y_test, y_pred)
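One API note on the report call above: in current scikit-learn, classification_report takes target_names, a list with one display name per class, rather than class_names or a per-sample name list. A tiny self-contained sketch:

import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

y_test = np.array([0, 0, 1, 1])
y_pred = np.array([0, 1, 1, 1])
print(classification_report(y_test, y_pred, target_names=['moto', 'plane']))
print(confusion_matrix(y_test, y_pred))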
Example #12
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note: the problem is too easy; the hyperparameter plateau is so flat that the
# same model is selected whether it is tuned for precision or for recall, with
# ties in quality.
Example #13
import numpy as np

from scikits.learn.svm import SVC
from scikits.learn.cross_val import StratifiedKFold
from scikits.learn.grid_search import GridSearchCV
from scikits.learn import datasets
from scikits.learn.metrics import zero_one

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{'kernel':('rbf', ), 'gamma':[1e-3, 1e-4]},
                    {'kernel':('linear', )}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2)

y_pred = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_pred = np.append(y_pred, clf.predict(X[test]))
    y_true = np.append(y_true, y[test])

classif_rate = np.mean(y_pred == y_true) * 100
print "Classification rate : %f" % classif_rate