Ejemplo n.º 1
0
def test_dense_tf_idf():
    hv = HashingVectorizer(dim=1000, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train and test a classifier
    clf = DenseLinearSVC(C=10).fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict([X[0]]), [-1])
    assert_equal(clf.predict([X[-1]]), [1])
Ejemplo n.º 2
0
def test_dense_tf_idf():
    hv = HashingVectorizer(dim=1000, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train and test a classifier
    clf = DenseLinearSVC().fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict([X[0]]), [-1])
    assert_equal(clf.predict([X[-1]]), [1])
Ejemplo n.º 3
0
def test_grid_search_error():
    """Test that grid search will capture errors on data with different
    length"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_[:180], y_)
Ejemplo n.º 4
0
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
Ejemplo n.º 5
0
def test_grid_search_sparse_score_func():
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
Ejemplo n.º 6
0
def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None):
    min_c = l1_min_c(X, y, loss, fit_intercept, intercept_scaling)

    clf = {
        ('log', False): LogisticRegression(penalty='l1'),
        ('log', True): SparseLogRegression(penalty='l1'),
        ('l2', False): LinearSVC(loss='l2', penalty='l1', dual=False),
        ('l2', True): SparseSVC(loss='l2', penalty='l1', dual=False),
    }[loss, sp.issparse(X)]

    clf.fit_intercept = fit_intercept
    clf.intercept_scaling = intercept_scaling

    clf.C = min_c
    clf.fit(X, y)
    assert (np.asanyarray(clf.coef_) == 0).all()
    assert (np.asanyarray(clf.intercept_) == 0).all()

    clf.C = min_c * 1.01
    clf.fit(X, y)
    assert (np.asanyarray(clf.coef_) != 0).any() or \
           (np.asanyarray(clf.intercept_) != 0).any()
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
    return err, train_time, test_time

######################################################################
## Train Liblinear model
libsvm_parameters = {
    'loss': 'l2',
    'penalty': 'l2',
    'C': 1000,
    'dual': False,
    'eps': 1e-3,
    }
libsvm_res = benchmark(LinearSVC(**libsvm_parameters))
libsvm_err, libsvm_train_time, libsvm_test_time = libsvm_res

######################################################################
## Train SGD model
sgd_parameters = {
    'alpha': 0.001,
    'n_iter': 2,
    }
sgd_err, sgd_train_time, sgd_test_time = benchmark(SGD(**sgd_parameters))

######################################################################
## Train GNB model
gnb_err, gnb_train_time, gnb_test_time = benchmark(GNB())

# to leverage efficiently a sparse datastracture; hence we use a dense
# representation of a text dataset. Efficient handling of sparse data
# structures should be expected in an upcoming version of scikits.learn
print "n_samples: %d, n_features: %d" % news_train.data.shape

print "Training a linear classification model with L1 penalty... "
parameters = {
    'loss': 'l1',
    'penalty': 'l2',
    'C': 10,
    'dual': True,
    'eps': 1e-4,
}
print "parameters:", parameters
t0 = time()
clf = LinearSVC(**parameters).fit(news_train.data, news_train.target)
print "done in %fs" % (time() - t0)
print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)

print "Loading 20 newsgroups test set... "
t0 = time()
news_test = load_mlcomp('20news-18828', 'test')
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
t0 = time()
pred = clf.predict(news_test.data)
print "done in %fs" % (time() - t0)
print "Classification accuracy: %f" % (np.mean(pred == news_test.target) * 100)

cm = confusion_matrix(news_test.target, pred)
Ejemplo n.º 9
0
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
    return err, train_time, test_time


######################################################################
## Train Liblinear model
liblinear_parameters = {
    'loss': 'l2',
    'penalty': 'l2',
    'C': 1000,
    'dual': False,
    'tol': 1e-3,
}
liblinear_res = benchmark(LinearSVC(**liblinear_parameters))
liblinear_err, liblinear_train_time, liblinear_test_time = liblinear_res

######################################################################
## Train GNB model
gnb_err, gnb_train_time, gnb_test_time = benchmark(GNB())

######################################################################
## Train SGD model
sgd_parameters = {
    'alpha': 0.001,
    'n_iter': 2,
}
sgd_err, sgd_train_time, sgd_test_time = benchmark(
    SGDClassifier(**sgd_parameters))
Ejemplo n.º 10
0
import matplotlib.pyplot as pl
from scikits.learn.decomposition import RandomizedPCA
from scikits.learn.svm import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics import classification_report
from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings
pipeline = Pipeline([('extr', InfinitivesExtractor()),
                     ('svc', LinearSVC(multi_class=True))])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator