Example #1
def test_auto_weight():
    """Test class weights for imbalanced data"""
    # compute reference metrics on the iris dataset, which is quite
    # balanced by default
    X, y = iris.data, iris.target
    clf = svm.SVC(kernel="linear").fit(X, y)
    assert_almost_equal(metrics.f1_score(y, clf.predict(X)), 0.99, 2)

    # make the same prediction using automated class_weight
    clf_auto = svm.SVC(kernel="linear").fit(X, y, class_weight="auto")
    assert_almost_equal(metrics.f1_score(y, clf_auto.predict(X)), 0.99, 2)

    # make sure that using "auto" does not change anything
    # in the balanced case
    assert_array_almost_equal(clf.coef_, clf_auto.coef_, 6)

    # build a very imbalanced dataset out of the iris data
    X_0 = X[y == 0, :]
    y_0 = y[y == 0]

    X_imbalanced = np.vstack([X] + [X_0] * 10)
    y_imbalanced = np.concatenate([y] + [y_0] * 10)

    # fit a model on the imbalanced data without class weight info
    y_pred = svm.SVC().fit(X_imbalanced, y_imbalanced).predict(X)
    assert_almost_equal(metrics.f1_score(y, y_pred), 0.88, 2)

    # fit a model with auto class_weight enabled
    clf = svm.SVC().fit(X_imbalanced, y_imbalanced, class_weight="auto")
    y_pred = clf.predict(X)
    assert_almost_equal(metrics.f1_score(y, y_pred), 0.92, 2)
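For reference, a minimal modern sketch of the same check, assuming scikit-learn >= 0.18, where class_weight moved to the estimator constructor, "auto" was renamed "balanced", and multiclass f1_score needs an explicit average:

from sklearn import datasets, metrics, svm

# load iris and fit with and without balanced class weights
X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC(kernel="linear").fit(X, y)
clf_balanced = svm.SVC(kernel="linear", class_weight="balanced").fit(X, y)

# on balanced data the two scores should be nearly identical
print(metrics.f1_score(y, clf.predict(X), average="weighted"))
print(metrics.f1_score(y, clf_balanced.predict(X), average="weighted"))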
Example #2
def benchmark(clf):
    """Fit clf on the training set, predict the test set, report timing and F1."""
    print 80 * '_'
    print "Training: "
    print clf
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print "train time: %0.3fs" % train_time

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print "test time:  %0.3fs" % test_time

    score = metrics.f1_score(y_test, pred)
    print "f1-score:   %0.3f" % score

    if hasattr(clf, 'coef_'):
        nnz = clf.coef_.nonzero()[0].shape[0]
        print "non-zero coef: %d" % nnz
        print

    if print_report:
        print "classification report:"
        print metrics.classification_report(y_test, pred,
                                            target_names=categories)

    if print_cm:
        print "confusion matrix:"
        print metrics.confusion_matrix(y_test, pred)

    print
    return score, train_time, test_time
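benchmark() relies on several module-level names that this snippet does not define. A hedged sketch of the globals it expects, with hypothetical toy data and a modern vectorizer standing in for the real train/test split:

from time import time
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

# hypothetical toy corpus standing in for the real data
categories = ['cat', 'dog']
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(
    ['the cat sat', 'cats purr softly', 'dogs bark loudly', 'a dog barks'])
y_train = np.array([0, 0, 1, 1])
X_test = vectorizer.transform(['the cat purrs', 'that dog barks'])
y_test = np.array([0, 1])
print_report, print_cm = True, True

With these defined at module level, benchmark(clf) can be called with any estimator that exposes fit and predict.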
Example #3
def test_auto_weight():
    """Test class weights for imbalanced data"""
    from scikits.learn.linear_model import LogisticRegression
    # take as dataset the two-dimensional projection of iris so that it
    # is not separable, and remove half of the samples from class 2
    X, y = iris.data[:, :2], iris.target
    unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(), LogisticRegression()):
        # check that the score is better when class_weight='auto' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced],
                         class_weight={}).predict(X)
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],
                                  class_weight='auto').predict(X)
        assert np.argmin(clf.class_weight) == 0
        assert metrics.f1_score(y, y_pred) <= metrics.f1_score(y, y_pred_balanced)
Example #4
def test_auto_weight():
    """Test class weights for imbalanced data"""
    from scikits.learn.linear_model import LogisticRegression
    # take as dataset the two-dimensional projection of iris so that it
    # is not separable, and remove half of the samples from class 2
    from scikits.learn.svm.base import _get_class_weight
    X, y = iris.data[:, :2], iris.target
    unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

    assert np.argmax(_get_class_weight('auto', y[unbalanced])[0]) == 2

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(), LogisticRegression()):
        # check that the score is better when class_weight='auto' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced],
                         class_weight={}).predict(X)
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],
                                  class_weight='auto').predict(X)
        assert metrics.f1_score(y, y_pred) <= metrics.f1_score(y, y_pred_balanced)
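A hedged modern re-creation of the comparison above, assuming scikit-learn >= 0.18 where class_weight is a constructor parameter and "auto" is now "balanced":

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
X = X[:, :2]  # two-dimensional projection so the classes overlap
# drop every other sample of class 2 to unbalance the classes
unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

clf_plain = SVC(kernel='linear').fit(X[unbalanced], y[unbalanced])
clf_bal = SVC(kernel='linear',
              class_weight='balanced').fit(X[unbalanced], y[unbalanced])
print(f1_score(y, clf_plain.predict(X), average='weighted'))
print(f1_score(y, clf_bal.predict(X), average='weighted'))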
Example #5
    def test_auto_weight(self):
        """Test class weights for imbalanced data"""
        # compute reference metrics on the iris dataset, which is quite
        # balanced by default
        X, y = iris.data, iris.target
        X = preprocessing.scale(X)
        idx = np.arange(X.shape[0])
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        clf = self.factory(alpha=0.0001, n_iter=1000).fit(X, y)
        assert_approx_equal(metrics.f1_score(y, clf.predict(X)), 0.96, 2)

        # make the same prediction using automated class_weight
        clf_auto = self.factory(alpha=0.0001,
                                n_iter=1000).fit(X, y, class_weight="auto")
        assert_approx_equal(metrics.f1_score(y, clf_auto.predict(X)), 0.96, 2)

        # make sure that using "auto" does not change anything
        # in the balanced case
        assert_array_almost_equal(clf.coef_, clf_auto.coef_, 6)

        # build a very imbalanced dataset out of the iris data
        X_0 = X[y == 0, :]
        y_0 = y[y == 0]

        X_imbalanced = np.vstack([X] + [X_0] * 10)
        y_imbalanced = np.concatenate([y] + [y_0] * 10)

        # fit a model on the imbalanced data without class weight info
        clf = self.factory(n_iter=1000)
        clf.fit(X_imbalanced, y_imbalanced)
        y_pred = clf.predict(X)
        assert metrics.f1_score(y, y_pred) < 0.96

        # fit a model with auto class_weight enabled
        clf = self.factory(n_iter=1000)
        clf.fit(X_imbalanced, y_imbalanced, class_weight="auto")
        y_pred = clf.predict(X)
        assert metrics.f1_score(y, y_pred) > 0.96
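self.factory here presumably builds an SGD-style classifier. A sketch of the equivalent call against modern scikit-learn (assumptions: the factory wraps SGDClassifier, n_iter became max_iter, and class_weight moved to the constructor):

from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import scale

X, y = load_iris(return_X_y=True)
X = scale(X)
clf = SGDClassifier(alpha=0.0001, max_iter=1000,
                    class_weight='balanced').fit(X, y)
print(f1_score(y, clf.predict(X), average='weighted'))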
Example #6
def benchmark(clf):
    """Fit clf on the training set, predict the test set, report timing and F1."""
    print 80 * '_'
    print "Training: "
    print clf
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print "train time: %0.3fs" % train_time

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print "test time:  %0.3fs" % test_time

    score = metrics.f1_score(y_test, pred)
    print "f1-score:   %0.3f" % score

    if hasattr(clf, 'coef_'):
        nnz = clf.coef_.nonzero()[0].shape[0]
        print "non-zero coef: %d" % nnz

        if opts.print_top10:
            print "top 10 keywords per class:"
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i, :])[-10:]
                print trim("%s: %s" % (category, " ".join(vocabulary[top10])))
        print

    if opts.print_report:
        print "classification report:"
        print metrics.classification_report(y_test, pred,
                                            target_names=categories)

    if opts.print_cm:
        print "confusion matrix:"
        print metrics.confusion_matrix(y_test, pred)

    print
    return score, train_time, test_time
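The top-10 block above indexes a module-level vocabulary array that maps feature indices back to terms. With a modern vectorizer it could be built like this (sketch only; get_feature_names_out assumes scikit-learn >= 1.0):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(['some training text', 'more documents here'])
vocabulary = np.array(vectorizer.get_feature_names_out())  # index -> term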
Example #7
print "Extracting features from the dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape

print "Predicting the outcomes of the testing set"
t0 = time()
pred = clf.predict(X_test)
print "done in %fs" % (time() - t0)
print "f1_score: %0.3f" % f1_score(y_test, pred)

print "Classification report on test set:"
print classification_report(y_test, pred,
                            target_names=news_test.target_names)

cm = confusion_matrix(y_test, pred)
print "Confusion matrix:"
print cm

# Show confusion matrix
pl.matshow(cm)
pl.title('Confusion matrix')
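A self-contained modern sketch of the same confusion-matrix plot (assumes scikit-learn >= 0.22 for ConfusionMatrixDisplay, plus matplotlib; toy labels stand in for the real predictions):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

y_test = np.array([0, 1, 1, 0, 1])  # toy labels, illustration only
pred = np.array([0, 1, 0, 0, 1])
cm = confusion_matrix(y_test, pred)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()
plt.title('Confusion matrix')
plt.show()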
Example #8
def evaluate(X, Y, hyperparams):
    """
    Evaluate X and Y using K-fold cross-validation, and return the F1 score.
    TODO: Hyperparameters should be a kwarg and passed to the classifier constructor.
    """
    # LeaveOneOut(len(Y)) is a drop-in alternative, but K-fold is much cheaper
    from scikits.learn.cross_val import KFold
    K = 5
    kfold = KFold(len(Y), K)

    all_y_test = []
    all_y_test_predict = []

    for train, test in kfold:
        # train and test are boolean masks; convert them to index lists
        trainidx = [idx for idx in range(len(train)) if train[idx]]
        testidx = [idx for idx in range(len(test)) if test[idx]]
        X_train, X_test = X[trainidx], X[testidx]
        y_train, y_test = Y[trainidx], Y[testidx]

        if len(frozenset(y_train)) == 1:
            # skip this fold if the training set contains only one class
            continue

        clf = fit_classifier(X_train, y_train, hyperparams)
        y_test_predict = clf.predict(X_test)

        all_y_test.append(y_test)
        all_y_test_predict.append(y_test_predict)

    # concatenate the per-fold predictions and score them in one pass
    y_test = numpy.hstack(all_y_test)
    y_test_predict = numpy.hstack(all_y_test_predict)
    assert y_test.ndim == 1
    assert y_test_predict.ndim == 1
    assert Y.shape == y_test.shape
    assert y_test.shape == y_test_predict.shape
    f1 = f1_score(y_test, y_test_predict)
    return f1
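A hedged modern sketch of the same out-of-fold pattern (assumes scikit-learn >= 0.18, where cross_val_predict gathers the per-fold predictions in one call):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict

X, Y = make_classification(n_samples=100, random_state=0)
y_pred = cross_val_predict(LogisticRegression(), X, Y, cv=5)
print(f1_score(Y, y_pred))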