Example No. 1
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)

    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y_[:180]),
    #                            cv.score(X_[:180], y_[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = Scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
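In later scikit-learn releases the `Scorer` class was replaced by `make_scorer`, which covers the same greater-is-worse case. Assuming such a release, the loss above could be wrapped like this (a sketch, not part of the original test):

from sklearn.metrics import make_scorer, f1_score

def f1_loss(y_true, y_pred):
    return -f1_score(y_true, y_pred)

# greater_is_better=False makes the scorer negate the value it returns,
# so grid search can keep maximizing while the underlying loss is minimized.
f1_loss_scorer = make_scorer(f1_loss, greater_is_better=False)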
Example No. 2
def train_svm(C=0.1, grid=False):
    ds = PascalSegmentation()
    svm = LinearSVC(C=C, dual=False, class_weight='auto')

    if grid:
        data_train = load_pascal("kTrain")
        X, y = shuffle(data_train.X, data_train.Y)
        # prepare leave-one-label-out by assigning labels to images
        image_indicators = np.hstack([np.repeat(i, len(x)) for i, x in
                                      enumerate(X)])
        # go down to only 5 "folds"
        labels = image_indicators % 5
        X, y = np.vstack(X), np.hstack(y)

        cv = LeavePLabelOut(labels=labels, p=1)
        param_grid = {'C': 10. ** np.arange(-3, 3)}
        scorer = Scorer(recall_score, average="macro")
        grid_search = GridSearchCV(svm, param_grid=param_grid, cv=cv,
                                   verbose=10, scoring=scorer, n_jobs=-1)
        grid_search.fit(X, y)
    else:
        data_train = load_pascal("train")
        X, y = np.vstack(data_train.X), np.hstack(data_train.Y)
        svm.fit(X, y)
        print(svm.score(X, y))
        eval_on_sp(ds, data_train, [svm.predict(x) for x in data_train.X],
                   print_results=True)

        data_val = load_pascal("val")
        eval_on_sp(ds, data_val, [svm.predict(x) for x in data_val.X],
                   print_results=True)
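The `image_indicators` construction above gives every per-image sample the index of the image it came from, and `% 5` then folds those indices into at most five cross-validation groups. A small self-contained illustration with made-up image sizes (the shapes here are hypothetical):

import numpy as np

# three "images" with 3, 2 and 4 samples each
X = [np.zeros((3, 2)), np.zeros((2, 2)), np.zeros((4, 2))]
image_indicators = np.hstack([np.repeat(i, len(x)) for i, x in enumerate(X)])
print(image_indicators)      # [0 0 0 1 1 2 2 2 2]
print(image_indicators % 5)  # unchanged here; with more than 5 images it wraps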
Example No. 3
def test_raises_on_score_list():
    # test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = Scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
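The error being asserted comes from the metric's return shape: with `average=None`, `f1_score` returns one value per class rather than a scalar, so cross-validation has no single number to aggregate. For example:

import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([0, 1, 2, 0, 1, 2])
y_pred = np.array([0, 2, 1, 0, 0, 1])
print(f1_score(y_true, y_pred, average=None))  # one F1 value per class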
Example No. 4
def test_classification_scores():
    X, y = make_blobs(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['f1'](clf, X_test, y_test)
    score2 = f1_score(y_test, clf.predict(X_test))
    assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = Scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(scorer)
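The kwargs forwarding and pickling that this test exercises can be sketched with a plain module-level class (only an illustration, not scikit-learn's actual Scorer implementation):

class SimpleScorer(object):
    def __init__(self, score_func, **kwargs):
        # store the metric and its extra arguments, e.g. beta=2
        self.score_func = score_func
        self.kwargs = kwargs

    def __call__(self, estimator, X, y):
        # score the estimator's predictions with the stored arguments
        return self.score_func(y, estimator.predict(X), **self.kwargs)

Because the class lives at module level and stores only picklable references, pickle.dumps on an instance round-trips, which is what the pickling assertion above checks.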
Example No. 5
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, "accuracy", cv)
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, "accuracy", cv, labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = Scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
        random_state=0)
    assert_almost_equal(score_label, .95, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2, indices=True)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, "accuracy", cv_sparse,
        labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, "accuracy", cv)

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.4)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, score_func=accuracy_score, cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.4)
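The p-value bounds asserted throughout come from the permutation-test estimate that scikit-learn documents as (C + 1) / (n_permutations + 1), where C counts permutations of y that score at least as well as the unpermuted data. A standalone sketch of that computation:

import numpy as np

def permutation_pvalue(true_score, permutation_scores):
    # count permuted runs that match or beat the real score, with the
    # +1 correction so the estimate is never exactly zero
    permutation_scores = np.asarray(permutation_scores)
    c = np.sum(permutation_scores >= true_score)
    return (c + 1.0) / (len(permutation_scores) + 1.0)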
Example No. 6
features = csr_matrix(features)

#features = features[train_index]
#salaries = salaries[train_index]

print "features", features.shape
print "valid features", validation_features.shape


def log_mean_absolute_error(y_true, y_pred):
    # targets are log-transformed salaries, so exponentiate back and
    # report the error in the original salary units
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


metric = dio.error_metric
error_scorer = Scorer(log_mean_absolute_error, greater_is_better=False)
if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    features = ch2.fit_transform(features, salaries)
    validation_features = ch2.transform(validation_features)
    print "done in %fs" % (time() - t0)
    print

#features = features.toarray()
#validation_features = validation_features.toarray()

#print "features", features.shape
#print "valid features", validation_features.shape