def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # fit on dense input
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    # sparse input should yield the same best C and the same predictions
    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y_[:180]),
    #                            cv.score(X_[:180], y_[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = Scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C
    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)

def train_svm(C=0.1, grid=False):
    ds = PascalSegmentation()
    svm = LinearSVC(C=C, dual=False, class_weight='auto')

    if grid:
        data_train = load_pascal("kTrain")
        X, y = shuffle(data_train.X, data_train.Y)
        # prepare leave-one-label-out by assigning labels to images
        image_indicators = np.hstack([np.repeat(i, len(x))
                                      for i, x in enumerate(X)])
        # go down to only 5 "folds"
        labels = image_indicators % 5
        X, y = np.vstack(X), np.hstack(y)

        cv = LeavePLabelOut(labels=labels, p=1)
        param_grid = {'C': 10. ** np.arange(-3, 3)}
        scorer = Scorer(recall_score, average="macro")
        grid_search = GridSearchCV(svm, param_grid=param_grid, cv=cv,
                                   verbose=10, scoring=scorer, n_jobs=-1)
        grid_search.fit(X, y)
    else:
        data_train = load_pascal("train")
        X, y = np.vstack(data_train.X), np.hstack(data_train.Y)
        svm.fit(X, y)
        print(svm.score(X, y))
        eval_on_sp(ds, data_train, [svm.predict(x) for x in data_train.X],
                   print_results=True)

        data_val = load_pascal("val")
        eval_on_sp(ds, data_val, [svm.predict(x) for x in data_val.X],
                   print_results=True)

def test_raises_on_score_list():
    # test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = Scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)

def test_classification_scores():
    X, y = make_blobs(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['f1'](clf, X_test, y_test)
    score2 = f1_score(y_test, clf.predict(X_test))
    assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = Scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(fbeta_score)

def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, "accuracy", cv)
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, "accuracy", cv, labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = Scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
        random_state=0)
    assert_almost_equal(score_label, .95, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2, indices=True)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, "accuracy", cv_sparse,
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, "accuracy", cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.4)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, score_func=accuracy_score, cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.4)

features = csr_matrix(features)
# features = features[train_index]
# salaries = salaries[train_index]
print "features", features.shape
print "valid features", validation_features.shape


def log_mean_absolute_error(y_true, y_pred):
    # targets are modelled in log space; exponentiate back before
    # computing the mean absolute error in the original units
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))

metric = dio.error_metric
error_scorer = Scorer(log_mean_absolute_error, greater_is_better=False)

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    features = ch2.fit_transform(features, salaries)
    validation_features = ch2.transform(validation_features)
    print "done in %fs" % (time() - t0)
    print
# features = features.toarray()
# validation_features = validation_features.toarray()
# print "features", features.shape
# print "valid features", validation_features.shape