def plot_label_kfold():
    """Plot the train/test assignment of every sample under ``LabelKFold``.

    Twelve samples spread over four groups are split three ways.  Each split
    is drawn as one row of hatched bars (grey for test samples, white for
    training samples), and one extra row prints the group id of each sample.
    The figure is created through ``plt``; nothing is returned.
    """
    from sklearn.model_selection import LabelKFold

    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    n_samples = 12
    n_folds = 12  # number of bar columns, one per sample
    n_iter = 3
    n_samples_per_fold = 1
    bar_width = 1 - 0.1

    plt.figure(figsize=(10, 2))
    plt.title("LabelKFold")
    axes = plt.gca()
    axes.set_frame_on(False)

    # Row r of `membership` encodes split r: 1 = train, 2 = test, 0 = unused.
    membership = np.zeros((n_iter, n_samples))
    splitter = LabelKFold(n_splits=3)
    splits = splitter.split(range(12), labels=labels)
    for row, (train, test) in enumerate(splits):
        membership[row, train] = 1
        membership[row, test] = 2

    for col in range(n_folds):
        column = membership[:, col]
        # test is grey
        colors = ["grey" if value == 2 else "white" for value in column]
        boxes = axes.barh(bottom=range(n_iter), width=[bar_width] * n_iter,
                          left=col * n_samples_per_fold, height=.6,
                          color=colors, hatch="//")
        # not selected has no hatch
        for unused in np.where(column == 0)[0]:
            boxes[unused].set_hatch("")

    # One plain white bar per sample for the row that carries the group ids.
    axes.barh(bottom=[n_iter] * n_folds, width=[bar_width] * n_folds,
              left=np.arange(n_folds) * n_samples_per_fold, height=.6,
              color="w")
    for pos, label in enumerate(labels):
        axes.text((pos + .5) * n_samples_per_fold, 3.5, "%d" % label,
                  horizontalalignment="center")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    row_names = ["Split %d" % k for k in range(1, n_iter + 1)]
    row_names.append("Group")
    axes.set_yticklabels(row_names)
    # `boxes` still holds the bars of the last column drawn above.
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"],
               loc=(1, .3))
    plt.tight_layout()
def test_nested_cv():
    # Nested cross validation must run for every (inner, outer) pairing of
    # the splitters listed below, including a splitter paired with itself.
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    rng = np.random.RandomState(0)
    labels = rng.randint(0, 5, 15)

    cvs = [
        LeaveOneLabelOut(),
        LeaveOneOut(),
        LabelKFold(),
        StratifiedKFold(),
        StratifiedShuffleSplit(n_iter=10, random_state=0),
    ]
    search_space = {'C': [1, 10]}

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        # The inner search receives the labels through fit_params, the outer
        # loop through the `labels` argument of cross_val_score.
        inner_search = GridSearchCV(LinearSVC(random_state=0),
                                    param_grid=search_space, cv=inner_cv)
        cross_val_score(inner_search, X=X, y=y, labels=labels, cv=outer_cv,
                        fit_params={'labels': labels})
def test_grid_search_labels():
    # GridSearchCV.fit must surface the splitter's ValueError when a
    # label-based cv gets no labels, and must hand labels through to the cv
    # object when they are supplied.
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    rng = np.random.RandomState(0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}
    missing_labels_msg = "The labels parameter should not be None"

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for label_cv in label_cvs:
        search = GridSearchCV(clf, grid, cv=label_cv)
        assert_raise_message(ValueError, missing_labels_msg,
                             search.fit, X, y)
        search.fit(X, y, labels)

    # Splitters that do not use labels: fitting without them should not
    # raise an error.
    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for plain_cv in non_label_cvs:
        search = GridSearchCV(clf, grid, cv=plain_cv)
        search.fit(X, y)
def test_cross_val_score_predict_labels():
    # cross_val_score and cross_val_predict must both surface the splitter's
    # ValueError when a label-based cv object is used without labels.
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
    clf = SVC(kernel="linear")
    missing_labels_msg = "The labels parameter should not be None"

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        # Same call order as before: score first, then predict.
        for cv_func in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError, missing_labels_msg,
                                 cv_func, estimator=clf, X=X, y=y, cv=cv)
def plot_label_kfold():
    """Visualise how ``LabelKFold`` distributes samples over three splits.

    A new pyplot figure is created with one row of bars per split (test
    samples grey, training samples white, both hatched) and an additional
    row whose text shows the group each sample belongs to.  Returns None.
    """
    from sklearn.model_selection import LabelKFold

    TRAIN, TEST = 1, 2  # codes stored in `mask`; 0 means "not selected"
    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    n_folds = n_samples = 12
    n_iter = 3
    n_samples_per_fold = 1

    plt.figure(figsize=(10, 2))
    plt.title("LabelKFold")
    axes = plt.gca()
    axes.set_frame_on(False)

    cv = LabelKFold(n_splits=3)
    mask = np.zeros((n_iter, n_samples))
    for split_no, (train, test) in enumerate(
            cv.split(range(12), labels=labels)):
        mask[split_no, train] = TRAIN
        mask[split_no, test] = TEST

    split_rows = range(n_iter)
    for sample_no in range(n_folds):
        assignment = mask[:, sample_no]
        # test is grey
        colors = ["grey" if state == TEST else "white"
                  for state in assignment]
        boxes = axes.barh(
            bottom=split_rows,
            width=[1 - 0.1] * n_iter,
            left=sample_no * n_samples_per_fold,
            height=.6,
            color=colors,
            hatch="//",
        )
        # not selected has no hatch
        for j in np.flatnonzero(assignment == 0):
            boxes[j].set_hatch("")

    # Blank white bars for the row that displays the group ids.
    axes.barh(
        bottom=[n_iter] * n_folds,
        width=[1 - 0.1] * n_folds,
        left=np.arange(n_folds) * n_samples_per_fold,
        height=.6,
        color="w",
    )
    for sample_no, group in enumerate(labels):
        x_center = (sample_no + .5) * n_samples_per_fold
        axes.text(x_center, 3.5, "%d" % group, horizontalalignment="center")

    # Axis cosmetics: first split on top, ticks centred on the bars.
    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(
        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])

    # The legend handles are taken from the bars of the last sample column.
    legend_handles = [boxes[0], boxes[1]]
    plt.legend(legend_handles, ["Training set", "Test set"], loc=(1, .3))
    plt.tight_layout()
def _check_label_kfold_partition(lkf, labels, n_folds):
    # Assert that `lkf` splits `labels` into balanced, label-disjoint folds.
    #
    # lkf     : splitter exposing split(X, y, labels)
    # labels  : ndarray with one label per sample
    # n_folds : number of folds `lkf` was configured with
    n_samples = len(labels)
    X = y = np.ones(n_samples)
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)


def test_label_kfold():
    # LabelKFold must produce folds of roughly equal size that never share a
    # label, for integer labels as well as for string labels, and must refuse
    # to split when there are fewer labels than folds.
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5
    lkf = LabelKFold(n_folds=n_folds)

    # Random integer labels
    labels = rng.randint(0, n_labels, n_samples)
    _check_label_kfold_partition(lkf, labels, n_folds)

    # String labels, some of them repeated
    labels = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
                       'Silvia'])
    _check_label_kfold_partition(lkf, labels, n_folds)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, LabelKFold(n_folds=3).split(X, y, labels))
def test_label_kfold():
    # Check that LabelKFold yields folds of roughly equal size, that every
    # label ends up in exactly one fold, and that splitting fails when there
    # are more folds than labels.  Run for integer and for string labels.
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)
    ideal_n_labels_per_fold = n_samples // n_folds

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = LabelKFold(n_folds=n_folds)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data.  This has to be an ndarray rather than a plain
    # list: the `labels == label` mask further down relies on elementwise
    # comparison, which a list does not provide.
    labels = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
                       'Silvia'])

    n_samples = len(labels)
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, LabelKFold(n_folds=3).split(X, y, labels))
# Select the regression target for this run.
# NOTE(review): there is no else branch, so any other `ttype` leaves `y`
# at whatever value it had before this point (or undefined) -- confirm that
# callers only ever pass 'full' or 'res'.
if ttype == 'full':
    y = expr
elif ttype == 'res':
    y = residues

clf = MLPRegressor(
    hidden_layer_sizes=(50,) * n_layers,
    alpha=0.001,
    learning_rate_init=0.01,
    activation='logistic',
    random_state=0,
    shuffle=True,
)

cv = LabelKFold(y, n_folds=2)
# Run the iterator to exhaustion so that `train` and `test` end up holding
# the indices of the last split it yields.
for train, test in cv:
    pass


def confidence_interval(data, conf=0.95):
    """Return the mean of *data* and the half-width of its confidence interval.

    Parameters
    ----------
    data : sequence of numbers
        The sample; it is converted to a float NumPy array.
    conf : float, default 0.95
        Confidence level of the interval.

    Returns
    -------
    (m, h) : tuple of floats
        ``m`` is the sample mean and ``h`` the half-width, i.e. the standard
        error of the mean times the Student-t quantile at ``(1 + conf) / 2``
        with ``len(data) - 1`` degrees of freedom.
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    # Use the public `ppf`; the private `_ppf` skips argument validation and
    # is not part of the supported scipy API.
    h = se * scipy.stats.t.ppf((1 + conf) / 2., n - 1)
    return m, h


# Fit on the training indices, then predict both the held-out samples and
# the training samples themselves (resubstitution).
mlp = clf.fit(X[train], y[train])
y_pred = mlp.predict(X[test])
y_resub = mlp.predict(X[train])
from sklearn.model_selection import LabelKFold from sklearn.model_selection import GridSearchCV from sklearn.model_selection import cross_val_predict import copula_ordinal_regression as cor import numpy as np # load the processed disfa database X, y, S = cor.load_disfa() # select the first 3 action units (AU1,AU2,AU4) y = y[:, [0, 1, 2]] # select estimator and number of folds for cross validation clf = cor.COR(max_iter=5000, verbose=0) cv = LabelKFold(9) # define parameter grid parameter = { 'margins': ['normcdf', 'sigmoid'], 'C': [0] + 10.**np.arange(0, 8), 'w_nodes': np.linspace(0, 1, 5), } # apply grid search to find optimal hyper parameters clf = GridSearchCV(clf, parameter, cv=cv, n_jobs=-1, verbose=10, refit=False) clf.fit(X, y, S) print clf.best_params_ # apply cross validation using best hyper parameters y_hat = cross_val_predict(clf.best_estimator_, X,