Example #1
def test_grid_search_labels():
    # Check that the ValueError raised when labels is None propagates to
    # GridSearchCV, and that labels is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [
        LeaveOneLabelOut(),
        LeavePLabelOut(2),
        LabelKFold(),
        LabelShuffleSplit()
    ]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None", gs.fit,
                             X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
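Note: in released scikit-learn (0.18 and later) LabelKFold was renamed GroupKFold and the labels argument became groups; a minimal sketch of the same check against the current names, with made-up data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
groups = np.random.RandomState(0).randint(0, 3, 15)

gs = GridSearchCV(LinearSVC(random_state=0), {'C': [1]}, cv=GroupKFold(n_splits=3))
# gs.fit(X, y) would raise ValueError: the group-aware splitter needs groups
gs.fit(X, y, groups=groups)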
Example #2
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 5, 15)

    cvs = [
        LeaveOneLabelOut(),
        LeaveOneOut(),
        LabelKFold(),
        StratifiedKFold(),
        StratifiedShuffleSplit(n_iter=10, random_state=0)
    ]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(LinearSVC(random_state=0),
                          param_grid={'C': [1, 10]},
                          cv=inner_cv)
        cross_val_score(gs,
                        X=X,
                        y=y,
                        labels=labels,
                        cv=outer_cv,
                        fit_params={'labels': labels})
Example #3
def test_cross_val_score_predict_labels():
    # Check that the ValueError raised when labels is None propagates to
    # cross_val_score and cross_val_predict, and that labels is correctly
    # passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    label_cvs = [
        LeaveOneLabelOut(),
        LeavePLabelOut(2),
        LabelKFold(),
        LabelShuffleSplit()
    ]
    for cv in label_cvs:
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             cross_val_score,
                             estimator=clf,
                             X=X,
                             y=y,
                             cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             cross_val_predict,
                             estimator=clf,
                             X=X,
                             y=y,
                             cv=cv)
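For reference, the released API spells the same calls with groups instead of labels; a hedged sketch with synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GroupKFold, cross_val_predict, cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
groups = np.random.RandomState(0).randint(0, 4, 20)
clf = SVC(kernel="linear")

# both helpers forward the groups array to the group-aware splitter
scores = cross_val_score(clf, X, y, groups=groups, cv=GroupKFold(n_splits=3))
preds = cross_val_predict(clf, X, y, groups=groups, cv=GroupKFold(n_splits=3))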
Example #4
def plot_label_kfold():
    from sklearn.model_selection import LabelKFold
    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

    plt.figure(figsize=(10, 2))
    plt.title("LabelKFold")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_folds = 12
    n_samples = 12
    n_iter = 3
    n_samples_per_fold = 1

    cv = LabelKFold(n_splits=3)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(cv.split(range(12), labels=labels)):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(bottom=range(n_iter),
                          width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold,
                          height=.6,
                          color=colors,
                          hatch="//")
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.barh(bottom=[n_iter] * n_folds,
              width=[1 - 0.1] * n_folds,
              left=np.arange(n_folds) * n_samples_per_fold,
              height=.6,
              color="w")

    for i in range(12):
        axes.text((i + .5) * n_samples_per_fold,
                  3.5,
                  "%d" % labels[i],
                  horizontalalignment="center")
    #ax.set_ylim(4, -0.1)

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(["Split %d" % x
                          for x in range(1, n_iter + 1)] + ["Group"])
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3))
    plt.tight_layout()
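The helper above assumes numpy and matplotlib.pyplot are already imported at module level (as np and plt) and that the pre-0.18 model_selection.LabelKFold import resolves; a hypothetical call to render the figure:

import matplotlib.pyplot as plt
import numpy as np

plot_label_kfold()
plt.show()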
Example #5
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)

    ideal_n_labels_per_fold = n_samples // n_folds

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = LabelKFold(n_folds=n_folds)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = [
        'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert',
        'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura',
        'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert',
        'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary',
        'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
        'Silvia'
    ]

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next,
                         LabelKFold(n_folds=3).split(X, y, labels))
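The same guard exists under the GroupKFold name in released scikit-learn; a small sketch that triggers it (the exact message wording varies slightly between versions):

import numpy as np
from sklearn.model_selection import GroupKFold

groups = np.array([1, 1, 1, 2, 2])
X = y = np.ones(len(groups))
try:
    next(GroupKFold(n_splits=3).split(X, y, groups))
except ValueError as exc:
    print(exc)  # cannot have more splits than distinct groups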
Example #6
# Fragment from a larger script: ttype, expr, residues, X and n_layers are
# assumed to be defined earlier in the original file.
if ttype == 'full':
    y = expr
elif ttype == 'res':
    y = residues

clf = MLPRegressor(
    hidden_layer_sizes=tuple([50] * n_layers),
    alpha=0.001,
    learning_rate_init=0.01,
    activation='logistic',
    random_state=0,
    shuffle=True,
)

# Old-style (sklearn.cross_validation) LabelKFold: the constructor takes the
# label array directly and the resulting object is itself iterable.
cv = LabelKFold(y, n_folds=2)
for train, test in cv:
    pass
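Elsewhere on this page the newer model_selection interface is used instead, where the splitter is constructed without data and the label array is passed to split; in released scikit-learn that class is called GroupKFold. A rough, hedged equivalent of the loop above, with a made-up subject_ids array standing in for the label vector:

import numpy as np
from sklearn.model_selection import GroupKFold  # LabelKFold in the pre-0.18 dev API

rng = np.random.RandomState(0)
X = rng.randn(10, 3)
y = rng.randn(10)
subject_ids = np.repeat([0, 1], 5)   # hypothetical group/label vector

cv = GroupKFold(n_splits=2)
for train, test in cv.split(X, y, groups=subject_ids):
    pass  # train/test are index arrays; each group stays on one side of the split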


def confidence_interval(data, conf=0.95):
    # assumes numpy as np and scipy.stats are imported at module level
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + conf) / 2., n - 1)  # two-sided t critical value
    return m, h


# train/test are the index arrays left over from the last CV fold above
mlp = clf.fit(X[train], y[train])
y_pred = mlp.predict(X[test])
y_resub = mlp.predict(X[train])
from sklearn.model_selection import LabelKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
import copula_ordinal_regression as cor
import numpy as np

# load the processed disfa database
X, y, S = cor.load_disfa()

# select the first 3 action units (AU1,AU2,AU4)
y = y[:, [0, 1, 2]]

# select estimator and number of folds for cross validation
clf = cor.COR(max_iter=5000, verbose=0)
cv = LabelKFold(9)

# define parameter grid
parameter = {
    'margins': ['normcdf', 'sigmoid'],
    'C': [0] + list(10. ** np.arange(0, 8)),  # 0 plus powers of ten, 1 .. 1e7
    'w_nodes': np.linspace(0, 1, 5),
}

# apply grid search to find the optimal hyperparameters
# (refit=True so that best_estimator_ is available for the prediction step below)
clf = GridSearchCV(clf, parameter, cv=cv, n_jobs=-1, verbose=10, refit=True)
clf.fit(X, y, S)
print(clf.best_params_)

# apply cross-validation using the best hyperparameters
y_hat = cross_val_predict(clf.best_estimator_,
                          X,