def plot_label_kfold():
    """Plot which samples ``LabelKFold`` puts in the training and test sets.

    A new 10x2 inch figure titled "LabelKFold" is created through pyplot.
    Each of the three splits is drawn as one row of bars, one bar per
    sample: test samples are grey, training samples are white, and both are
    hatched.  A final row of plain white bars carries each sample's label as
    text.

    Nothing is returned; the figure is left as pyplot's current figure.
    """
    from sklearn.model_selection import LabelKFold
    # NOTE(review): later scikit-learn releases appear to expose this
    # splitter under a different name -- confirm against the pinned version.

    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    n_samples = len(labels)
    n_iter = 3
    n_samples_per_fold = 1
    bar_width = 1 - 0.1  # leave a small gap between neighbouring bars
    bar_height = .6

    plt.figure(figsize=(10, 2))
    plt.title("LabelKFold")
    axes = plt.gca()
    axes.set_frame_on(False)

    # mask[split, sample]: 0 = not selected, 1 = training, 2 = test
    cv = LabelKFold(n_splits=n_iter)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(cv.split(range(n_samples),
                                               labels=labels)):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_samples):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # The bar positions are passed positionally because the keyword for
        # the first argument differs between matplotlib versions.  The
        # alignment and edge colour are spelled out so the y ticks (offset
        # by half the bar height) and the white bars do not depend on
        # version-specific defaults.
        boxes = axes.barh(range(n_iter), [bar_width] * n_iter,
                          left=i * n_samples_per_fold, height=bar_height,
                          color=colors, hatch="//", edgecolor="k",
                          align="edge")
        # not selected has no hatch
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    # Extra row below the splits that shows the label of every sample.
    axes.barh([n_iter] * n_samples, [bar_width] * n_samples,
              left=np.arange(n_samples) * n_samples_per_fold,
              height=bar_height, color="w", edgecolor="k", align="edge")

    for i in range(n_samples):
        axes.text((i + .5) * n_samples_per_fold, n_iter + .5,
                  "%d" % labels[i], horizontalalignment="center")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    # Ticks sit at the vertical centre of the edge-aligned bars.
    axes.set_yticks(np.arange(n_iter + 1) + bar_height / 2)
    axes.set_yticklabels(
        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
    # The legend handles are the first two bars of the last sample column.
    # NOTE(review): this assumes that sample is a training point in split 1
    # and a test point in split 2 -- confirm if the labels are changed.
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"],
               loc=(1, .3))
    plt.tight_layout()
def plot_label_kfold():
    """Visualise the train/test assignment produced by ``LabelKFold``.

    Builds a 10x2 inch pyplot figure with one row per cross-validation
    split (three in total) and one bar per data point.  Grey bars mark test
    points and white bars mark training points; both are hatched.  The
    bottom row, tick-labelled "Group", prints the label of each data point.

    The function returns nothing and leaves the figure as the current
    pyplot figure.
    """
    from sklearn.model_selection import LabelKFold
    # NOTE(review): later scikit-learn releases appear to expose this
    # splitter under a different name -- confirm against the pinned version.

    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    n_samples = len(labels)
    n_iter = 3
    n_samples_per_fold = 1

    plt.figure(figsize=(10, 2))
    plt.title("LabelKFold")
    axes = plt.gca()
    axes.set_frame_on(False)

    # Keyword arguments shared by every barh call.  Alignment and edge
    # colour are explicit so the layout (y ticks at +.3, outlined white
    # bars) does not depend on version-specific matplotlib defaults.
    bar_style = dict(height=.6, edgecolor="k", align="edge")

    # Encode each split as a row: 1 marks training points, 2 test points,
    # and 0 (the initial value) anything the split did not select.
    cv = LabelKFold(n_splits=n_iter)
    mask = np.zeros((n_iter, n_samples))
    splits = cv.split(range(n_samples), labels=labels)
    for row, (train, test) in enumerate(splits):
        mask[row, train] = 1
        mask[row, test] = 2

    split_rows = range(n_iter)
    split_widths = [1 - 0.1] * n_iter
    for col in range(n_samples):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, col]]
        # Bar positions go in positionally: the keyword name of the first
        # barh argument is not the same in every matplotlib version.
        boxes = axes.barh(split_rows, split_widths,
                          left=col * n_samples_per_fold,
                          color=colors, hatch="//", **bar_style)
        # not selected has no hatch
        for j in np.where(mask[:, col] == 0)[0]:
            boxes[j].set_hatch("")

    # Row of plain white bars that hosts the per-sample label text.
    axes.barh([n_iter] * n_samples, [1 - 0.1] * n_samples,
              left=np.arange(n_samples) * n_samples_per_fold,
              color="w", **bar_style)
    for col, label in enumerate(labels):
        axes.text((col + .5) * n_samples_per_fold, 3.5, "%d" % label,
                  horizontalalignment="center")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(
        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
    # ``boxes`` still refers to the bars of the last data point here.
    # NOTE(review): the legend assumes that point is a training point in
    # split 1 and a test point in split 2 -- confirm if labels change.
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"],
               loc=(1, .3))
    plt.tight_layout()
def _check_label_kfold_partition(lkf, labels, n_folds):
    """Assert that ``lkf`` splits ``labels`` into balanced, disjoint folds.

    Parameters
    ----------
    lkf : splitter exposing ``split(X, y, labels)``
        Yields ``(train, test)`` index pairs.
    labels : ndarray
        One label per sample.
    n_folds : int
        Number of folds ``lkf`` was built with; used to compute the ideal
        fold size.

    Three properties are checked: every fold's size is within 5 percent of
    the sample count from the ideal size, every label lands in exactly one
    fold, and no label occurs on both sides of any split.
    """
    n_samples = len(labels)
    X = y = np.ones(n_samples)
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold.
    # DeprecationWarning is silenced here as the original test did for the
    # string labels; presumably the elementwise comparison can warn on some
    # numpy versions -- TODO confirm.  It is harmless for integer labels.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    object_labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, object_labels):
        assert_equal(
            len(np.intersect1d(object_labels[train], object_labels[test])),
            0)


def test_label_kfold():
    """Check LabelKFold on random integer labels and on string labels.

    The same splitter instance is run on 1000 samples with integer labels
    drawn from 15 values and on 38 samples labelled with first names.  A
    final check asserts that asking for more folds than distinct labels
    raises ``ValueError``.
    """
    rng = np.random.RandomState(0)
    n_folds = 5
    lkf = LabelKFold(n_folds=n_folds)

    # Integer labels: 1000 samples drawn from 15 possible labels.
    labels = rng.randint(0, 15, 1000)
    _check_label_kfold_partition(lkf, labels, n_folds)

    # String labels, some of them repeated.
    labels = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
                       'Silvia'])
    _check_label_kfold_partition(lkf, labels, n_folds)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, LabelKFold(n_folds=3).split(X, y, labels))
def test_label_kfold():
    """Check that LabelKFold yields balanced folds that never split a label.

    Two datasets go through the same checks: 1000 samples with random
    integer labels and 38 samples labelled with first names.  For each one
    the test asserts that fold sizes stay within the tolerance of the ideal
    size, that every label falls into exactly one fold, and that no label
    appears in both the train and test side of a split.  Finally it asserts
    that asking for more folds than labels raises ``ValueError``.
    """
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_samples = 1000
    n_folds = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, 15, n_samples)

    ideal_n_labels_per_fold = n_samples // n_folds

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = LabelKFold(n_folds=n_folds)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data.
    # This has to be an ndarray: ``labels == label`` below must be an
    # elementwise boolean mask, which a plain Python list does not give.
    labels = np.array([
        'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert',
        'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura',
        'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert',
        'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary',
        'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
        'Silvia',
    ])

    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, LabelKFold(n_folds=3).split(X, y, labels))