def test_missing_predict_proba():
    # A base estimator built with probability=False must make fit fail with
    # an AttributeError mentioning predict_proba.
    # NOTE(review): a function with this same name is defined again right
    # after this one in the file, so this definition ends up shadowed.
    svc_without_proba = SVC(gamma="scale", probability=False)
    clf = SelfTrainingClassifier(svc_without_proba)

    expected_message = "predict_proba is not available"
    with pytest.raises(AttributeError, match=expected_message):
        clf.fit(X_train, y_train_missing_labels)
def test_missing_predict_proba():
    # A base estimator built with probability=False must make fit fail with
    # a ValueError naming the offending estimator.
    # NOTE(review): an earlier function in this file carries the same name;
    # this later definition is the one that stays bound to it.
    svc_without_proba = SVC(gamma='scale', probability=False)
    clf = SelfTrainingClassifier(svc_without_proba)

    expected_message = r"base_estimator \(SVC\) should"
    with pytest.raises(ValueError, match=expected_message):
        clf.fit(X_train, y_train_missing_labels)
def test_base_estimator_meta_estimator():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/19119
    # A stacking meta-estimator is used as base estimator twice: once built
    # from SVCs with probability=True (fit and predict_proba must work) and
    # once with probability=False (fit must raise AttributeError).

    def _stacked_svc(probability):
        # Two stacked SVCs plus an SVC final estimator, all sharing the same
        # `probability` flag.
        return StackingClassifier(
            estimators=[
                ("svc_1", SVC(probability=probability)),
                ("svc_2", SVC(probability=probability)),
            ],
            final_estimator=SVC(probability=probability),
            cv=2,
        )

    with_proba = _stacked_svc(True)
    assert hasattr(with_proba, "predict_proba")
    st_with_proba = SelfTrainingClassifier(base_estimator=with_proba)
    st_with_proba.fit(X_train, y_train_missing_labels)
    st_with_proba.predict_proba(X_test)

    without_proba = _stacked_svc(False)
    assert not hasattr(without_proba, "predict_proba")
    st_without_proba = SelfTrainingClassifier(base_estimator=without_proba)
    with pytest.raises(AttributeError):
        st_without_proba.fit(X_train, y_train_missing_labels)
def test_warns_k_best():
    # Fitting with k_best=1000 must emit a UserWarning about k_best being too
    # large, and the fit must still terminate with "all_labeled".
    knn = KNeighborsClassifier()
    clf = SelfTrainingClassifier(knn, k_best=1000, criterion="k_best")

    expected_warning = "k_best is larger than"
    with pytest.warns(UserWarning, match=expected_warning):
        clf.fit(X_train, y_train_missing_labels)

    assert clf.termination_condition_ == "all_labeled"
Ejemplo n.º 5
0
def test_none_iter():
    # With max_iter=None and threshold=0.55, fitting must finish with every
    # sample labeled in fewer than 10 iterations.
    knn = KNeighborsClassifier()
    clf = SelfTrainingClassifier(knn, max_iter=None, threshold=0.55)
    clf.fit(X_train, y_train_missing_labels)

    assert clf.n_iter_ < 10
    assert clf.termination_condition_ == "all_labeled"
def test_strings_dtype():
    # Fitting on string-valued labels built from blob cluster ids must raise
    # a ValueError whose message mentions "dtype".
    X, cluster_ids = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    label_names = np.array(["one", "two", "three"])

    # Map each integer cluster id to its string name.
    string_labels = label_names[cluster_ids]

    clf = SelfTrainingClassifier(KNeighborsClassifier())
    with pytest.raises(ValueError, match="dtype"):
        clf.fit(X, string_labels)
def test_early_stopping():
    # Four 1-D points: the first two are labeled (1 and 0), the last two are
    # marked unlabeled with -1. The point at 0.5 cannot be predicted with
    # high confidence, so training is expected to stop after one iteration
    # with the "no_change" termination condition.
    features = [[1], [0], [1], [0.5]]
    targets = [1, 0, -1, -1]

    clf = SelfTrainingClassifier(SVC(probability=True, gamma="scale"))
    clf.fit(features, targets)

    assert clf.n_iter_ == 1
    assert clf.termination_condition_ == "no_change"
def test_prefitted_throws_error():
    # Wrapping an already fitted classifier does not make the wrapper itself
    # fitted: calling predict before fit must raise NotFittedError.
    fitted_knn = KNeighborsClassifier()
    fitted_knn.fit(X_train, y_train)
    clf = SelfTrainingClassifier(fitted_knn)

    expected_message = "This SelfTrainingClassifier instance is not fitted yet"
    with pytest.raises(NotFittedError, match=expected_message):
        clf.predict(X_train)
def test_verbose(capsys, verbose):
    # The word "iteration" must appear on stdout exactly when `verbose` is
    # truthy.
    # NOTE(review): `verbose` is presumably supplied by a parametrization or
    # fixture that is not visible here - confirm.
    model = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose)
    model.fit(X_train, y_train_missing_labels)

    stdout_text = capsys.readouterr().out
    printed_progress = "iteration" in stdout_text

    assert printed_progress == bool(verbose)
def test_labeled_iter(max_iter):
    # The number of samples recorded with labeled_iter_ == 0 must match the
    # number of labeled samples that were passed in.
    # NOTE(review): `max_iter` is presumably supplied by a parametrization
    # that is not visible here - confirm.
    clf = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter)
    clf.fit(X_train, y_train_missing_labels)

    labeled_at_start = clf.labeled_iter_[clf.labeled_iter_ == 0]
    assert len(labeled_at_start) == n_labeled_samples

    # No sample may be labeled in an iteration later than the number of
    # iterations actually run, which in turn must not exceed max_iter.
    last_labeling_iter = np.max(clf.labeled_iter_)
    assert last_labeling_iter <= clf.n_iter_ <= max_iter
def test_no_unlabeled():
    # With a fully labeled training set, self-training must warn and then
    # predict exactly like the plain classifier.
    plain_knn = KNeighborsClassifier()
    plain_knn.fit(X_train, y_train)
    clf = SelfTrainingClassifier(plain_knn)

    expected_warning = "y contains no unlabeled samples"
    with pytest.warns(UserWarning, match=expected_warning):
        clf.fit(X_train, y_train)

    expected_pred = plain_knn.predict(X_test)
    assert_array_equal(expected_pred, clf.predict(X_test))

    # Nothing was unlabeled, so every sample is recorded as labeled in
    # iteration 0.
    assert np.all(clf.labeled_iter_ == 0)
    assert clf.termination_condition_ == "all_labeled"
Ejemplo n.º 12
0
def test_zero_iterations(base_estimator, y):
    # With max_iter=0 the self-training wrapper must predict the same as the
    # base estimator fitted on the first n_labeled_samples rows only.
    # NOTE(review): `base_estimator` and `y` are presumably supplied by a
    # parametrization that is not visible here - confirm.
    n_labeled = n_labeled_samples

    self_trained = SelfTrainingClassifier(base_estimator, max_iter=0)
    self_trained.fit(X_train, y)

    supervised = base_estimator.fit(X_train[:n_labeled], y[:n_labeled])

    expected_pred = supervised.predict(X_test)
    assert_array_equal(self_trained.predict(X_test), expected_pred)
    assert self_trained.termination_condition_ == "max_iter"
def test_k_best_selects_best():
    # After a single k_best iteration, the 10 samples the SVC is most
    # confident about must be among those that self-training labeled.
    base = SVC(gamma="scale", probability=True, random_state=0)
    clf = SelfTrainingClassifier(base, criterion="k_best", max_iter=1, k_best=10)
    initially_labeled = y_train_missing_labels != -1
    initially_unlabeled = ~initially_labeled
    clf.fit(X_train, y_train_missing_labels)

    newly_labeled = initially_unlabeled & (clf.transduction_ != -1)

    # Refit the SVC on the originally labeled samples and rank the unlabeled
    # ones by their highest class probability.
    base.fit(X_train[initially_labeled], y_train_missing_labels[initially_labeled])
    proba = base.predict_proba(X_train[initially_unlabeled])
    confidence = np.max(proba, axis=1)
    top_ten = np.argsort(confidence)[-10:]

    expected_rows = X_train[initially_unlabeled][top_ten].tolist()
    labeled_rows = X_train[newly_labeled].tolist()

    for expected_row in expected_rows:
        assert expected_row in labeled_rows
Ejemplo n.º 14
0
    def plot_varying_threshold(self, base_classifier, X_train, y_train):
        """
        Plot the effect of varying threshold for self-training

        For every threshold tried, a fresh SelfTrainingClassifier is fitted
        and two quantities are recorded: how many samples carry a label by
        the end of fit, and the last iteration in which a sample was labeled.

        Parameters
        ___________
        base_classifier: Supervised classifier implementing both fit and predict_proba
        X_train: Scaled feature matrix of the training set
        y_train: Class label of the training set

        Returns
        _____________
        None. The figure is displayed with plt.show().
        """
        x_values = np.arange(0.4, 1.05, 0.05)
        x_values = np.append(x_values, 0.99999)
        no_labeled = np.zeros(x_values.shape[0])
        no_iterations = np.zeros(x_values.shape[0])

        for (i, threshold) in enumerate(x_values):

            # Fit model with chosen base classifier
            self_training_clf = SelfTrainingClassifier(base_classifier,threshold=threshold)
            self_training_clf.fit(X_train, y_train)
            labeled_iter = self_training_clf.labeled_iter_

            # The number of labeled samples that the classifier has available
            # by the end of fit. Entries equal to -1 were never labeled.
            # Counting the non -1 entries directly stays correct when every
            # sample got a label; taking the first bucket of np.unique would
            # then pick the iteration-0 bucket instead of the -1 bucket.
            no_labeled[i] = np.count_nonzero(labeled_iter != -1)

            # The last iteration the classifier labeled a sample in
            no_iterations[i] = np.max(labeled_iter)

        # Plot figures
        plt.rcParams.update({'font.size': 15})
        fig, (ax1, ax2) = plt.subplots(1,2, figsize = (15,4))

        ax1.plot(x_values, no_labeled, color='b')
        ax1.set_xlabel('Threshold')
        ax1.set_ylabel('Number of labeled samples')
        ax2.plot(x_values, no_iterations, color='b')
        ax2.set_ylabel('Number of iterations')
        ax2.set_xlabel('Threshold')
        plt.show()
def test_invalid_params(max_iter, threshold):
    # An invalid max_iter and an invalid threshold must each make fit raise
    # a ValueError with the matching message.
    # NOTE(review): both arguments are presumably supplied by a
    # parametrization that is not visible here - confirm.
    def _new_svc():
        return SVC(gamma="scale", probability=True)

    bad_max_iter = SelfTrainingClassifier(_new_svc(), max_iter=max_iter)
    with pytest.raises(ValueError, match="max_iter must be >= 0 or None"):
        bad_max_iter.fit(X_train, y_train)

    bad_threshold = SelfTrainingClassifier(_new_svc(), threshold=threshold)
    with pytest.raises(ValueError, match="threshold must be in"):
        bad_threshold.fit(X_train, y_train)
def test_k_best():
    # Start from a single labeled sample and add k_best=10 labels per
    # iteration until everything is labeled, then check the iteration count
    # and the per-iteration bookkeeping in labeled_iter_.
    st = SelfTrainingClassifier(KNeighborsClassifier(n_neighbors=1),
                                criterion='k_best',
                                k_best=10,
                                max_iter=None)
    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_samples = y_train.shape[0]

    n_expected_iter = ceil((n_samples - 1) / 10)
    # The last iteration labels whatever is left over. When the number of
    # unlabeled samples is an exact multiple of 10 that is a full batch of
    # 10, not 0, so a zero remainder is mapped to 10.
    n_expected_last = (n_samples - 1) % 10 or 10
    st.fit(X_train, y_train_only_one_label)
    assert st.n_iter_ == n_expected_iter

    # Check labeled_iter_
    assert np.sum(st.labeled_iter_ == 0) == 1
    for i in range(1, n_expected_iter):
        assert np.sum(st.labeled_iter_ == i) == 10
    assert np.sum(st.labeled_iter_ == n_expected_iter) == n_expected_last
    assert st.termination_condition_ == 'all_labeled'
def test_verbose_k_best(capsys):
    # With verbose=True and k_best=10, every iteration must print how many
    # labels it added: 10 for each full iteration and the leftover count for
    # the last one.
    st = SelfTrainingClassifier(KNeighborsClassifier(n_neighbors=1),
                                criterion='k_best',
                                k_best=10,
                                verbose=True,
                                max_iter=None)

    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_samples = y_train.shape[0]

    n_expected_iter = ceil((n_samples - 1) / 10)
    # When the number of unlabeled samples is an exact multiple of 10, the
    # last iteration adds a full batch of 10 rather than 0 labels.
    n_expected_last = (n_samples - 1) % 10 or 10
    st.fit(X_train, y_train_only_one_label)

    captured = capsys.readouterr()

    msg = 'End of iteration {}, added {} new labels.'
    for i in range(1, n_expected_iter):
        assert msg.format(i, 10) in captured.out

    assert msg.format(n_expected_iter, n_expected_last) in captured.out
Ejemplo n.º 18
0
    def self_training_clf(self, base_classifier, X_train, y_train, 
                            threshold= None, max_iter = None,verbose = None):
        """
        Train self-training classifier from scikit-learn >= 0.24.1

        Parameters
        ___________
        base_classifier: Supervised classifier implementing both fit and predict_proba
        X_train: Scaled feature matrix of the training set
        y_train: Class label of the training set
        threshold (float):  The decision threshold for use with criterion='threshold'. Should be in [0, 1).
            When None, the estimator's own default is used.
        max_iter (int):  Maximum number of iterations allowed. Should be greater than or equal to 0.
            None is forwarded unchanged to the estimator.
        verbose (bool): Enable verbose output. When None, the estimator's own default is used.

        Returns
        _____________
        Predicted labels and probability
        """
        # threshold and verbose are forwarded only when the caller supplied
        # them: None is not a valid threshold, so passing the default through
        # verbatim would make fit fail. max_iter is always forwarded because
        # None is a meaningful value for it.
        optional_params = {}
        if threshold is not None:
            optional_params["threshold"] = threshold
        if verbose is not None:
            optional_params["verbose"] = verbose

        # Self training model
        model = SelfTrainingClassifier(base_classifier, max_iter=max_iter,
                                       **optional_params)

        # Fit the training set
        model.fit(X_train, y_train)

        # Predict the labels of every row of the training set
        predicted_labels = model.predict(X_train)

        # Predict probability
        predicted_proba = model.predict_proba(X_train)
        return predicted_labels, predicted_proba
Ejemplo n.º 19
0
def test_sanity_classification():
    # Self-training must predict differently from, and score better than,
    # the plain SVC fitted on the rows from n_labeled_samples onwards.
    supervised = SVC(gamma="scale", probability=True)
    supervised.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    self_trained = SelfTrainingClassifier(supervised)
    self_trained.fit(X_train, y_train_missing_labels)

    pred_supervised = supervised.predict(X_test)
    pred_self_trained = self_trained.predict(X_test)
    assert not np.array_equal(pred_supervised, pred_self_trained)

    score_supervised = accuracy_score(supervised.predict(X_test), y_test)
    score_self_training = accuracy_score(self_trained.predict(X_test), y_test)
    assert score_self_training > score_supervised
Ejemplo n.º 20
0
def self_training(x_train_all,
                  y_train_all,
                  cv_semisupervised,
                  base_model,
                  name="SelfTrainingClassifier",
                  k_best=100,
                  max_iter=None,
                  only_model=False,
                  **kwargs):
    """ Fit a self-training (semi-supervised) model and optionally evaluate it.
    Parameters:
        x_train_all (pd.DataFrame): features of both the labelled and the unlabelled data.
        y_train_all (pd.Series): labels of the labelled and unlabelled data. Unlabelled data must have label -1.
        cv_semisupervised (list): list of (train, test) index tuples, one per fold.
        base_model (model): model that has a fit function.
        name (str): name/description for the model, passed on to the metrics helper.
        k_best (int): forwarded to SelfTrainingClassifier as ``k_best``.
        max_iter (int or None): forwarded to SelfTrainingClassifier as ``max_iter``.
        only_model (bool): if True, return only the fitted model.
        **kwargs: accepted but not used by this function.
    Returns:
        The fitted model when ``only_model`` is True, otherwise whatever
        ``calculate_metrics_cv`` returns for it.
    """
    # TODO cv: use the same cv split but randomly assign the other unlabelled data pieces to the other cv folds
    # NOTE(review): no ``criterion`` is passed, so ``k_best`` presumably has
    # no effect unless the estimator's default criterion uses it - confirm.
    classifier = SelfTrainingClassifier(base_model,
                                        k_best=k_best,
                                        max_iter=max_iter,
                                        verbose=True)
    st_model = classifier.fit(x_train_all, y_train_all)

    if not only_model:
        return calculate_metrics_cv(model=st_model,
                                    X=x_train_all,
                                    y_true=y_train_all,
                                    cv=cv_semisupervised,
                                    name=name)

    return st_model
Ejemplo n.º 21
0
                  skip_methods=["transform", "inverse_transform"]),
    DelegatorData(
        "BaggingClassifier",
        BaggingClassifier,
        skip_methods=[
            "transform",
            "inverse_transform",
            "score",
            "predict_proba",
            "predict_log_proba",
            "predict",
        ],
    ),
    DelegatorData(
        "SelfTrainingClassifier",
        lambda est: SelfTrainingClassifier(est),
        skip_methods=["transform", "inverse_transform", "predict_proba"],
    ),
]


def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    # NOTE(review): only the nested `hides` helper is visible here; the rest
    # of this test appears to be cut off in this view.
    def hides(method):
        # Turn `method` into a property that can be hidden per instance:
        # reading it raises AttributeError when the instance's
        # `hidden_method` equals the method's name, and otherwise returns
        # the method bound to the instance via functools.partial.
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError("%r is hidden" % obj.hidden_method)
            return functools.partial(method, obj)

        return wrapper
# classifier that implements :term:`predict_proba`. The sub-classifier
# will behave as a
# semi-supervised classifier, allowing it to learn from unlabeled data.
# Read more in the :ref:`User guide <self_training>`.

import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC

# Seeded generator so the choice of unlabeled points is reproducible.
rng = np.random.RandomState(42)
iris = datasets.load_iris()
# Boolean mask with one entry per target; an entry is True when its uniform
# random draw falls below 0.3.
random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
# Mark the selected targets as unlabeled (-1). This modifies iris.target in
# place.
iris.target[random_unlabeled_points] = -1
# probability=True is requested on the SVC that self-training wraps.
svc = SVC(probability=True, gamma="auto")
self_training_model = SelfTrainingClassifier(svc)
self_training_model.fit(iris.data, iris.target)

##############################################################################
# New SequentialFeatureSelector transformer
# -----------------------------------------
# A new iterative transformer to select features is available:
# :class:`~sklearn.feature_selection.SequentialFeatureSelector`.
# Sequential Feature Selection can add features one at a time (forward
# selection) or remove features from the list of the available features
# (backward selection), based on a cross-validated score maximization.
# See the :ref:`User Guide <sequential_feature_selection>`.

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
Ejemplo n.º 23
0
# `X`, `y` and the mesh step `h` are defined outside the lines visible here.
# One random draw per sample; the same draws are reused for both masks below.
rng = np.random.RandomState(0)
y_rand = rng.rand(y.shape[0])
y_30 = np.copy(y)
y_30[y_rand < 0.3] = -1  # set random samples to be unlabeled
y_50 = np.copy(y)
y_50[y_rand < 0.5] = -1  # same, using the 0.5 cut-off
# Fit LabelSpreading on each labeling. Every entry below is a tuple of
# (fitted model, labels it was fitted on, plot title). The data is not
# scaled since we want to plot the support vectors.
ls30 = (LabelSpreading().fit(X, y_30), y_30, "Label Spreading 30% data")
ls50 = (LabelSpreading().fit(X, y_50), y_50, "Label Spreading 50% data")
ls100 = (LabelSpreading().fit(X, y), y, "Label Spreading 100% data")

# the base classifier for self-training uses the same kernel and gamma as
# the plain SVC below, with probability=True added
base_classifier = SVC(kernel="rbf", gamma=0.5, probability=True)
st30 = (
    SelfTrainingClassifier(base_classifier).fit(X, y_30),
    y_30,
    "Self-training 30% data",
)
st50 = (
    SelfTrainingClassifier(base_classifier).fit(X, y_50),
    y_50,
    "Self-training 50% data",
)

# Fully supervised reference fitted on the unmodified labels.
rbf_svc = (SVC(kernel="rbf", gamma=0.5).fit(X, y), y, "SVC with rbf kernel")

# create a mesh to plot in: the range of the first two feature columns,
# padded by 1 on each side and sampled with step h
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Ejemplo n.º 24
0
    SelfTrainingClassifier
)
from sklearn.svm import SVC

from main import plot_decision_boundary, get_data


if __name__ == '__main__':
    # Compare three semi-supervised models: each is fitted on the training
    # data plus the test features (whose labels are replaced by -1), then
    # scored on how well it recovers the hidden test labels.
    X_train, X_test, y_train, y_test = get_data()

    hidden_test_labels = -1 * np.ones_like(y_test)
    X = np.concatenate([X_train, X_test], axis=0)
    y = np.concatenate([y_train, hidden_test_labels], axis=0)

    models = (
        LabelPropagation(max_iter=10000),
        LabelSpreading(),
        SelfTrainingClassifier(base_estimator=SVC(probability=True, gamma="auto")),
    )
    color_maps = ('Blues', 'Greens', 'Reds')

    for model, cmap in zip(models, color_maps):
        model.fit(X, y)
        # Predict only for the rows whose label is -1.
        y_pred = model.predict(X[y == -1])

        matrix = confusion_matrix(y_test, y_pred)
        sns.heatmap(matrix, annot=True, fmt="d", cmap=cmap)

        accuracy = accuracy_score(y_test, y_pred)
        report = f'\nModel name: {str(model)}\nAccuracy_score: {accuracy}'
        print('-' * 50, report)
        plt.show()

        plot_decision_boundary(X, y, y_test, y_pred, model)
Ejemplo n.º 25
0
# Load and shuffle the data with a fixed seed.
X, y = datasets.load_breast_cancer(return_X_y=True)
X, y = shuffle(X, y, random_state=42)
# Keep the true labels for scoring; everything from index 50 onwards is then
# marked as unlabeled (-1) in y.
y_true = y.copy()
y[50:] = -1
total_samples = y.shape[0]

base_classifier = SVC(probability=True, gamma=0.001, random_state=42)

# Thresholds to try: values from np.arange starting at 0.4 with step 0.05,
# plus 0.99999 appended at the end.
x_values = np.arange(0.4, 1.05, 0.05)
x_values = np.append(x_values, 0.99999)
# One row per threshold, one column per fold. np.empty leaves the contents
# uninitialised until they are filled in.
# `n_splits` is defined outside the lines visible here.
scores = np.empty((x_values.shape[0], n_splits))
amount_labeled = np.empty((x_values.shape[0], n_splits))
amount_iterations = np.empty((x_values.shape[0], n_splits))
for (i, threshold) in enumerate(x_values):
    self_training_clf = SelfTrainingClassifier(base_classifier,
                                               threshold=threshold)

    # We need manual cross validation so that we don't treat -1 as a separate
    # class when computing accuracy
    skfolds = StratifiedKFold(n_splits=n_splits)
    for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        y_test_true = y_true[test_index]

        self_training_clf.fit(X_train, y_train)

        # The amount of labeled samples that at the end of fitting
        amount_labeled[i, fold] = (total_samples - np.unique(
# `X`, `y` and the mesh step `h` are defined outside the lines visible here.
# One random draw per sample; the same draws are reused for both masks below.
rng = np.random.RandomState(0)
y_rand = rng.rand(y.shape[0])
y_30 = np.copy(y)
y_30[y_rand < 0.3] = -1  # set random samples to be unlabeled
y_50 = np.copy(y)
y_50[y_rand < 0.5] = -1  # same, using the 0.5 cut-off
# Fit LabelSpreading on each labeling. Every entry below is a tuple of
# (fitted model, labels it was fitted on, plot title). The data is not
# scaled since we want to plot the support vectors.
ls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data')
ls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data')
ls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data')

# the base classifier for self-training uses the same kernel and gamma as
# the plain SVC below, with probability=True added
base_classifier = SVC(kernel='rbf', gamma=.5, probability=True)
st30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30), y_30,
        'Self-training 30% data')
st50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50), y_50,
        'Self-training 50% data')

# Fully supervised reference fitted on the unmodified labels.
rbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel')

# create a mesh to plot in: the range of the first two feature columns,
# padded by 1 on each side and sampled with step h
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# RGB colour per label value; -1 (unlabeled) maps to white.
color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}

# Order in which the fitted models are iterated over below.
classifiers = (ls30, st30, ls50, st50, ls100, rbf_svc)
for i, (clf, y_train, title) in enumerate(classifiers):
def test_classification(base_estimator, selection_crit):
    # Fit the same configuration once on numeric labels and once on string
    # labels, then check that both agree and that the fitted attributes are
    # mutually consistent.
    # NOTE(review): both arguments are presumably supplied by a
    # parametrization that is not visible here - confirm.
    threshold = 0.75
    max_iter = 10

    def _fit_and_predict(labels):
        # Build, fit and query one classifier with the shared settings.
        model = SelfTrainingClassifier(base_estimator,
                                       criterion=selection_crit,
                                       threshold=threshold,
                                       max_iter=max_iter)
        model.fit(X_train, labels)
        return model, model.predict(X_test), model.predict_proba(X_test)

    st, pred, proba = _fit_and_predict(y_train_missing_labels)
    st_string, pred_string, proba_string = _fit_and_predict(
        y_train_missing_strings)

    # Numeric predictions, translated through `mapping`, must equal the
    # string predictions; the probabilities must be identical.
    assert_array_equal(np.vectorize(mapping.get)(pred), pred_string)
    assert_array_equal(proba, proba_string)

    assert st.termination_condition_ == st_string.termination_condition_

    # Originally labeled samples are exactly those with labeled_iter_ == 0,
    # and they keep their label during training.
    labeled = y_train_missing_labels != -1
    assert_array_equal(st.labeled_iter_ == 0, labeled)
    assert_array_equal(y_train_missing_labels[labeled],
                       st.transduction_[labeled])

    # For both models: no sample is labeled later than the last iteration
    # run, the iteration count respects max_iter, and labeled_iter_ has the
    # same shape as transduction_.
    for fitted in (st, st_string):
        assert np.max(fitted.labeled_iter_) <= fitted.n_iter_ <= max_iter
    for fitted in (st, st_string):
        assert fitted.labeled_iter_.shape == fitted.transduction_.shape
Ejemplo n.º 28
0
# Parameters
# NOTE(review): newer scikit-learn releases appear to spell this loss
# "log_loss" - confirm against the installed version before changing it.
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(**sdg_params)),
])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    # LabelSpreading does not support sparse matrices, so the features are
    # converted to a dense array first. toarray() returns a plain ndarray;
    # todense() would return a numpy.matrix, which recent scikit-learn
    # versions reject. The step keeps its original name.
    ('todense', FunctionTransformer(lambda x: x.toarray())),
    ('clf', LabelSpreading()),
])


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    # Print the size of the training set and how many of its labels are -1.
    # Only X_train and y_train are read in the lines visible here.
    n_train = len(X_train)
    print("Number of training samples:", n_train)

    unlabeled = [label for label in y_train if label == -1]
    print("Unlabeled samples in training set:", len(unlabeled))
def test_invalid_params_selection_crit():
    # An unknown criterion name must make fit raise a ValueError.
    knn = KNeighborsClassifier()
    clf = SelfTrainingClassifier(knn, criterion="foo")

    expected_message = "criterion must be either"
    with pytest.raises(ValueError, match=expected_message):
        clf.fit(X_train, y_train)
def test_none_classifier():
    # Passing None as the base estimator must make fit raise a ValueError.
    clf = SelfTrainingClassifier(None)

    expected_message = "base_estimator cannot be None"
    with pytest.raises(ValueError, match=expected_message):
        clf.fit(X_train, y_train_missing_labels)