def test_random_state_uniform(nl):
    """Check that seeded, non-exact uniform label flipping is reproducible.

    Two calls to ``flip_labels_uniform`` with the same seed must agree, two
    ``UniformNoise`` simulators with the same seed must agree, and the
    simulator output must match the functional output.
    """
    seeded = dict(random_state=SEED, exact=False)

    # Functional API: same seed twice -> identical noisy labels.
    func_first = flip_labels_uniform(y, nl, **seeded)
    func_second = flip_labels_uniform(y, nl, **seeded)
    assert_array_equal(func_first, func_second)

    def _simulated_labels():
        # A fresh simulator per call, so each run starts from the same seed.
        _, noisy = UniformNoise(nl, **seeded).simulate_noise(X, y)
        return noisy

    # Estimator API: same seed twice -> identical noisy labels.
    sim_first = _simulated_labels()
    sim_second = _simulated_labels()
    assert_array_equal(sim_first, sim_second)

    # Both APIs must produce the same labels for the same seed.
    assert_array_equal(sim_first, func_first)
def test_uniform_exact(nl):
    """Check that ``exact=True`` flips exactly the fraction ``nl`` of labels.

    The check is made once through ``flip_labels_uniform`` and once through
    ``UniformNoise.simulate_noise``.
    """
    # Functional API.
    func_noisy = flip_labels_uniform(y, nl, exact=True)
    func_changed = (y != func_noisy).sum()
    assert func_changed / len(y) == nl

    # Estimator API.
    simulator = UniformNoise(nl, random_state=SEED, exact=True)
    _, sim_noisy = simulator.simulate_noise(X, y)
    sim_changed = (y != sim_noisy).sum()
    assert sim_changed / len(y) == nl
def test_stack_sim():
    """Smoke test: two noise simulators stacked before a classifier.

    Passes if fitting and predicting on the pipeline raise no exception.
    """
    steps = [
        ('a', UniformNoise(.3)),
        ('b', CCNoise(lcm)),
        ('c', DecisionTreeClassifier()),
    ]
    stacked = Pipeline(steps)
    stacked.fit(X, y)
    stacked.predict(X)
def test_example_two():
    """Run the second usage example end to end.

    Builds a scale -> feature-selection -> detector -> handler pipeline,
    wraps it in a grid search over the detector's ``n_neighbors``, prepends
    uniform label noise, and prints the mean 5-fold cross-validation score.
    """
    from skclean.simulate_noise import UniformNoise
    from skclean.detectors import KDN
    from skclean.handlers import Filter
    # Pipeline helpers are imported from skclean, not sklearn.
    from skclean.pipeline import Pipeline, make_pipeline
    from skclean.utils import load_data

    features, labels = load_data('breast_cancer')

    steps = [
        ('scale', StandardScaler()),           # Scale features
        ('feat_sel', VarianceThreshold(.2)),   # Feature selection
        ('detector', KDN()),                   # Detect mislabeled samples
        # Filter out likely mislabeled samples, then train an SVM
        ('handler', Filter(SVC())),
    ]
    base_clf = Pipeline(steps)

    param_grid = {'detector__n_neighbors': [2, 5, 10]}
    searched_clf = GridSearchCV(base_clf, param_grid)

    # Label noise is created at the very first step of the outer pipeline.
    noisy_clf = make_pipeline(UniformNoise(.3), searched_clf)

    # 5-fold cross validation
    fold_scores = cross_val_score(noisy_clf, features, labels, cv=5)
    print(fold_scores.mean())
('b', StandardScaler()), ('c', VarianceThreshold(.2)), ('d', KDN()), ('e', skclean.handlers.SampleWeight(dummy)) ])
# NOTE(review): the line above is the tail of a statement that begins before
# this excerpt; it is reproduced unchanged.

# Inside Pipeline
# Handlers used as a pipeline step: each non-iterative handler is cloned and
# its ``detector`` attribute cleared, then paired with every detector below.
tmp_Handlers = []
for h in NOISE_HANDLERS:
    if h.iterative:  # Exclude iterative handlers
        continue
    ch = clone(h)
    ch.detector = None
    tmp_Handlers.append(ch)

# Steps placed in front of every (detector, handler) pair: seeded label noise
# followed by feature scaling.
preli_steps = [UniformNoise(.2, random_state=SEED), StandardScaler()]
all_comb = product(NOISE_DETECTORS, tmp_Handlers)
# One pipeline per (detector, handler) combination.
INSIDE_PIPE = [make_pipeline(*preli_steps + list(comb)) for comb in all_comb]

# Outside Pipe
# Every handler (iterative ones included) is cloned and given a cloned
# detector directly through its ``detector`` attribute.
OUTSIDE_PIPE = []
for h in NOISE_HANDLERS:
    for d in NOISE_DETECTORS:
        ch, d = clone(h), clone(d)
        ch.detector = d
        if 'random_state' in ch.get_params():  # trying to avoid flaky tests
            ch.set_params(random_state=42)
        OUTSIDE_PIPE.append(ch)

# Full set of estimators: pipeline-based and detector-attribute-based.
ALL_COMBS = INSIDE_PIPE + OUTSIDE_PIPE
def test_pipe_cv():
    """Cross-validate a noise -> scale -> detect -> filter pipeline.

    ``error_score='raise'`` makes any failure inside a fold surface as an
    exception instead of being recorded as a score. The scores are printed.
    """
    steps = [
        ('a', UniformNoise(.3)),
        ('p', StandardScaler()),
        ('d', KDN(n_neighbors=5)),
        ('c', Filter(DecisionTreeClassifier())),
    ]
    pipe = Pipeline(steps)
    scores = cross_val_score(pipe, X, y, cv=CV1, error_score='raise')
    print(scores)
def test_detector_init():
    """Smoke test: a detector built with constructor arguments in a pipeline.

    Passes if fitting and predicting raise no exception.
    """
    steps = [
        ('a', UniformNoise(.3)),
        ('p', StandardScaler()),
        ('d', RandomForestDetector(n_estimators=50)),
        ('c', Filter(DecisionTreeClassifier())),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)
    pipe.predict(X)
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit

from skclean.detectors.neighbors import KDN
from skclean.detectors.ensemble import RandomForestDetector
from skclean.handlers.filters import Filter
from skclean.pipeline import Pipeline, make_pipeline
from skclean.simulate_noise import UniformNoise, CCNoise

# Module-level fixtures shared by the tests below.
X, y = load_iris(return_X_y=True)
# 3x3 matrix handed to CCNoise; presumably per-class label transition
# probabilities -- TODO confirm against the CCNoise documentation.
lcm = [[.7, .1, .2], [.3, .6, .1], [.1, .21, .69]]
# Single shuffled train/test split with 23% of the samples held out.
CV1 = ShuffleSplit(n_splits=1, test_size=.23)


@pytest.mark.parametrize('sim', [UniformNoise(.3), CCNoise(lcm)])
def test_noise_sim(sim):
    """Smoke test: a noise simulator as the first step of a pipeline.

    The simulator is used once through an explicit ``Pipeline`` and once
    through ``make_pipeline``; the test passes if fit and predict raise
    no exception.
    """
    # Explicitly named steps.
    p = Pipeline([('s', sim), ('c', DecisionTreeClassifier())])
    p.fit(X, y)
    p.predict(X)

    # Same steps through the make_pipeline helper.
    p = make_pipeline(sim, DecisionTreeClassifier())
    p.fit(X, y)
    p.predict(X)


def test_stack_sim():
    p = Pipeline([
        ('a', UniformNoise(.3)),
        ('b', CCNoise(lcm)),
        ('c', DecisionTreeClassifier()),