def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sample
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)
    assert_array_almost_equal(y_trans, y_trans2)
    assert_array_almost_equal(y_trans, y_trans3)

    pca = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus)])
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(y_trans, y_trans2)
def test_predict_with_predict_params(): # tests that Pipeline passes predict_params to the final estimator # when predict is invoked pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())]) pipe.fit(None, None) pipe.predict(X=None, got_attribute=True) assert pipe.named_steps['clf'].got_attribute
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises(TypeError, Pipeline, [('svc', IncorrectT)])
    # Smoke test with only an estimator
    clf = T()
    pipe = Pipeline([('svc', clf)])
    assert_equal(pipe.get_params(deep=True),
                 dict(svc__a=None, svc__b=None, svc=clf,
                      **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())]) pipe.fit_predict( X=None, y=None, transf__should_get_this=True, clf__should_succeed=True) assert pipe.named_steps['transf'].fit_params['should_get_this'] assert pipe.named_steps['clf'].successful assert 'should_succeed' not in pipe.named_steps['transf'].fit_params
def test_pipeline_raise_set_params_error():
    # Test that pipeline raises an informative error message when
    # set_params is given invalid (including nested) parameter names.
    pipe = Pipeline([('cls', LinearRegression())])
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake='nope')

    # nested model check
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake__estimator='nope')
def test_pipeline_fit_params(): # Test that the pipeline can take fit parameters pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())]) pipe.fit(X=None, y=None, clf__should_succeed=True) # classifier should return True assert_true(pipe.predict(None)) # and transformer params should not be changed assert_true(pipe.named_steps['transf'].a is None) assert_true(pipe.named_steps['transf'].b is None)
def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). iris = load_iris() X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2) clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert_equal(predict.shape, (n_samples,)) proba = pipe.predict_proba(X) assert_equal(proba.shape, (n_samples, n_classes)) log_proba = pipe.predict_log_proba(X) assert_equal(log_proba.shape, (n_samples, n_classes)) decision_function = pipe.decision_function(X) assert_equal(decision_function.shape, (n_samples, n_classes)) pipe.score(X, y)
def test_pipeline_fit_transform(): # Test whether pipeline works with a transformer missing fit_transform iris = load_iris() X = iris.data y = iris.target transft = TransfT() pipeline = Pipeline([('mock', transft)]) # test fit_transform: X_trans = pipeline.fit_transform(X, y) X_trans2 = transft.fit(X, y).transform(X) assert_array_almost_equal(X_trans, X_trans2)
def test_pipeline_wrong_memory(): # Test that an error is raised when memory is not a string or a Memory # instance iris = load_iris() X = iris.data y = iris.target # Define memory as an integer memory = 1 cached_pipe = Pipeline( [('transf', DummyTransf()), ('svc', SVC(gamma='scale'))], memory=memory) error_regex = ("string or have the same interface as") with raises(ValueError, match=error_regex): cached_pipe.fit(X, y)
def test_pipeline_wrong_memory(): # Test that an error is raised when memory is not a string or a Memory # instance iris = load_iris() X = iris.data y = iris.target # Define memory as an integer memory = 1 cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], memory=memory) error_regex = ("'memory' should either be a string or a joblib.Memory" " instance, got 'memory=1' instead.") with raises(ValueError, match=error_regex): cached_pipe.fit(X, y)
def test_pipeline_sample_transform():
    # Test whether pipeline works with a sampler in the middle and a
    # transformer at the end. Also test pipeline.transform.
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pca = PCA()
    pca2 = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus), ('pca2', pca2)])
    pipeline.fit(X, y).transform(X)
def test_fit_predict_on_pipeline(): # test that the fit_predict method is implemented on a pipeline # test that the fit_predict on pipeline yields same results as applying # transform and clustering steps separately iris = load_iris() scaler = StandardScaler() km = KMeans(random_state=0) # first compute the transform and clustering step separately scaled = scaler.fit_transform(iris.data) separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step pipe = Pipeline([('scaler', scaler), ('Kmeans', km)]) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred)
def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. # Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data pca = PCA(n_components=2) pipeline = Pipeline([('pca', pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2)
def illegal_generalization_checking(self, X_test, y_test):
    X = self.df[self.features]
    X_test = X_test[self.features]
    Y = self.df[self.target]

    pipe = Pipeline(steps=[('classifier',
                            XGBClassifier(n_estimators=1000,
                                          scale_pos_weight=3,
                                          reg_alpha=1))])

    y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))

    scores = cross_val_score(pipe, X, Y, scoring='precision',
                             cv=StratifiedKFold(5))
    print(self.features)
    print("cross val scores")
    print(sum(scores) / 5)

    pipe.fit(X, Y.values)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_sample_weight_supported(): # Pipeline should pass sample_weight X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, y=None) == 3 assert pipe.score(X, y=None, sample_weight=None) == 3 assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
def test_pipeline_sample_weight_unsupported(): # When sample_weight is None it shouldn't be passed X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', Mult())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 with raises(TypeError, match="unexpected keyword argument"): pipe.score(X, sample_weight=np.array([2, 3]))
def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sample
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(X_trans, X_trans3, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans3, rtol=R_TOL)

    pca = PCA()
    pipeline = Pipeline([('pca', PCA()), ('rus', rus)])
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    # We round the values near zero. It seems that PCA has some issue
    # with that.
    X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0
    X_trans2[np.bitwise_and(X_trans2 < R_TOL, X_trans2 > -R_TOL)] = 0
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
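# Note: sample/fit_sample above reflect the old imbalanced-learn API; since
# version 0.4 the equivalent entry point is fit_resample. A minimal sketch
# of the modern call on the same kind of data:
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=5000, random_state=0)
rus = RandomUnderSampler(random_state=0)
X_res, y_res = rus.fit_resample(X, y)  # modern equivalent of fit_sample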
def test_pipeline_fit_params(): # Test that the pipeline can take fit parameters pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) pipe.fit(X=None, y=None, clf__should_succeed=True) # classifier should return True assert pipe.predict(None) # and transformer params should not be changed assert pipe.named_steps['transf'].a is None assert pipe.named_steps['transf'].b is None # invalid parameters should raise an error message with raises(TypeError, match="unexpected keyword argument"): pipe.fit(None, None, clf__bad=True)
# pre-process them accordingly.
categorical_features = X.loc[:, X.dtypes == 'object'].columns
numeric_features = X.loc[:, (X.dtypes == 'float64') |
                         (X.columns == 'age')].columns
indicator_features = X.loc[:, (X.dtypes == 'int64') &
                           (X.columns != 'age')].columns

# For numeric features we first impute missing data with the median. We
# choose the median over the mean because the data is not normalized yet:
# it may be skewed and contain outliers, which would bias the mean estimate.
# After imputing, we apply a quantile transformer. This transformation is
# robust to outliers and maps the variables to a normal distribution with
# a similar range for all of them.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', QuantileTransformer(output_distribution='normal',
                                   random_state=0))])

# For categorical features we impute missing data with the most frequent
# value of the column. We then encode these variables with the Bayesian
# encoder LeaveOneOutEncoder, chosen because our categorical variables are
# of high cardinality.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('leaveoneout', LeaveOneOutEncoder(return_df=False))])

# For indicator variables we impute missing data with 0, as they only take
# the values 0 and 1 (1 meaning the event occurred).
indicator_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0))])
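# The three per-type pipelines above are typically combined column-wise.
# A minimal sketch with ColumnTransformer, reusing the feature index
# objects and transformers defined above (the train/test split itself is
# assumed, not shown in the original):
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
    ('indicator', indicator_transformer, indicator_features),
])
# Fit on the training split only, then transform both splits, e.g.:
# X_train_processed = preprocessor.fit_transform(X_train, y_train)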
def make(): return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])
def train(self, data: pd.DataFrame) -> Pipeline:
    """
    Return the best fitted estimator, that is, the one that maximizes the
    average_precision.
    :yield: the cross-validation report for every classifier
    :return: an estimator of type Pipeline
    """
    if not self.classifiers:
        raise RuntimeError(
            'A classifier is missing. Please set '
            'DefectPredictor.classifiers = [\'choice\'] before training.')

    X, y = prepare_training_data(data)
    releases = X.group.tolist()
    X = X.drop(['group'], axis=1)

    scoring = dict(roc_auc='roc_auc',
                   average_precision='average_precision',
                   accuracy='accuracy',
                   balanced_accuracy='balanced_accuracy',
                   precision='precision',
                   recall='recall',
                   f1='f1',
                   mcc=make_scorer(matthews_corrcoef))

    for classifier in self.classifiers:
        estimator = classifiers_map[classifier]

        pipe = Pipeline([
            # Remove constant features
            ('variance', VarianceThreshold(threshold=0)),
            # To balance the training data (see search_params['balancing'] below)
            ('balancing', None),
            # To scale (and center) data (see search_params['normalization'] below)
            ('normalization', None),
            # TODO feature_selection here
            ('classification', estimator)
        ])

        search_params = search_params_map[classifier]

        if self.balancers:
            search_params['balancing'] = self.balancers

        if self.normalizers:
            search_params['normalization'] = self.normalizers

        search = RandomizedSearchCV(pipe, search_params,
                                    cv=walk_forward_release(X, y, releases),
                                    scoring=scoring,
                                    refit='average_precision',
                                    verbose=self._verbose)

        search.fit(X, y)

        # Add additional metadata to the cv_results
        search.cv_results_['best_index_'] = search.best_index_

        buffer = io.StringIO()
        pd.DataFrame(search.cv_results_).to_json(buffer, orient='table',
                                                 index=False)
        self.cv_report_map[classifier] = json.loads(buffer.getvalue())

        # Get the highest average_precision for this randomized search
        local_best_average_precision = search.cv_results_[
            'mean_test_average_precision'][search.best_index_]

        if (not self.best_estimator) or (
                local_best_average_precision >
                self.best_estimator_average_precision):
            self.cv_report_map['best_classifier'] = classifier
            self.best_estimator = search.best_estimator_
            selected_features_indices = self.best_estimator.named_steps[
                'variance'].fit(X).get_support(indices=True)
            self.selected_features = X.iloc[
                :, selected_features_indices].columns.tolist()

    return self.best_estimator
def test_pipeline_methods_rus_pca_svm(): # Test the various methods of the pipeline (pca + svm). X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) # Test with PCA + SVC clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y)
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_set_pipeline_step_passthrough(passthrough): # Test setting Pipeline steps to None X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) expected_params = { "steps": pipeline.steps, "m2": mult2, "m3": passthrough, "last": mult5, "memory": None, "m2__mult": 2, "last__mult": 5, "verbose": False, } assert pipeline.get_params(deep=True) == expected_params pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = [ "predict_proba", "predict_log_proba", "decision_function", "transform", "score", ] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 pipeline.fit(X, y) pipeline.transform(X) assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) with raises(AttributeError, match="has no attribute 'predict'"): getattr(pipeline, "predict") # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]]))
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = 'Last step of Pipeline should implement fit. .*NoFit.*'
    with raises(TypeError, match=error_regex):
        Pipeline([('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = 'implement fit and transform or sample'
    with raises(TypeError, match=error_regex):
        Pipeline([('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps['svc'] is not pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
__all__ = ["rf_pipeline", "xgb_pipeline"] # pipeline base steps definition base_steps = [ ( "filter_dep", CategorySelector(variable=cfg.ZONE_VAR, category=cfg.SELECTED_DEP), ), ( "add_lags", LagTransformer( date_column=cfg.DATE_VAR, zone_column=cfg.ZONE_VAR, columns=cfg.LAG_ERA5T_VARS, ), ), ("imputer", Imputer(columns=cfg.MODEL_ERA5T_VARS, strategy="median")), ("binarize_target", TargetDiscretizer(discretizer=discretizer)), ("subset_features", FeatureSubsetter(columns=cfg.MODEL_ERA5T_VARS)), ] # Add estimator to base step lists xgb_steps = [*base_steps, ("xgboost", XGBClassifier(**cfg.XGB_PARAMS))] rf_steps = [ *base_steps, ("random_forest", RandomForestClassifier(**cfg.RF_PARAMS)) ] # Define sklearn / imblearn pipelines xgb_pipeline = Pipeline(xgb_steps) rf_pipeline = Pipeline(rf_steps)
print("Before", counter) x_train = np.reshape(x_train, [x_train.shape[0], 40 * 40 * 3]) steps = [] if (smote_val != -1): print("Applying SMOTE with value", smote_val) smote = SMOTE(sampling_strategy=smote_val) steps.append(('o', smote)) if (oversample_val != -1): print("Applying oversampling with value", oversample_val) oversample = RandomOverSampler(sampling_strategy=oversample_val) steps.append(('o', oversample)) if (undersample_val != -1): print("Applying undersampling with value", undersample_val) undersample = RandomUnderSampler(sampling_strategy=undersample_val) steps.append(('u', undersample)) pipeline = Pipeline(steps=steps) #x_train, y_train = pipeline.fit_resample(x_train, y_train) counter = Counter(y_train) print("After", counter) x_train = np.reshape(x_train, [x_train.shape[0], 40, 40, 3]) # initialize output bias neg, pos = np.bincount(y_train) output_bias = np.log(pos / neg) output_bias = keras.initializers.Constant(output_bias) print("Positive Class Counter:", pos) print("Negative Class Counter:", neg) # output weights weight_for_0 = (1 / neg) * (neg + pos) / 2.0 weight_for_1 = (1 / pos) * (neg + pos) / 2.0
import numpy as np
from sklearn.svm import SVC
from hyperopt import hp
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import OneSidedSelection
from config import random_seed
from utils.python_utils import quniform_int

steps = [('undersampler', OneSidedSelection(random_state=random_seed)),
         ('svc', SVC(C=1, kernel='linear', random_state=random_seed,
                     probability=True))]
model = Pipeline(steps=steps)

# The parameter prefix must match the step name ('svc'); a mismatched
# prefix would be rejected by set_params during the search.
params_space = {'svc__C': hp.quniform('C', 1, 100, 5)}
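# A minimal tuning sketch (not from the original project): synthetic
# stand-in data, with hyperopt's TPE minimizing the negated mean
# cross-validated ROC AUC of the pipeline defined above.
from hyperopt import fmin, tpe, Trials
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, weights=[0.9, 0.1],
                           random_state=random_seed)

def objective(space):
    model.set_params(svc__C=space['svc__C'])
    # hyperopt minimizes, so negate the score
    return -cross_val_score(model, X, y, cv=3, scoring='roc_auc').mean()

best = fmin(objective, params_space, algo=tpe.suggest, max_evals=10,
            trials=Trials())
print(best)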
import numpy as np
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import (StratifiedKFold, cross_val_score,
                                     cross_validate)

classifier = SVC()
#classifier = MLPClassifier()
#classifier = DecisionTreeClassifier()
#classifier = KNeighborsClassifier()

resampler = SMOTE(random_state=22)

estimator = Pipeline([
    #('Normalizer', Normalizer()),
    #('resample', resampler),
    #('feature_selection', SelectKBest(f_classif, k=5)),
    ('classification', classifier)
])

kfoldcv = StratifiedKFold(n_splits=4)
mean_acc = cross_val_score(estimator=estimator, X=X, y=y, cv=kfoldcv)
#permutation_test_score(estimator=estimator, X=X, y=y, cv=kfoldcv)

results = cross_validate(estimator=estimator, X=X, y=y, cv=kfoldcv,
                         return_estimator=True)
best_estimator = results['estimator'][-1]
#X_test = X = np.array([alphas[0]+alphas[1], betas[0]+betas[1], thetas[0]+thetas[1]]).T
        n_estimators=150, min_samples_leaf=10, criterion='mse',
        max_features=None, max_depth=None),
}

print("Execution  Model  RMSE  R2  Pearson Correlation / p-value  "
      "Spearman Correlation / p-value")

# For each base learner...
for learner_name, learner in base_learners.items():
    # build pipeline
    steps = [('scale', StandardScaler()), ('learner', learner)]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(exec_training_features, exec_training_target)

    # prediction
    predicted = pipeline.predict(exec_test_features)

    # evaluation
    r2 = r2_score(exec_test_target, predicted)
    # squared=False returns the root of the MSE, matching the RMSE header
    rmse = mean_squared_error(exec_test_target, predicted, squared=False)
    pearson_corr, pvalue = stats.pearsonr(exec_test_target, predicted)
    spearmanr, spr_pvalue = stats.spearmanr(exec_test_target, predicted)

    output.write("{},{},{:.3f},{:.3f},{:.3f},{:.3f}\n".format(
        EXECUTION_NAME, learner_name, rmse, r2, pearson_corr, spearmanr))
    print("{} {:20s} {:.3f} {:.3f} {:.3f}/{:.5f} {:.3f}/{:.5f}".format(
        EXECUTION_NAME, learner_name, rmse, r2, pearson_corr, pvalue,
# df = df.drop(columns=["Num1"]) imp.fit(df) vars = df.columns[range(len(df.columns) - 1)] df = imp.transform(df) # df = pd.DataFrame(vals, columns=vars) # df = df[["WBC0", "Plt0", "Mg0", "Age", "Ca0", "BMI", "Na0", "P0", "HB0", "AST0", "PH0", "ALT0", "CRP0_Quantitative", "HeartFailure0", "Nausea0", "WeaknessFatigue0", "Cough0", "K0", "PR0", "Cr0", # "COVID19_outcome"]] X = np.round(df[:, range(0, df.shape[1] - 1)]) Y = np.round(df[:, targetIndex]) # remove label -1 mask = Y != -1 Y = Y[mask] X = X[mask, :] base_estimator = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression(penalty='l2'))]) selector = StabilitySelection(base_estimator=base_estimator, lambda_name='model__C', lambda_grid=np.logspace(-5, -1, 50)).fit(X, Y) fig, ax = plot_stability_path(selector, vars=vars) fig.show() selected_variables = selector.get_support(indices=True) selected_scores = selector.stability_scores_.max(axis=1) pd.DataFrame({ "selectedVars": vars[selected_variables], "score": selected_scores[selected_variables] }).to_excel("stabilityFeatureSelection.xlsx") # print(selector.get_support(indices=True))
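# A minimal follow-up sketch (not in the original script): refit the base
# scaler + logistic-regression pipeline on the stability-selected columns
# only, reusing X, Y, base_estimator and selected_variables from above.
X_selected = X[:, selected_variables]
base_estimator.fit(X_selected, Y)
print("Refit on", X_selected.shape[1], "stable features")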
def score(self, pipeline: Pipeline, dataset: Dataset, class_names: List[str] = None):
    """
    Computes scores for the metrics provided in the Scorer constructor.
    If y_true is multiclass, the scorer macro-averages precision/recall/f1.
    If y_true is multilabel, the scores are also macro-averaged.

    Parameters
    ----------
    pipeline: Pipeline
        Complete pipeline including features pipeline and classifier.
    dataset: Dataset
        Dataset containing x and y pd.DataFrames.
        For multiclass, the shape of y_true should be 1-D (NOT one-hot encoded).
        For multilabel, the shape should be n-dimensional (where n is the
        number of classes).
    class_names: List of strings, optional
        If given, the scores for separate classes will be displayed with
        appropriate names.

    Returns
    -------
    metrics: Dict
        Dictionary with metrics' names as keys and scores as values.
    """
    x, y_true = dataset.x, dataset.y.to_numpy()
    if len(y_true.shape) == 1:
        y_true = y_true.reshape(-1, 1)

    # run inference for probabilities
    probabilities = pipeline.predict_proba(x)

    # check if the output of inference is a list (sklearn models often output
    # probabilities in this form in case of a multilabel task). If yes,
    # convert to a single array.
    if isinstance(probabilities, list):
        probabilities = convert_list_probas_to_array(probabilities)

    # check task type based on the true and predicted arrays
    self.task = check_task(y_true, probabilities)

    # turn probabilities into predictions with the chosen threshold
    if self.task in ['binary', 'multiclass']:
        predictions = np.argmax(probabilities, axis=-1)
    else:
        predictions = np.where(probabilities >= self.threshold, 1, 0)

    # assign number of classes based on the given array
    self.n_classes = probabilities.shape[-1]
    # assign names of classes
    self.class_names = class_names if class_names else [
        f'class_{i}' for i in range(self.n_classes)]

    # check if any of ['precision', 'recall', 'f1', 'accuracy'] are in the
    # metrics. If yes, generate a classification report, which calculates
    # all of these metrics. It does not calculate accuracy for the
    # multilabel problem, so an additional check is done in that case.
    if [metric for metric in ['precision', 'recall', 'f1', 'accuracy']
            if metric in self.metrics]:
        self.scores_dict.update(classification_report(
            y_true, predictions, target_names=self.class_names,
            output_dict=True))
        if self.task == 'multilabel':
            self.scores_dict['accuracy'] = accuracy_score(y_true, predictions)

    if 'auc' in self.metrics:
        self.fpr, self.tpr, self.roc_auc_dict = calculate_roc_auc(
            y_true, probabilities, self.class_names, self.task)
        for key, value in self.roc_auc_dict.items():
            self.scores_dict[key]['auc'] = value

    if self.report:
        print(pd.DataFrame(self.scores_dict).transpose())

    return self._get_metrics()
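# Hypothetical smoke test for Scorer.score (not from the original code):
# any object with .x / .y attributes can stand in for Dataset, and any
# fitted estimator with predict_proba can stand in for the pipeline. The
# Scorer constructor arguments below are assumed, not confirmed here.
from types import SimpleNamespace

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_arr, y_arr = make_classification(n_samples=300, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_arr, y_arr)
dataset = SimpleNamespace(x=pd.DataFrame(X_arr), y=pd.DataFrame(y_arr))

scorer = Scorer(metrics=['precision', 'recall', 'f1', 'auc'])  # assumed signature
print(scorer.score(pipe, dataset, class_names=['negative', 'positive']))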
def RepeatedSampling(X_train, Y_train, X_test, Y_test, classifier, sampling,
                     sampling_params, no_seeds):
    """
    Repeated sampling experiments for algorithms with randomized sampling.

    Considered performance criteria:
    - F2-Score
    - Balanced Accuracy
    - Precision
    - Recall

    Inputs:
        X_train = features of the training data (must be in pd.DataFrame format!!)
        Y_train = outcome of the training data (must be in pd.Series format!!)
        X_test = features of the test data (must be in pd.DataFrame format!!)
        Y_test = outcome of the test data (must be in pd.Series format!!)
        classifier = model, e.g. BaggingClassifier()
        sampling = sampling class (not an instance), e.g. RandomOverSampler
        sampling_params = parameters for the sampling object
        no_seeds = number of experiments to be executed / number of different
                   seeds to be considered
    """
    seeds = random.sample(range(1, 10000), no_seeds)
    test_frames = []

    for seed in seeds:
        sampling_new = sampling(**sampling_params, random_state=seed)
        pipe = Pipeline(steps=[('s', sampling_new), ('m', classifier)])

        test_performance = pd.DataFrame(
            np.zeros((1, 4)),
            columns=['F2-Score', 'bacc', 'Precision', 'Recall'])

        pipe.fit(X_train, Y_train)
        Y_pred = pipe.predict(X_test)

        f2 = metrics.fbeta_score(Y_test, Y_pred, beta=2)
        bacc = metrics.balanced_accuracy_score(Y_test, Y_pred)
        precision = metrics.precision_score(Y_test, Y_pred)
        recall = metrics.recall_score(Y_test, Y_pred)

        test_performance.iloc[0, 0] = f2
        test_performance.iloc[0, 1] = bacc
        test_performance.iloc[0, 2] = precision
        test_performance.iloc[0, 3] = recall
        test_frames.append(test_performance)

    test_table = pd.concat(test_frames)

    f2_vals = test_table['F2-Score']
    bacc_vals = test_table['bacc']
    precision_vals = test_table['Precision']
    recall_vals = test_table['Recall']

    final_performance = pd.DataFrame(
        np.zeros((1, 8)),
        columns=['F2-Score MEAN', 'F2-Score STD', 'bacc MEAN', 'bacc STD',
                 'Precision MEAN', 'Precision STD', 'Recall MEAN',
                 'Recall STD'])

    final_performance.iloc[0, 0] = np.mean(f2_vals)
    final_performance.iloc[0, 2] = np.mean(bacc_vals)
    final_performance.iloc[0, 4] = np.mean(precision_vals)
    final_performance.iloc[0, 6] = np.mean(recall_vals)
    final_performance.iloc[0, 1] = np.std(f2_vals)
    final_performance.iloc[0, 3] = np.std(bacc_vals)
    final_performance.iloc[0, 5] = np.std(precision_vals)
    final_performance.iloc[0, 7] = np.std(recall_vals)
    final_performance = round(final_performance, 3)

    return final_performance
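# Hypothetical call with synthetic imbalanced data (not from the original
# code); BaggingClassifier and RandomOverSampler stand in for any
# classifier / sampler-class combination accepted by RepeatedSampling.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

X, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

print(RepeatedSampling(X_tr, y_tr, X_te, y_te,
                       classifier=BaggingClassifier(),
                       sampling=RandomOverSampler,
                       sampling_params={'sampling_strategy': 'auto'},
                       no_seeds=10))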
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X))
        assert_array_equal(
            pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)
        )
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_methods_rus_pca_svm(): # Test the various methods of the pipeline (pca + svm). X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0) # Test with PCA + SVC clf = SVC(probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([('rus', rus), ('pca', pca), ('svc', clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y)
def main():
    # 'LR', 'DT', 'SVC', 'LSTM', 'NN',
    # 'MLP', 'CNN', 'LSTM', 'ConvLSTM', 'CNNLSTM', 'EncodeDecodeLSTMs'
    models = ['RF']
    targets = ['DOcategory', 'pHcategory', 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1

    for model_name in models:
        print(model_name)
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                data = {'target_names': 'target_names',
                        'method_names': 'method_names',
                        'window_nuggets': 'window_nuggets',
                        'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',
                        'std_test_score': 'std_test_score',
                        'mean_test_score': 'mean_test_score',
                        'params': 'params', 'bestscore': 'bestscore',
                        'F1_0': 'F1_0', 'F1_1': 'F1_1', 'P_0': 'P_0',
                        'P_1': 'P_1', 'R_0': 'R_0', 'R_1': 'R_1',
                        'acc0_1': 'acc0_1', 'F1_0_1': 'F1_0_1',
                        'F1_all': 'F1_all', 'fbeta': 'fbeta',
                        'imfeatures': 'imfeatures'}
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                data = {'target_names': 'target_names',
                        'method_names': 'method_names',
                        'window_nuggets': 'window_nuggets',
                        'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',
                        'std_test_score': 'std_test_score',
                        'mean_test_score': 'mean_test_score',
                        'params': 'params', 'bestscore': 'bestscore',
                        'mape': 'mape', 'me': 'me', 'mae': 'mae',
                        'mpe': 'mpe', 'rmse': 'rmse', 'R2': 'R2',
                        'imfeatures': 'imfeatures'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directory+resultFileName, index=False,
                            header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'OrgData'

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    files = [f for f in os.listdir(path) if f.endswith(
                        '.csv') and f.startswith(sondefilename)]
                    file = files[0]
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)

                    train_X_grid, train_y_grid, input_dim, features = \
                        func.preparedata(dataset, PrH_index, n_steps,
                                         target, cat)
                    print(train_X_grid[0:1])

                    if cat == 1 and (model_name == 'LSTM' or
                                     model_name == 'NN'):
                        train_y_grid = to_categorical(train_y_grid, 3)
                    if model_name == 'LSTM' or model_name == 'NN':
                        n_job = 1

                    start_time = time.time()

                    model = func.algofind(model_name, input_dim, n_steps, cat)

                    if cat == 1:
                        metric = make_scorer(f2_measure)
                    else:
                        metric = make_scorer(R2_measure)

                    # cat_ix = train_X_grid[:, 7:]
                    # print(cat_ix[0:2])
                    # num_ix = train_X_grid[:, : 7]
                    # print(num_ix[0:2])
                    # one hot encode categorical, normalize numerical
                    # ct = ColumnTransformer(
                    #     [('c', OneHotEncoder(), cat_ix),
                    #      ('n', StandardScaler(), num_ix)])

                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])
                    else:  # model_name == 'LSTM' or model_name == 'NN':
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])
                    # else:
                    #     pipeline = Pipeline(
                    #         steps=[('transforms', ct), ('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)

                    if cat == 1 and (model_name == 'LSTM' or
                                     model_name == 'NN'):
                        gs = RandomizedSearchCV(
                            estimator=pipeline,
                            param_distributions=func.param_grid[
                                'param_grid_'+model_name+str(cat)],
                            n_iter=20, cv=custom_cv, verbose=0,
                            random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid,
                                     model__class_weight={0: 1, 1: 50, 2: 100})
                    elif cat == 0 and (model_name == 'LSTM' or
                                       model_name == 'NN'):
                        gs = RandomizedSearchCV(
                            estimator=pipeline,
                            param_distributions=func.param_grid[
                                'param_grid_'+model_name+str(cat)],
                            n_iter=20, cv=custom_cv, verbose=0,
                            random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid)
                    else:
                        gs = RandomizedSearchCV(
                            estimator=pipeline,
                            param_distributions=func.param_grid[
                                'param_grid_'+model_name+str(cat)],
                            n_iter=20, scoring=metric, cv=custom_cv,
                            verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    i = 1
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        predictions = clf.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        if cat == 1 and (model_name == 'LSTM' or
                                         model_name == 'NN'):
                            test_y = argmax(test_y, axis=1)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        plt.savefig(directory+fpath+'.jpg')
                        plt.close()

                        data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)
                        df = pd.DataFrame(data=data)
                        df.to_csv(directory+fpath, index=False)

                        if cat == 1:
                            data = {'target_names': target,
                                    'method_names': method,
                                    'window_nuggets': n_steps,
                                    'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath,
                                    'std_test_score': [test_std],
                                    'mean_test_score': [test_Score],
                                    'params': [clf.best_params_],
                                    'bestscore': [clf.best_score_],
                                    'F1_0': cm0[0], 'F1_1': cm0[1],
                                    'P_0': cm0[2], 'P_1': cm0[3],
                                    'R_0': cm0[4], 'R_1': cm0[5],
                                    'acc0_1': cm0[6], 'F1_0_1': cm0[7],
                                    'F1_all': cm0[8], 'fbeta': [cm0[9]],
                                    'imfeatures': [clf.best_estimator_]}
                        elif cat == 0:
                            data = {'target_names': target,
                                    'method_names': method,
                                    'window_nuggets': n_steps,
                                    'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath,
                                    'std_test_score': [test_std],
                                    'mean_test_score': [test_Score],
                                    'params': [clf.best_params_],
                                    'bestscore': [clf.best_score_],
                                    'mape': cm0[0], 'me': cm0[1],
                                    'mae': cm0[2], 'mpe': cm0[3],
                                    'rmse': cm0[4], 'R2': cm0[5],
                                    'imfeatures': [clf.best_estimator_]}

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory+resultFileName, index=False,
                                  mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S",
                                            time.gmtime(elapsed_time)))
                        i = i+1
def main():
    import time
    start = time.time()
    with open('./pkl/X.pkl', 'rb') as fh:  # Load data set
        X = dill.load(fh)
    with open('./pkl/y.pkl', 'rb') as fh:
        y = dill.load(fh)

    scaler = Normalizer()
    smote_etomek = SMOTETomek(ratio='auto')
    cachedir = mkdtemp()
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    classifier = XGBClassifier()

    # A parameter grid for XGBoost
    params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [1, 3, 4, 5, 10],
    }

    pipeline = Pipeline([
        ('scaler', scaler),
        ('smt', smote_etomek),
        ('clf', classifier),
    ], memory=cachedir)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]  # make training and test set
        y_train, y_test = y[train_index], y[test_index]

    clf = dasksearchCV(classifier, params, n_jobs=8, cv=3,
                       scoring='roc_auc', refit=True)
    clf.fit(X_train, y_train)
    print(clf.best_params_)
    print(clf.best_score_)

    best_parameters, score = clf.best_params_, clf.best_score_
    print('Raw AUC score:', score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    classifier = XGBClassifier(**best_parameters, n_jobs=-1)
    # put the tuned classifier into the pipeline before refitting it
    pipeline.set_params(clf=classifier)
    plot_cross_validation(cv, X_train, y_train,
                          pipeline)  # do 5 fold stratified cross-validation
    clf = pipeline.fit(X_train, y_train)
    # print(classifier.get_params())

    expected = y_test
    predicted = clf.predict(X_test)  # test performance on test set
    plot_confusion_matrix(confusion_matrix(expected, predicted),
                          classes=["Non-Zika", "Zika"])
    print(time.time() - start)

    from sklearn import metrics
    print("Classification report for classifier %s:\n%s\n" %
          (clf, metrics.classification_report(expected, predicted)))
def test_set_pipeline_step_none(): # Test setting Pipeline steps to None X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=None) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) expected_params = {'steps': pipeline.steps, 'm2': mult2, 'm3': None, 'last': mult5, 'memory': None, 'm2__mult': 2, 'last__mult': 5} assert pipeline.get_params(deep=True) == expected_params pipeline.set_params(m2=None) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = ['predict_proba', 'predict_log_proba', 'decision_function', 'transform', 'score'] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=None) # mult2 and mult3 are active exp = 6 pipeline.fit(X, y) pipeline.transform(X) assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) with raises(AttributeError, match="has no attribute 'predict'"): getattr(pipeline, 'predict') # Check None step at construction time exp = 2 * 5 pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]]))
scores.iloc[0, 3] = np.std(scores_temp['test_bacc']) scores.iloc[0, 4] = np.mean(scores_temp['test_Precision']) scores.iloc[0, 5] = np.std(scores_temp['test_Precision']) scores.iloc[0, 6] = np.mean(scores_temp['test_Recall']) scores.iloc[0, 7] = np.std(scores_temp['test_Recall']) scores = np.round(scores, 3) return scores ############### DEFINE COMBINATIONS VIA PIPELINES (FOR LATEX TABLE) ############### models = [] pip_BAG0 = Pipeline(steps=[('m', model_BAG)]) models.append(pip_BAG0) pip_WBAG = Pipeline(steps=[('m', model_WBAG)]) models.append(pip_WBAG) pip_BAGROS = Pipeline(steps=[('s', ROS1), ('m', model_BAG)]) models.append(pip_BAGROS) pip_BAGSMOTE = Pipeline(steps=[('s', SMOTE1), ('m', model_BAG)]) models.append(pip_BAGSMOTE) pip_BAGADASYN = Pipeline(steps=[('s', ADASYN1), ('m', model_BAG)]) models.append(pip_BAGADASYN) pip_BAGRUS = Pipeline(steps=[('s', RUS1), ('m', model_BAG)]) models.append(pip_BAGRUS) pip_BAGENN = Pipeline(steps=[('s', ENN1), ('m', model_BAG)]) models.append(pip_BAGENN)
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Sampler + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        # Memoize the sampler at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the sampler in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X))
        assert_array_equal(
            pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)
        )
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def build_model_pipeline(self): from imblearn.pipeline import Pipeline model = Pipeline([('clf', LightGbmClassifier.model)]) return model
def test_pipeline_init_tuple(): # Pipeline accepts steps as tuple X = np.array([[1, 2]]) pipe = Pipeline((("transf", Transf()), ("clf", FitParamT()))) pipe.fit(X, y=None) pipe.score(X) pipe.set_params(transf="passthrough") pipe.fit(X, y=None) pipe.score(X)
def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() pipeline = Pipeline([('mock', transf1)]) assert pipeline.named_steps['mock'] is transf1 # Directly setting attr pipeline.steps = [('mock2', transf2)] assert 'mock' not in pipeline.named_steps assert pipeline.named_steps['mock2'] is transf2 assert [('mock2', transf2)] == pipeline.steps # Using set_params pipeline.set_params(steps=[('mock', transf1)]) assert [('mock', transf1)] == pipeline.steps # Using set_params to replace single step pipeline.set_params(mock=transf2) assert [('mock', transf2)] == pipeline.steps # With invalid data pipeline.set_params(steps=[('junk', ())]) with raises(TypeError): pipeline.fit([[1]], [1]) with raises(TypeError): pipeline.fit_transform([[1]], [1])
def fit(self, X_train, y_train): # create list of targets # self.pipelines_list = [] # self.preds = [] # for i in targets : # x. feature engineering (i) # y = df[i] # cv_scores (x, y, pipeline_per_target[i]) # model = pipeline_per_target[i].train(x, y) # pipelines_list.append(model) # preds.append(model.pred(x)) # y = df[y] # combined_model = LogReg.train(preds, y) # print results.... # def pred(X): # if intrusion: y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion) else: y_pred_intrusion = 1 if avoidance: y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance) else: y_pred_avoidance = 1 if hypertension: y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension) else: y_pred_hypertension = 1 if depression: y_pred_depression = self.pipe_depression.predict(X_depression) else: y_pred_depression = 1 if only_avoidance: y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance) else: y_pred_only_avoidance = 1 if PCL_Strict3: y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3) else: y_pred_PCL_Strict3 = 1 if regression_cutoff_33: y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33) else: y_pred_regression_cutoff_33 = 1 if regression_cutoff_50: y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50) else: y_pred_regression_cutoff_50 = 1 if tred_cutoff: y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff) else: y_pred_tred_cutoff = 1 y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff) y_target = y_train acc = accuracy_score(y_target, y_pred) f1 = f1_score(y_target, y_pred) recall = recall_score(y_target, y_pred) precision = precision_score(y_target, y_pred) print("training scores") print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}") # combined y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension) y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance) y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion) y_pred_regression = self.pipe_regression.predict(X_regression) X_train["y_pred_hypertension"] = y_pred_hypertension X_train["y_pred_avoidance"] = y_pred_avoidance X_train["y_pred_intrusion"] = y_pred_intrusion X_train["y_pred_regression"] = y_pred_regression preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"] X_combined = X_train[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values y_combined = y_train self.pipe_combined = Pipeline(steps=[ ('classifier', DecisionTreeClassifier())]) scores = cross_val_score(self.pipe_combined, X_combined, y_combined, scoring='precision', cv=StratifiedKFold(5)) print(f"hypertension {sum(scores)/5}") self.pipe_combined.fit(X_combined, y_combined)
X = pd.DataFrame(X, columns=['X1', 'X2', 'Run Date']) cv = None n_jobs = 1 # BULID THE TRANSFORMATION PORTION OF THE ML PIPELINE preprocess = PreProcessPipeline( imputer_method=imputer_method, normalize_method=normalize_method, resample_method=resample_method, cols_to_remove=X.columns.to_list().index('Run Date')) steps = preprocess.get_steps() # GET THE ESTIMATOR OBJECT AND APPEND IT AS THE FINAL STEP IN THE PIPELINE model = get_classifier_model(model_name, params=None) steps.append(('model', model)) print(steps) # INITIALIZE THE PIPELINE pipe = Pipeline(steps) clf = CalibratedClassifierCV(pipe, cv=cv, method='isotonic', n_jobs=n_jobs) print(clf) clf.fit(X, y) joblib.dump(clf, 'test_pipeline.joblib') clf = joblib.load('test_pipeline.joblib') clf.predict(X)
def test_pipeline_methods_pca_svm(): # Test the various methods of the pipeline (pca + svm). iris = load_iris() X = iris.data y = iris.target # Test with PCA + SVC clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA(svd_solver='full', n_components='mle', whiten=True) pipe = Pipeline([('pca', pca), ('svc', clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y)
def test_pipeline_memory_sampler():
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Sampler + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the sampler at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the sampler in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression(solver="lbfgs")
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("rus", rus), ("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_param_error():
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(
        ValueError,
        match="Pipeline.fit does not accept the sample_weight parameter",
    ):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])


parameter_grid_test_verbose = (
    (est, pattern, method)
    for (est, pattern), method in itertools.product(
        [
            (
                Pipeline([("transf", Transf()), ("clf", FitParamT())]),
                r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline([("transf", Transf()), ("noop", None), ("clf", FitParamT())]),
                r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n"
                r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline(
                    [
                        ("transf", Transf()),
                        ("noop", "passthrough"),
                        ("clf", FitParamT()),
def f(colname, criteria):
    task = Classify()
    X, y = task.process_data(colname, data)
    print(X.columns, X.shape)
    task.summary()

    model_rf = Pipeline([
        ('random', RandomOverSampler(random_state=1)),
        # ('sampling', SMOTE()),
        ('classification', RandomForestClassifier(random_state=1))
    ])
    parameters_rf = {
        'classification__max_depth': [2, 5, 10],
        'classification__n_estimators': [50, 100, 200, 500, 1000],
        'classification__min_samples_split': [2]
    }

    model_log = Pipeline([
        ('random', RandomOverSampler(random_state=1)),
        # ('sampling', SMOTE()),
        ('classification', LogisticRegression(multi_class='ovr', random_state=1,
                                              fit_intercept=False))
    ])
    parameters_log = [{
        'classification__penalty': ['l2'],
        'classification__solver': ['lbfgs'],
        'classification__C': list(range(1, 11))
    }, {
        'classification__penalty': ['l1'],
        'classification__solver': ['liblinear'],
        'classification__C': list(range(1, 11))
    }]

    model_gdbt = Pipeline([
        ('random', RandomOverSampler(random_state=1)),
        # ('sampling', SMOTE()),
        ('classification', boosting(random_state=1))
    ])
    parameters_gdbt = {
        'classification__max_depth': [2, 5],
        'classification__n_estimators': [100, 500, 1000],
        'classification__min_samples_split': [2, 4, 6],
        'classification__learning_rate': [0.01, 0.1]
    }

    # X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=30, stratify=y)
    clf_log, best_parameters, best_score_log = task.classify(
        parameters_log, model_log, criteria)
    # training_process(X, y, parameters_log, model_log)
    clf_rf, best_parameters, best_score_rf = task.classify(
        parameters_rf, model_rf, criteria)
    # training_process(X, y, parameters_rf, model_rf)
    # clf_gdbt, best_parameters, best_score_gdbt = task.classify(parameters_gdbt, model_gdbt, criteria)
    # training_process(X, y, parameters_gdbt, model_gdbt)

    # prediction
    y_rf, y_log = clf_rf.predict(X), clf_log.predict(X)
    # y_gdbt = clf_gdbt.predict(X)
    res = {
        # 'gdbt': {'best_score': best_score_gdbt, 'balanced_acc': balanced_accuracy_score(y, y_gdbt), 'acc': accuracy_score(y, y_gdbt)},
        'rf': {
            'best_score': best_score_rf,
            'balanced_acc': balanced_accuracy_score(y, y_rf),
            'acc': accuracy_score(y, y_rf)
        },
        'log': {
            'best_score': best_score_log,
            'balanced_acc': balanced_accuracy_score(y, y_log),
            'acc': accuracy_score(y, y_log)
        }
    }
    '''
    clf_log, best_parameters, best_score_log, bacc_log, acc_log = task.classify(parameters_log, model_log, criteria)
    # training_process(X, y, parameters_log, model_log)
    clf_rf, best_parameters, best_score_rf, bacc_rf, acc_rf = task.classify(parameters_rf, model_rf, criteria)
    # training_process(X, y, parameters_rf, model_rf)
    clf_gdbt, best_parameters, best_score_gdbt, bacc_gdbt, acc_gdbt = task.classify(parameters_gdbt, model_gdbt, criteria)
    # training_process(X, y, parameters_gdbt, model_gdbt)
    res = {
        'gdbt': {'best_score': best_score_gdbt, 'balanced_acc': bacc_gdbt, 'acc': acc_gdbt},
        'rf': {'best_score': best_score_rf, 'balanced_acc': bacc_rf, 'acc': acc_rf},
        'log': {'best_score': best_score_log, 'balanced_acc': bacc_log, 'acc': acc_log}
    }
    '''
    # print(best_score_gdbt, best_score_rf, best_score_log)
    return clf_log, clf_rf, res  # clf_gdbt, res
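# --- Hedged sketch of the search step (`task.classify` is opaque here) -------
# `task.classify` presumably wraps a grid search like the one below.
# GridSearchCV accepts an imblearn Pipeline directly, and the
# 'classification__*' keys address the final step's hyperparameters.
# `criteria` is assumed to be a scoring string such as 'balanced_accuracy';
# `model_rf`, `parameters_rf`, `X`, and `y` are the names from the function above.
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(model_rf, parameters_rf, scoring='balanced_accuracy', cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)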
ratio_list = [0.042, 0.111, 0.333, 0.538, 1]
percentage_list = [4, 10, 25, 35, 50]

# This loop executes the oversampling strategy (in this case SMOTE) for all
# the ratios that were tested.
for ratio, percentage in zip(ratio_list, percentage_list):
    # Create a train-test split where the ratio of the target class is maintained
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=47, stratify=y)

    # Initialize a SMOTE sampler with the ratio that will be tested
    over = SMOTE(sampling_strategy=ratio)

    # Initialize a pipeline (one can add extra steps here if required)
    steps = [('o', over)]
    pipeline = Pipeline(steps)

    # Resample the data
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')

    # Train an XGBoost model with the resampled data
    xgb = xg_boost(x_res, y_res, x_test, y_test, f"SMOTE_{percentage}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

# List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []
# def calculate_running_times():
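# --- Where the ratios come from (worked check, not in the original) ----------
# For binary oversampling, imbalanced-learn interprets a float
# `sampling_strategy` as n_minority / n_majority after resampling, so a target
# minority share of p percent maps to p / (100 - p):
percentages = [4, 10, 25, 35, 50]
print([round(p / (100 - p), 3) for p in percentages])
# -> [0.042, 0.111, 0.333, 0.538, 1.0], matching ratio_list above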
def extract_categorical_visualize_graphs(
        data_input_path="04_Model" + "/" + "prepared_input.pickle",
        top_percentage=0.2):
    '''
    From the results of a wide search algorithm, find the top x percent
    (default=20%), calculate the median value for each parameter and select
    the parameter value with the best median result. Visualize the results of
    the search algorithm.

    :args:
        data_input_path: Path to the pickle with the complete results of the run
        top_percentage: Top share of results to consider. Default: 0.2.
    :return:
        Nothing
    '''
    # Get the necessary data from the data preparation
    r = open(data_input_path, "rb")
    prepared_data = pickle.load(r)

    model_name = prepared_data['paths']['dataset_name']
    model_directory = prepared_data['paths']['model_directory']
    results_file_path = prepared_data['paths']['svm_run1_result_filename']
    refit_scorer_name = prepared_data['refit_scorer_name']
    selected_features = prepared_data['selected_features']
    feature_dict = prepared_data['feature_dict']
    X_train = prepared_data['X_train']
    svm_pipe_first_selection = prepared_data['paths'][
        'svm_pipe_first_selection']

    s = open(results_file_path, "rb")
    results = pickle.load(s)
    results_run1 = results['result']
    params_run1 = results['parameter']

    # Create the result table
    # merged_params_run1 = {}
    # for d in params_run1:
    #     merged_params_run1.update(d)

    # Get the top x% values from the results
    # Number of results to consider
    # top_percentage = 0.2
    number_results = int(results_run1.shape[0] * top_percentage)  # np.int is deprecated
    print("The top {} of the results are used, i.e. {} samples".format(
        top_percentage * 100, number_results))
    results_subset = results_run1.iloc[0:number_results, :]

    ## %% Plot graphs
    # Prepare the inputs: Replace the lists with strings
    result_subset_copy = results_subset.copy()
    print("Convert feature lists to names")
    sup.list_to_name(selected_features, list(feature_dict.keys()),
                     result_subset_copy['param_feat__cols'])

    # Replace lists in the parameters with strings
    params_run1_copy = copy.deepcopy(params_run1)
    sup.replace_lists_in_grid_search_params_with_strings(
        selected_features, feature_dict, params_run1_copy)

    # Plot the graphs
    _, scaler_medians = vis.visualize_parameter_grid_search(
        'scaler', params_run1, results_subset, refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)
    _, sampler_medians = vis.visualize_parameter_grid_search(
        'sampling', params_run1, results_subset, refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)
    _, kernel_medians = vis.visualize_parameter_grid_search(
        'svm__kernel', params_run1, results_subset, refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)
    _, feat_cols_medians = vis.visualize_parameter_grid_search(
        'feat__cols', params_run1_copy, result_subset_copy, refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)

    ## Get the best parameters
    # Get the best scaler from the medians
    best_scaler = max(scaler_medians, key=scaler_medians.get)
    print("Best scaler: ", best_scaler)
    best_sampler = max(sampler_medians, key=sampler_medians.get)
    print("Best sampler: ", best_sampler)
    best_kernel = max(kernel_medians, key=kernel_medians.get)
    print("Best kernel: ", best_kernel)

    # Get the best feature combination
    best_feat_cols = max(feat_cols_medians, key=feat_cols_medians.get)  # source.idxmax()
    # print("Best {}: {}".format(name, best_feature_combi))
    best_columns = feature_dict.get(best_feat_cols)
    print("Best feature selection: ", best_feat_cols)
    print("Best column indices: ", best_columns)
    # feature_dict.get((results_run1[result_columns_run1].loc[indexList]['param_feat__cols'].iloc[best_feature_combi]))
    print("Best column names: ", list(X_train.columns[best_columns]))

    # Define the pipeline, which is constant for all tests
    pipe_run_best_first_selection = Pipeline([
        ('scaler', best_scaler),
        ('sampling', best_sampler),
        ('feat', modelutil.ColumnExtractor(cols=best_columns)),
        ('svm', SVC(kernel=best_kernel))
    ])
    display(pipe_run_best_first_selection)

    # Save the best pipe
    dump(pipe_run_best_first_selection, open(svm_pipe_first_selection, 'wb'))
    print("Stored pipe_run_best_first_selection at ", svm_pipe_first_selection)
    print("Method end")
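# --- Hedged reload sketch (assumes `dump` above is pickle.dump) ---------------
# The stored pipeline can later be restored from the same path:
import pickle

with open(svm_pipe_first_selection, 'rb') as f:
    pipe_restored = pickle.load(f)
print(pipe_restored)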
def fit(self, X_train, y_train):
    predictions_list = []
    for target in self.targets_list:
        if self.use_feature_engineering:
            X = FeatureEngineering(X_train[self.features], target).engineer_features().values
        else:
            X = X_train[self.features].values

        if target == "PCL_Strict3":
            y = y_train[target].apply(lambda x: int(x))
        else:
            y = X_train[target].apply(lambda x: int(x))

        pipeline = pipeline_per_target[target]
        scores = cross_val_score(pipeline, X, y, scoring='f1',
                                 cv=StratifiedKFold(5))
        print(f"{target} - {sum(scores)/len(scores)}")

        if self.train_on_partial_prediction:
            combined_y = pd.DataFrame(y, columns=[target])
            if target != "PCL_Strict3":
                combined_y["PCL_Strict3"] = y_train["PCL_Strict3"].apply(lambda x: int(x))
            _X_train, _X_test, _y_train, _y_test = \
                train_test_split(X, combined_y, test_size=0.25)
            self.trained_pipelines[target] = pipeline.fit(_X_train, _y_train[target])
            y_pred = self.trained_pipelines[target].predict(_X_test)
            predictions_list.append(y_pred)
            print("test f1", target, f1_score(_y_test[target], y_pred))
            self.trained_pipelines[target] = pipeline.fit(X, y)
            y = _y_test["PCL_Strict3"]
        else:
            self.trained_pipelines[target] = pipeline.fit(X, y)
            predictions_list.append([self.trained_pipelines[target].predict(X)])
            y = y_train["PCL_Strict3"]

        if self.check_on_test_set:
            if target == "PCL_Strict3":
                y_test = self.y_test[target].apply(lambda x: int(x))
            else:
                y_test = X_train[target].apply(lambda x: int(x))
            if self.use_feature_engineering:
                X_test = FeatureEngineering(self.X_test[self.features], target).engineer_features().values
            else:
                X_test = self.X_test[self.features].values
            model = self.trained_pipelines[target]
            y_pred = model.predict(X_test)
            # Score against the per-target labels computed above (the original
            # passed self.y_test here, which does not match y_pred per target)
            s_f = f1_score(y_test, y_pred)
            s_p = precision_score(y_test, y_pred)
            s_r = recall_score(y_test, y_pred)
            print(f"test f1 {target}", s_f)
            print(f"test recall {target}", s_r)
            print(f"test precision {target}", s_p)

    # pipe = Pipeline(steps=[
    #     ('scaling', StandardScaler()),
    #     ('sampling', SMOTE()),
    #     ('classifier', LogisticRegression(penalty='l1'))])
    # c = ((len(y) - sum(y)) / sum(y))
    if not self.use_and_func:
        c = 10
        pipe = Pipeline(steps=[('feature_selection',
                                RFE(XGBClassifier(n_estimators=10,
                                                  scale_pos_weight=c))),
                               ('clf', XGBClassifier(scale_pos_weight=c))])
        X = predictions_list
        self.combined_model = pipe.fit(
            np.array(X).reshape(-1, len(predictions_list)), y)
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = 'Last step of Pipeline should implement fit. .*NoFit.*'
    with raises(TypeError, match=error_regex):
        Pipeline([('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC(gamma='scale')
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = 'implement fit and transform or sample'
    with raises(TypeError, match=error_regex):
        Pipeline([('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert not pipe.named_steps['svc'] is pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)
    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([("mock", transf1)])
    assert pipeline.named_steps["mock"] is transf1

    # Directly setting attr
    pipeline.steps = [("mock2", transf2)]
    assert "mock" not in pipeline.named_steps
    assert pipeline.named_steps["mock2"] is transf2
    assert [("mock2", transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[("mock", transf1)])
    assert [("mock", transf1)] == pipeline.steps

    # Using set_params to replace a single step
    pipeline.set_params(mock=transf2)
    assert [("mock", transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[("junk", ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(n_classes=2, class_sep=2,
                               weights=[0.1, 0.9], n_informative=3,
                               n_redundant=1, flip_y=0, n_features=20,
                               n_clusters_per_class=1, n_samples=5000,
                               random_state=0)
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression(solver='lbfgs', multi_class='auto')
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)