def train_model(self, X_train, y_train):
    """Fit every configured algorithm behind a SMOTE step and record it.

    For each entry of ``self.tested_algorithms`` a two-step pipeline
    (``SMOTE`` oversampling followed by the estimator) is fitted on the
    full training data, then the mean ROC-AUC of a repeated stratified
    10-fold cross-validation (5 repeats) is printed.

    params:
        X_train: pd.DataFrame with train data
        y_train: pd.Series with train labels
    return:
        ``self.models``: dict mapping algorithm name to the estimator
        object that was placed in the pipeline
    """
    for name, estimator in self.tested_algorithms.items():
        print('Treinando o modelo', name)
        print(estimator)

        smote_pipeline = Pipeline(
            steps=[('over', SMOTE()), ('model', estimator)]
        )
        smote_pipeline.fit(X_train, y_train)

        print('Cross val score using RepeatedStratifiedKFold')
        folds = RepeatedStratifiedKFold(
            n_splits=10, n_repeats=5, random_state=42
        )
        auc_scores = cross_val_score(
            smote_pipeline,
            X_train,
            y_train,
            scoring='roc_auc',
            cv=folds,
            n_jobs=-1,
        )
        print(np.mean(auc_scores))

        # Create the registry lazily on the first algorithm.
        if self.models is not None:
            self.models.update({name: estimator})
        else:
            self.models = {name: estimator}

    return self.models
def test_evaluate_pipeline(self):
    """Check that evaluating a saved dummy pipeline leaves report files.

    A constant-prediction pipeline is fitted, saved into a temporary
    directory and evaluated there; afterwards the directory must hold at
    least one path containing ``.png``, one containing ``.json`` and one
    containing ``.csv``.
    """
    features, labels = make_classification(
        n_samples=100, n_features=5, n_informative=2, n_redundant=2
    )
    train_features, _, train_labels, _ = train_test_split(
        features,
        labels,
        test_size=cfg.TEST_SIZE,
        random_state=cfg.RANDOM_STATE,
    )

    constant_classifier = DummyClassifier(strategy="constant", constant=0)
    dummy_pipeline = Pipeline([("dummy_classifier", constant_classifier)])
    dummy_pipeline.fit(train_features, train_labels)

    with tempfile.TemporaryDirectory() as destination:
        threshold_path = destination + "/DUMMY_threshold.json"
        save_pipeline(
            pipeline=dummy_pipeline,
            model="DUMMY",
            optimal_threshold=0,
            destination=destination,
        )
        evaluate_pipeline(
            X=features,
            y=labels,
            pipeline=dummy_pipeline,
            threshold=threshold_path,
            prefix="DUMMY",
            destination=destination,
        )

        produced = glob.glob(destination + "/*")
        for extension in (".png", ".json", ".csv"):
            self.assertTrue(any(extension in path for path in produced))
def trainPipeLine(databaseName, samplerName, scalerName, featureSelectorName,
                  modelName, expectedVariance):
    """Train, persist and qualify one scaler/selector/model pipeline.

    The data set is loaded from ``databaseName``; columns ``1:-1`` are used
    as features and the last column (cast to int) as labels. The data is
    resampled with the named sampler, split 80/20 with a fixed seed, and a
    ``scaler -> featureSelector -> model`` pipeline is fitted on the
    training part. The fitted pipeline is stored under a name built from
    all component names and then handed to ``qualifyPipeline`` together
    with the held-out part.

    ``expectedVariance`` is accepted but not used by this function.
    """
    records = getAllRecordsFromDatabase(databaseName)

    samplerRegistry = getRandomSamplers()
    scalerRegistry = getScalers()
    selectorRegistry = getFeatureSelectors()
    modelRegistry = getModels()

    featureColumns = records[:, 1:-1]
    labelColumn = records[:, -1:].astype(int)

    # NOTE: resampling happens before the train/test split, so the held-out
    # part is drawn from the resampled data as well.
    resampledFeatures, resampledLabels = samplerRegistry.get(
        samplerName).fit_resample(featureColumns, labelColumn)

    steps = [
        ('scaler', scalerRegistry.get(scalerName)),
        ('featureSelector', selectorRegistry.get(featureSelectorName)),
        ('m', modelRegistry.get(modelName)),
    ]
    pipeline = Pipeline(steps)

    trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(
        resampledFeatures, resampledLabels, test_size=0.2, random_state=0)
    pipeline.fit(trainFeatures, trainLabels)

    pickledPipelineName = "_".join([
        "Pipeline", databaseName, samplerName, scalerName,
        featureSelectorName, modelName,
    ])
    storePickledPipeline(pipeline, pickledPipelineName)
    qualifyPipeline(pipeline, pickledPipelineName, testFeatures, testLabels,
                    databaseName, samplerName, scalerName,
                    featureSelectorName, modelName)
def test_predict_with_predict_params():
    """Pipeline.predict must hand extra keyword arguments to the last step."""
    steps = [('transf', Transf()), ('clf', DummyEstimatorParams())]
    pipe = Pipeline(steps)
    pipe.fit(None, None)

    # ``got_attribute`` is a predict-time parameter of the final estimator.
    pipe.predict(X=None, got_attribute=True)

    final_estimator = pipe.named_steps['clf']
    assert final_estimator.got_attribute
def test_pipeline_methods_preprocessing_svm():
    """Check output shapes of a preprocessing + SVC pipeline on iris.

    Runs once with a ``StandardScaler`` and once with a 2-component ``PCA``
    in front of the same ``SVC`` instance.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    per_class_shape = (n_samples, n_classes)
    per_class_methods = ('predict_proba', 'predict_log_proba',
                         'decision_function')

    for preprocessing in (scaler, pca):
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # Labels are one per sample; every other output is one row per
        # sample and one column per class.
        assert_equal(pipe.predict(X).shape, (n_samples, ))
        for method_name in per_class_methods:
            output = getattr(pipe, method_name)(X)
            assert_equal(output.shape, per_class_shape)

        pipe.score(X, y)
def Predict(data, mode):
    """Fit a TF-IDF + SVD/similarity + SVM pipeline and predict the test set.

    Args:
        data: ``(train, test)`` pair. The code reads
            ``train.median_relevance``, ``test.id`` and the
            ``query_preprocessed`` / ``product_title_preprocessed`` columns
            of both frames.
        mode: ``'eda'`` for the plain pipeline, or ``'sampling'`` to insert
            an ``SVMSMOTE`` step in front of the SVM.

    Returns:
        ``(submission, submission_probas)``: a DataFrame with ``id`` and
        ``prediction`` columns, and a DataFrame of ``predict_proba`` output
        indexed by the test ids.

    Raises:
        ValueError: if ``mode`` is neither ``'eda'`` nor ``'sampling'``.
    """
    supported_modes = ('eda', 'sampling')
    # Fail fast: previously an unknown mode only surfaced as an unbound
    # ``clf`` after the whole TF-IDF vocabulary had been fitted.
    if mode not in supported_modes:
        raise ValueError(
            'unsupported mode %r; expected one of %r' % (mode, supported_modes)
        )

    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    def _column_as_text(frame, column):
        # Row-wise '%s' formatting turns every cell into its string form.
        return list(frame.apply(lambda row: '%s' % row[column], axis=1))

    train_query = _column_as_text(train, 'query_preprocessed')
    train_title = _column_as_text(train, 'product_title_preprocessed')
    test_query = _column_as_text(test, 'query_preprocessed')
    test_title = _column_as_text(test, 'product_title_preprocessed')

    # The original also built a custom stop-word list (html/css tokens and
    # digits) but overwrote it immediately with this one, so it never took
    # effect. Only the effective list is kept.
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(
        min_df=7,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words=stop_words,
    )
    # One shared vocabulary for queries and titles.
    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    svd = TruncatedSVD(n_components=200)
    scl = StandardScaler(with_mean=False)
    svm = SVC(C=10, gamma="auto", kernel="rbf", class_weight=None,
              probability=True)

    steps = [
        ('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
        ('scl', scl),
    ]
    if mode == 'sampling':
        # Oversample after scaling and before the classifier.
        steps.append(('sampling', SVMSMOTE(svm_estimator=svm, k_neighbors=4)))
    steps.append(('svm', svm))
    clf = Pipeline(steps)

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)
    return submission, submission_probas
def test_set_pipeline_steps():
    """Steps can be replaced via attribute assignment and ``set_params``."""
    first = Transf()
    second = Transf()
    pipeline = Pipeline([("mock", first)])
    assert pipeline.named_steps["mock"] is first

    # Assigning ``steps`` directly swaps the whole step list.
    pipeline.steps = [("mock2", second)]
    assert "mock" not in pipeline.named_steps
    assert pipeline.named_steps["mock2"] is second
    assert [("mock2", second)] == pipeline.steps

    # ``set_params(steps=...)`` does the same.
    pipeline.set_params(steps=[("mock", first)])
    assert [("mock", first)] == pipeline.steps

    # A single step is replaced by using its name as the keyword.
    pipeline.set_params(mock=second)
    assert [("mock", second)] == pipeline.steps

    # A step that is not an estimator is only rejected when fitting.
    pipeline.set_params(steps=[("junk", ())])
    for method_name in ("fit", "fit_transform"):
        with raises(TypeError):
            getattr(pipeline, method_name)([[1]], [1])
def test_set_pipeline_steps():
    """Replace pipeline steps in three ways, then fit an invalid step list."""
    original_step = Transf()
    replacement_step = Transf()
    pipeline = Pipeline([('mock', original_step)])
    assert pipeline.named_steps['mock'] is original_step

    # 1) plain attribute assignment
    pipeline.steps = [('mock2', replacement_step)]
    assert 'mock' not in pipeline.named_steps
    assert pipeline.named_steps['mock2'] is replacement_step
    assert [('mock2', replacement_step)] == pipeline.steps

    # 2) set_params with the complete step list
    pipeline.set_params(steps=[('mock', original_step)])
    assert [('mock', original_step)] == pipeline.steps

    # 3) set_params addressing one step by name
    pipeline.set_params(mock=replacement_step)
    assert [('mock', replacement_step)] == pipeline.steps

    # An empty tuple is not an estimator: both fit entry points must fail.
    pipeline.set_params(steps=[('junk', ())])
    sample_X, sample_y = [[1]], [1]
    with raises(TypeError):
        pipeline.fit(sample_X, sample_y)
    with raises(TypeError):
        pipeline.fit_transform(sample_X, sample_y)
def test_pipeline_sample():
    """A pipeline ending in a sampler must resample like the sampler itself.

    Covers ``fit(...).sample``, ``fit_sample`` and a PCA step placed in
    front of the sampler.
    """
    dataset_options = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # Sampler-only pipeline: all three entry points must agree.
    via_sample = pipeline.fit(X, y).sample(X, y)
    via_fit_sample = pipeline.fit_sample(X, y)
    via_sampler = rus.fit_sample(X, y)
    for position in (0, 1):  # 0 -> X, 1 -> y
        for other in (via_fit_sample, via_sampler):
            assert_array_almost_equal(via_sample[position], other[position])

    # PCA followed by the sampler must match applying both by hand.
    pca = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus)])
    X_piped, y_piped = pipeline.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)
    assert_array_almost_equal(X_piped, X_manual)
    assert_array_almost_equal(y_piped, y_manual)
def test_pipeline_score_samples_pca_lof():
    """``score_samples`` on a pipeline equals running its steps by hand.

    The pipeline is under-sampler -> PCA -> LocalOutlierFactor; its
    ``score_samples`` output is checked for shape and compared against
    applying the same fitted steps separately.
    """
    dataset_options = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.3, 0.7],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=500,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    rus = RandomUnderSampler(random_state=42)
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([("rus", rus), ("pca", pca), ("lof", lof)])
    pipe.fit(X, y)

    # One score per input sample.
    expected_shape = (X.shape[0], )
    assert pipe.score_samples(X).shape == expected_shape

    # Refit the individual steps manually and compare the values.
    X_res, _ = rus.fit_resample(X, y)
    lof.fit(pca.fit_transform(X_res))
    pipeline_scores = pipe.score_samples(X)
    manual_scores = lof.score_samples(pca.transform(X))
    assert_allclose(pipeline_scores, manual_scores)
def test_pipeline_methods_rus_pca_svm():
    """Smoke-test the prediction methods of under-sampler + PCA + SVC."""
    dataset_options = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)])
    pipe.fit(X, y)

    # Only checks that each method runs; return values are not inspected.
    for method_name in ("predict", "predict_proba", "predict_log_proba"):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_predict_with_predict_params():
    """Keyword arguments given to ``predict`` reach the final estimator."""
    classifier = DummyEstimatorParams()
    pipe = Pipeline([("transf", Transf()), ("clf", classifier)])
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)

    # Look the step up through the pipeline, as a caller would.
    assert pipe.named_steps["clf"].got_attribute
def test_pipeline_methods_preprocessing_svm():
    """Verify output shapes of scaler+SVC and PCA+SVC pipelines on iris."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_count = X.shape[0]
    class_count = len(np.unique(y))

    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')
    preprocessors = [StandardScaler(), PCA(n_components=2)]

    for preprocessing in preprocessors:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # predict -> one label per sample
        labels = pipe.predict(X)
        assert_equal(labels.shape, (sample_count,))

        # probability-like outputs -> one column per class
        probabilities = pipe.predict_proba(X)
        assert_equal(probabilities.shape, (sample_count, class_count))

        log_probabilities = pipe.predict_log_proba(X)
        assert_equal(log_probabilities.shape, (sample_count, class_count))

        decision_values = pipe.decision_function(X)
        assert_equal(decision_values.shape, (sample_count, class_count))

        pipe.score(X, y)
def find_expert(tag):
    """Print the 20 users the model rates most likely to be potential experts.

    For topic tag ``tag`` the tuned parameters and the data are loaded, an
    ``ADASYN`` + ``XGBClassifier`` pipeline is fitted on each training part
    of a 4-fold stratified split, and the positive-class probability of the
    matching held-out part is collected. The 20 highest probabilities are
    printed as percentages.

    Args:
        tag: topic tag passed to ``best_solution`` and ``load_data``.
    """
    n_splits = 4
    fold = StratifiedKFold(n_splits=n_splits)
    params = best_solution(tag)
    data, target, ratio = load_data(tag)
    seed = int(params['seed'])
    # NOTE(review): the seed is assigned after construction while
    # ``shuffle`` keeps its default; confirm whether it is meant to
    # influence the split.
    fold.random_state = seed

    samp = ADASYN(
        n_neighbors=2,
        sampling_strategy=float(params['sampling_strategy']) * ratio,
        random_state=seed,
    )
    clf = XGBClassifier(
        n_estimators=int(params['n_estimators']),
        gamma=float(params['gamma']),
        eta=float(params['eta']),
        reg_lambda=int(params['reg_lambda']),
        verbosity=0,
        n_jobs=-1,
        random_state=seed,
    )
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])

    # Collect the per-fold frames and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and re-copied the frame on every call.
    fold_frames = []
    for train, test in tqdm(fold.split(data, target), total=n_splits):
        pipeline.fit(data.iloc[train], target.iloc[train])
        pred_proba = pd.Series(
            pipeline.predict_proba(data.iloc[test])[:, 1],
            index=target.iloc[test].index,
            name='probability',
        )
        fold_frames.append(pred_proba.to_frame().reset_index())

    # The empty frame comes first so the result keeps the same column set
    # and order the appended version produced.
    experts = pd.concat(
        [pd.DataFrame(columns=['id', 'probability']), *fold_frames]
    )
    experts = experts.sort_values(by=['probability'],
                                  ascending=False).iloc[:20]
    experts['probability'] = experts['probability'].astype(float).map(
        "{:.1%}".format)
    print(experts.to_string(index=False))
def illigal_genralization_checking(self, X_test, y_test):
    """Cross-validate on ``self.df`` and score against an external test set.

    An ``XGBClassifier`` wrapped in a one-step pipeline is scored with
    5-fold stratified cross-validation (precision) on
    ``self.df[self.features]`` / ``self.df[self.target]``, refitted on all
    of it, and then evaluated on ``X_test`` against the integer-cast
    ``intrusion_cutoff`` column of ``y_test``. All results are printed.
    """
    train_features = self.df[self.features]
    X_test = X_test[self.features]
    train_target = self.df[self.target]

    classifier = XGBClassifier(
        n_estimators=1000, scale_pos_weight=3, reg_alpha=1)
    pipe = Pipeline(steps=[('classifier', classifier)])

    y_test = y_test["intrusion_cutoff"].apply(lambda value: int(value))

    scores = cross_val_score(
        pipe,
        train_features,
        train_target,
        scoring='precision',
        cv=StratifiedKFold(5),
    )
    print(self.features)
    print("cross vl scores")
    # Mean over the five folds.
    print(sum(scores) / 5)

    pipe.fit(train_features, train_target.values)
    y_pred = pipe.predict(X_test)

    metric_functions = (accuracy_score, f1_score, recall_score,
                        precision_score)
    acc, f1, recall, precision = [
        metric(y_test, y_pred) for metric in metric_functions
    ]
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_methods_preprocessing_svm():
    """Check prediction output shapes for scaler+SVC and PCA+SVC on iris."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver="randomized", whiten=True)
    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )

    # Expected output shape for every method that is checked.
    expected_shapes = (
        ("predict", (n_samples, )),
        ("predict_proba", (n_samples, n_classes)),
        ("predict_log_proba", (n_samples, n_classes)),
        ("decision_function", (n_samples, n_classes)),
    )

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        for method_name, expected_shape in expected_shapes:
            output = getattr(pipe, method_name)(X)
            assert output.shape == expected_shape

        pipe.score(X, y)
def main():
    """Train a logistic regression, an attempt to be 'production' grade.

    Reads the processed data set, runs a 5-fold stratified
    cross-validation in which a ``SMOTE`` + randomized-search pipeline is
    fitted per fold, averages accuracy/precision/recall/F1 over the folds,
    prints them and writes them to ``../../models/metrics.txt``, and
    finally serialises the search object with joblib.

    Returns:
        The ``RandomizedSearchCV`` object as left by the last fold's fit.
    """
    logger = logging.getLogger(__name__)
    logger.info('Reading data')
    processed_df = pd.read_csv('../../data/processed/processed.csv')
    X = processed_df.drop('Class', axis=1).values
    y = processed_df['Class'].values

    accuracy_lst = []
    precision_lst = []
    recall_lst = []
    f1_lst = []

    rand_log_reg = RandomizedSearchCV(
        baseline_classifiers['LogisticRegression'],
        LogisticRegression_rndm_params,
        n_iter=4,
    )
    skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

    logger.info('Constructing model pipeline and cross validating')
    for idx, (train, test) in enumerate(skf.split(X=X, y=y), start=1):
        logger.info('Run %s', idx)
        # The sampler is part of the pipeline, so it is fitted on the
        # training indices of the fold only.
        model = Pipeline([
            ('sampling', SMOTE(sampling_strategy='minority')),
            ('classification', rand_log_reg),
        ])
        model.fit(X[train], y[train])

        best_estimators = rand_log_reg.best_estimator_
        prediction = best_estimators.predict(X[test])

        accuracy_lst.append(model.score(X[test], y[test]))
        precision_lst.append(precision_score(y[test], prediction))
        recall_lst.append(recall_score(y[test], prediction))
        f1_lst.append(f1_score(y[test], prediction))

    metrics = (
        f" Accuracy: {mean(accuracy_lst)} \n"
        f" Precision: {mean(precision_lst)} \n"
        f" Recall: {mean(recall_lst)} \n"
        f" F1: {mean(f1_lst)} "
    )
    print(metrics)

    # The context manager closes the file even if the write fails.
    with open('../../models/metrics.txt', 'w') as metrics_file:
        metrics_file.write(metrics)

    joblib.dump(rand_log_reg, f'../../models/{best_model_file_name}',
                compress=9)
    logger.info('Serialised model as %s', best_model_file_name)

    return rand_log_reg
def test_row_selector_pipeline_integration():
    """A RowSelector step followed by LinearRegression fits without error."""
    selector = RowSelector(sampling_strategy=0.8, selection_strategy=0)
    steps = [('selector', selector), ('lr', LinearRegression())]
    pipeline = Pipeline(steps)
    # ``X`` and ``y`` are taken from the enclosing module scope.
    pipeline.fit(X, y)
def test_pipeline_sample_weight_unsupported():
    """``sample_weight=None`` is dropped; a real weight array is rejected.

    The final step's ``score`` accepts no ``sample_weight``, so the
    pipeline must only forward the argument when it is not ``None``.
    """
    data = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", Mult())])
    pipe.fit(data, y=None)

    assert pipe.score(data) == 3
    assert pipe.score(data, sample_weight=None) == 3

    weights = np.array([2, 3])
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(data, sample_weight=weights)
def test_pipeline_sample_weight_unsupported():
    """Scoring with weights fails when the last step does not accept them."""
    X = np.array([[1, 2]])
    steps = [('transf', Transf()), ('clf', Mult())]
    pipe = Pipeline(steps)
    pipe.fit(X, y=None)

    # A ``None`` weight must not be passed down to the estimator.
    unweighted_score = pipe.score(X)
    assert unweighted_score == 3
    none_weight_score = pipe.score(X, sample_weight=None)
    assert none_weight_score == 3

    # A real weight array is forwarded and rejected by the estimator.
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(X, sample_weight=np.array([2, 3]))
def test_pipeline_sample_weight_supported():
    """``sample_weight`` given to ``Pipeline.score`` reaches the last step."""
    data = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(data, y=None)

    # Without an effective weight the score is 3 however it is spelled.
    assert pipe.score(data) == 3
    assert pipe.score(data, y=None) == 3
    assert pipe.score(data, y=None, sample_weight=None) == 3

    # A real weight array changes the result, so it was forwarded.
    weights = np.array([2, 3])
    assert pipe.score(data, sample_weight=weights) == 8
def test_pipeline_sample_weight_supported():
    """Check that the pipeline forwards ``sample_weight`` when scoring."""
    X = np.array([[1, 2]])
    steps = [('transf', Transf()), ('clf', FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit(X, y=None)

    # Three equivalent ways of scoring without weights.
    assert_equal(pipe.score(X), 3)
    assert_equal(pipe.score(X, y=None), 3)
    assert_equal(pipe.score(X, y=None, sample_weight=None), 3)

    # Weighted scoring gives a different, known value.
    weighted_score = pipe.score(X, sample_weight=np.array([2, 3]))
    assert_equal(weighted_score, 8)
def test_pipeline_sample_weight_supported():
    """The final estimator must receive ``sample_weight`` from the pipeline."""
    features = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(features, y=None)

    baseline = 3
    assert pipe.score(features) == baseline
    assert pipe.score(features, y=None) == baseline
    assert pipe.score(features, y=None, sample_weight=None) == baseline

    # With weights the estimator reports a different score.
    assert pipe.score(features, sample_weight=np.array([2, 3])) == 8
def test_pipeline_init_tuple():
    """A pipeline built from a tuple of steps can be fitted and scored.

    The check is repeated after the first step has been replaced by
    ``"passthrough"`` through ``set_params``.
    """
    X = np.array([[1, 2]])
    steps = (("transf", Transf()), ("clf", FitParamT()))
    pipe = Pipeline(steps)

    def fit_and_score():
        pipe.fit(X, y=None)
        pipe.score(X)

    fit_and_score()
    pipe.set_params(transf="passthrough")
    fit_and_score()
def test_pipeline_fit_params():
    """Step-prefixed fit parameters are routed to the named step only."""
    pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)

    # The classifier received ``should_succeed`` and therefore predicts True.
    assert_true(pipe.predict(None))

    # The transformer's own parameters were left alone.
    for attribute in ('a', 'b'):
        assert_true(getattr(pipe.named_steps['transf'], attribute) is None)
class ImblearnRecalibrator(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that recalibrates the bias introduced by resampling.

    Bundles an imblearn resampler and a scikit-learn estimator in one
    pipeline so that the recalibration code does not have to be rewritten
    each time: specify the estimator, the resampler and the sampling rate,
    then call ``fit`` and ``predict`` / ``predict_proba``.

    Note: resampling of imbalanced data is aimed at classification
    performance; whether it helps discrimination-style metrics is not
    known.

    :param estimator: estimator object following the scikit-learn API.
    :param resampler: one of the resampler objects used with imblearn.
    :param post_minor_rate: share of minority examples **among all
        examples after resampling**. Default is None. Use either this or
        ``alpha``.
    :param alpha: share of minority examples after resampling **relative
        to before resampling**. Default is ``'auto'``. Use either this or
        ``post_minor_rate``. Because the default counts as specified,
        passing only ``post_minor_rate`` also emits the "both specified"
        warning; pass ``alpha=None`` to avoid it.
    :raises ValueError: if the combination of ``alpha`` and
        ``post_minor_rate`` selects no strategy (a falsy value other than
        None, e.g. ``0``).
    """

    def __init__(self, estimator, resampler, alpha='auto',
                 post_minor_rate=None):
        # Store the constructor arguments under their own names:
        # BaseEstimator.get_params (and therefore repr/clone) reads them
        # back by attribute lookup.
        self.estimator = estimator
        self.resampler = resampler
        self.alpha = alpha
        self.post_minor_rate = post_minor_rate

        # Work on a copy so the caller's resampler is never reconfigured.
        resampler = clone(resampler)
        if post_minor_rate is None and alpha is None:
            warnings.warn(
                'neither of `post_minor_rate` nor `alpha` are specified. Instead resampling stragegy specified in `resampler` object is used.'
            )
            # Previously no strategy was recorded here, so ``fit`` failed
            # on the missing attribute.
            self.resampling_strategy = 'resampler'
        elif post_minor_rate and alpha:
            warnings.warn(
                'both of `post_minor_rate` and `alpha` are specified. the former is applied.'
            )
            self.resampling_strategy = 'posterior_rate'
        elif post_minor_rate:
            self.resampling_strategy = 'posterior_rate'
        elif alpha:
            self.resampling_strategy = 'alpha'
            resampler.set_params(sampling_strategy=alpha)
        else:
            # The original raised a bare string, which is itself a
            # TypeError in Python 3.
            raise ValueError(
                'initialized error: `alpha` and `post_minor_rate` must be '
                'None or a non-zero value'
            )
        self.estimator_ = Pipeline([('resampler', resampler),
                                    ('estimator', clone(estimator))])

    def fit(self, X, y):
        """Configure the resampler if needed, fit the pipeline, return self.

        Also records ``minor_rate_``, the smaller of ``y.mean()`` and
        ``1 - y.mean()``.
        """
        if self.resampling_strategy == 'posterior_rate':
            alpha = get_oversampling_rate(self.post_minor_rate)
            self.alpha = alpha
            self.estimator_['resampler'].set_params(sampling_strategy=alpha)
        elif self.resampling_strategy == 'resampler':
            # NOTE(review): recalibration needs the rate actually used;
            # this reads it from the resampler's own ``sampling_strategy``.
            # Confirm that ``get_oversampling_power`` accepts that value.
            self.alpha = self.estimator_['resampler'].get_params().get(
                'sampling_strategy')
        self.estimator_.fit(X, y)
        self.minor_rate_ = np.min([y.mean(), 1 - y.mean()])
        return self

    def predict(self, X):
        """Return the class predictions of the fitted pipeline."""
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        """Return the pipeline's probabilities after recalibration."""
        return calibrate_imbalanceness(
            self.estimator_.predict_proba(X),
            pos_rate=get_oversampling_power(self.alpha, self.minor_rate_))
def test_pipeline_correctly_adjusts_steps(passthrough):
    """Fitting must keep the step names, including a pass-through step."""
    X = np.array([[1]])
    y = np.array([1])

    expected_names = ['m2', 'bad', 'm3', 'm5']
    step_objects = [Mult(mult=2), passthrough, Mult(mult=3), Mult(mult=5)]
    pipeline = Pipeline(list(zip(expected_names, step_objects)))
    pipeline.fit(X, y)

    actual_names = [step[0] for step in pipeline.steps]
    assert expected_names == actual_names
def test_pipeline_correctly_adjusts_steps(passthrough):
    """Step names and their order are unchanged after ``fit``."""
    features = np.array([[1]])
    target = np.array([1])

    double = Mult(mult=2)
    triple = Mult(mult=3)
    quintuple = Mult(mult=5)
    steps = [
        ("m2", double),
        ("bad", passthrough),
        ("m3", triple),
        ("m5", quintuple),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(features, target)

    names_after_fit = []
    for name, _ in pipeline.steps:
        names_after_fit.append(name)
    assert ["m2", "bad", "m3", "m5"] == names_after_fit
def test_pipeline_fit_params():
    """Fit parameters are routed by step prefix; unknown ones are rejected."""
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)

    # The classifier got ``should_succeed`` and therefore predicts True.
    assert pipe.predict(None)

    # Nothing was set on the transformer.
    transformer = pipe.named_steps["transf"]
    assert transformer.a is None
    assert transformer.b is None

    # A parameter the classifier's fit does not know is an error.
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
def test_pipeline_fit_params():
    """The pipeline forwards ``<step>__<param>`` fit arguments to that step."""
    steps = [('transf', Transf()), ('clf', FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit(X=None, y=None, clf__should_succeed=True)

    # Prediction reflects the flag passed to the classifier's fit.
    prediction = pipe.predict(None)
    assert prediction

    # The transformer parameters are untouched.
    for attribute in ('a', 'b'):
        assert getattr(pipe.named_steps['transf'], attribute) is None

    # Parameters unknown to the target step raise a TypeError.
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
def test_pipeline_sample_weight_unsupported():
    """``sample_weight=None`` is not forwarded; an array raises TypeError."""
    data = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())])
    pipe.fit(data, y=None)

    assert pipe.score(data) == 3
    assert pipe.score(data, sample_weight=None) == 3

    # The final estimator's score() has no ``sample_weight`` parameter.
    expected_message = (
        "score() got an unexpected keyword argument 'sample_weight'"
    )
    weights = np.array([2, 3])
    assert_raise_message(
        TypeError, expected_message, pipe.score, data, sample_weight=weights
    )
def test_pipeline_sample_transform():
    """A pipeline with a sampler between two PCA steps can fit and transform."""
    dataset_options = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    rus = RandomUnderSampler(random_state=0)
    first_pca = PCA()
    second_pca = PCA()
    steps = [('pca', first_pca), ('rus', rus), ('pca2', second_pca)]
    pipeline = Pipeline(steps)

    # Only checks that the calls succeed.
    fitted = pipeline.fit(X, y)
    fitted.transform(X)
def test_pipeline_wrong_memory():
    """Fitting with ``memory`` set to an integer must raise ValueError."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # An integer is neither a path string nor a Memory instance.
    invalid_memory = 1
    steps = [('transf', DummyTransf()), ('svc', SVC())]
    cached_pipe = Pipeline(steps, memory=invalid_memory)

    error_regex = ("'memory' should either be a string or a joblib.Memory"
                   " instance, got 'memory=1' instead.")
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(X, y)
def test_pipeline_memory_transformer():
    """A cached pipeline must behave like an uncached one and reuse its cache.

    Compares an uncached and a ``memory``-backed pipeline after a first
    fit, after a second fit, and for a fresh pipeline with a different
    step name that shares the same cache directory. The transformer's
    ``timestamp_`` must stay equal to the one from the first fit.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    def assert_same_results(reference, cached, cached_step):
        # Every prediction method, the score and the transformer means
        # must be identical.
        for method_name in ('predict', 'predict_proba',
                            'predict_log_proba'):
            assert_array_equal(getattr(reference, method_name)(X),
                               getattr(cached, method_name)(X))
        assert_array_equal(reference.score(X, y), cached.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           cached.named_steps[cached_step].means_)

    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)

        # Transformer + SVC, once uncached and once cached.
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # The first fit memoizes the transformer.
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Timestamp of the transformer held by the cached pipeline.
        expected_ts = cached_pipe.named_steps['transf'].timestamp_

        assert_same_results(pipe, cached_pipe, 'transf')
        # The transformer passed in is expected to stay unfitted.
        assert not hasattr(transf, 'means_')

        # A second fit must be served from the cache.
        cached_pipe.fit(X, y)
        assert_same_results(pipe, cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # New estimators under a different step name still hit the cache.
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        assert_same_results(pipe, cached_pipe_2, 'transf_2')
        assert (cached_pipe_2.named_steps['transf_2'].timestamp_
                == expected_ts)
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_methods_pca_svm():
    """Smoke-test the prediction methods of a PCA + SVC pipeline on iris."""
    iris = load_iris()
    X, y = iris.data, iris.target

    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)

    # Each method only has to run; results are not inspected.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Smoke-test an ANOVA feature filter + logistic regression pipeline."""
    iris = load_iris()
    X, y = iris.data, iris.target

    clf = LogisticRegression()
    anova_filter = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', anova_filter), ('logistic', clf)])
    pipe.fit(X, y)

    # Exercise every prediction method once.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Run all prediction methods of SelectKBest + LogisticRegression."""
    iris = load_iris()
    features = iris.data
    labels = iris.target

    clf = LogisticRegression(solver="lbfgs", multi_class="auto")
    filter1 = SelectKBest(f_classif, k=2)
    steps = [("anova", filter1), ("logistic", clf)]
    pipe = Pipeline(steps)

    pipe.fit(features, labels)
    pipe.predict(features)
    pipe.predict_proba(features)
    pipe.predict_log_proba(features)
    pipe.score(features, labels)
def test_pipeline_wrong_memory():
    """An integer ``memory`` argument is rejected when the pipeline is fitted."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # Neither a string nor a Memory-like object.
    memory = 1
    steps = [("transf", DummyTransf()), ("svc", SVC(gamma="scale"))]
    cached_pipe = Pipeline(steps, memory=memory)

    with raises(ValueError, match="string or have the same interface as"):
        cached_pipe.fit(X, y)
def test_pipeline_methods_pca_svm():
    """Smoke-test a whitened MLE-PCA + SVC pipeline on iris."""
    iris = load_iris()
    X, y = iris.data, iris.target

    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    pipe = Pipeline([("pca", pca), ("svc", clf)])
    pipe.fit(X, y)

    # Each method only has to run without raising.
    for method_name in ("predict", "predict_proba", "predict_log_proba"):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_wrong_memory():
    """``fit`` raises ValueError when ``memory`` is an integer."""
    iris = load_iris()
    features = iris.data
    labels = iris.target

    # Define memory as an integer, which is not a valid cache target.
    bad_memory = 1
    cached_pipe = Pipeline(
        [('transf', DummyTransf()), ('svc', SVC(gamma='scale'))],
        memory=bad_memory,
    )

    error_regex = "string or have the same interface as"
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(features, labels)
def test_pipeline_methods_pca_svm():
    """Run all prediction methods of a PCA + SVC pipeline on iris."""
    iris = load_iris()
    features = iris.data
    labels = iris.target

    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    clf = SVC(gamma='scale', probability=True, random_state=0)
    steps = [('pca', pca), ('svc', clf)]
    pipe = Pipeline(steps)

    pipe.fit(features, labels)
    pipe.predict(features)
    pipe.predict_proba(features)
    pipe.predict_log_proba(features)
    pipe.score(features, labels)
def test_pipeline_methods_anova_rus():
    """Smoke-test under-sampler + ANOVA filter + logistic regression."""
    dataset_options = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    anova_filter = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', anova_filter),
                     ('logistic', clf)])
    pipe.fit(X, y)

    # Each method only has to run; outputs are not checked.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_rus_pca_svm():
    """Run all prediction methods of an under-sampler + PCA + SVC pipeline."""
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    classifier = SVC(probability=True, random_state=0)
    projection = PCA()
    under_sampler = RandomUnderSampler(random_state=0)
    steps = [('rus', under_sampler), ('pca', projection),
             ('svc', classifier)]
    pipe = Pipeline(steps)

    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def illigal_genralization_checking(self, X_test, y_test):
    """Print cross-validation precision and external test-set metrics.

    Trains an ``XGBClassifier`` pipeline on ``self.df`` (columns
    ``self.features`` -> ``self.target``) and evaluates it on ``X_test``
    against the integer-cast ``intrusion_cutoff`` column of ``y_test``.
    """
    X = self.df[self.features]
    Y = self.df[self.target]
    X_test = X_test[self.features]

    pipe = Pipeline(steps=[
        ('classifier',
         XGBClassifier(n_estimators=1000, scale_pos_weight=3, reg_alpha=1)),
    ])
    y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))

    # Precision over a 5-fold stratified split of the training frame.
    fold_scores = cross_val_score(pipe, X, Y, scoring='precision',
                                  cv=StratifiedKFold(5))
    print(self.features)
    print("cross vl scores")
    print(sum(fold_scores)/5)

    # Refit on all training rows before scoring the external set.
    pipe.fit(X, Y.values)
    predictions = pipe.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)

    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def three_models_combined(self, intrusion_features, avoidance_features, hypertension_features):
    """Train one classifier per symptom cluster and AND their test predictions.

    Rows of ``self.df`` with a missing intrusion / avoidance / hypertension
    cutoff are dropped (this permanently filters ``self.df``).  The remaining
    data is split 75/25, stratified on ``self.target``.  Each cluster model is
    cross-validated (precision, 5 stratified folds) and fitted on the training
    part; the product of the three test predictions is scored against
    ``self.target``.  All results are printed; nothing is returned.

    params:
        intrusion_features: columns fed to the intrusion model
        avoidance_features: columns fed to the avoidance model
        hypertension_features: columns fed to the hypertension model
    """
    for cutoff_column in ("intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"):
        self.df = self.df[~self.df[cutoff_column].isna()]
    print("self.df.shape", self.df.shape)

    all_Y = [self.target, "intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"]
    X_train, X_test, y_train, y_test = train_test_split(
        self.df, self.df[all_Y], test_size=0.25, random_state=8526566,
        stratify=self.df[self.target])

    def _cross_validate_and_fit(label, pipe, feature_names, target_column):
        # Report the mean CV precision for one cluster, then fit on all training rows.
        X = X_train[feature_names].values
        y = y_train[target_column].apply(lambda x: int(x))
        scores = cross_val_score(pipe, X, y, scoring='precision', cv=StratifiedKFold(5))
        print(f"{label} {sum(scores)/len(scores)}")
        pipe.fit(X, y)
        return pipe

    # intrusion (the step is historically named 'rfe' although it holds a sampler;
    # the name is kept because it is part of the pipeline's parameter names)
    pipe_intrusion = _cross_validate_and_fit(
        "intrusion",
        Pipeline(steps=[
            ('rfe', BorderlineSMOTE()),
            ('classifier', XGBClassifier(n_estimators=100, reg_alpha=1))]),
        intrusion_features, "intrusion_cutoff")

    # avoidance
    pipe_avoidance = _cross_validate_and_fit(
        "avoidance",
        Pipeline(steps=[
            ('classifier', XGBClassifier(n_estimators=100, scale_pos_weight=3, reg_alpha=1))]),
        avoidance_features, "avoidance_cutoff")

    # hypertension
    pipe_hypertension = _cross_validate_and_fit(
        "hypertension",
        Pipeline(steps=[
            ('classifier', BalancedBaggingClassifier(n_estimators=100))]),
        hypertension_features, "hypertention_cutoff")

    # Combine the three classifiers: positive only when all three are positive.
    y_pred_hypertension = pipe_hypertension.predict(X_test[hypertension_features].values)
    y_pred_avoidance = pipe_avoidance.predict(X_test[avoidance_features].values)
    y_pred_intrusion = pipe_intrusion.predict(X_test[intrusion_features].values)
    y_pred = (y_pred_hypertension * y_pred_avoidance * y_pred_intrusion)

    # y_test only carries the columns in all_Y, so the final label has to be
    # looked up through self.target; a hard-coded "PCL_Strict3" raised KeyError
    # for every other target.
    y_target = y_test[self.target].apply(lambda x: int(x))

    acc = accuracy_score(y_target, y_pred)
    f1 = f1_score(y_target, y_pred)
    recall = recall_score(y_target, y_pred)
    precision = precision_score(y_target, y_pred)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_sample():
    """Check a pipeline whose last step is a sampler, including ``Pipeline.sample``."""
    X, y = make_classification(
        n_samples=5000, n_features=20, n_informative=3, n_redundant=1,
        n_classes=2, n_clusters_per_class=1, weights=[0.1, 0.9],
        class_sep=2, flip_y=0, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # fit().sample(), fit_sample() and the bare sampler must all agree.
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    for X_other in (X_trans2, X_trans3):
        assert_allclose(X_trans, X_other, rtol=R_TOL)
    for y_other in (y_trans2, y_trans3):
        assert_allclose(y_trans, y_other, rtol=R_TOL)

    # Same check with a transformer in front of the sampler.
    pca = PCA()
    pipeline = Pipeline([('pca', PCA()), ('rus', rus)])

    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)

    # Values close to zero are rounded to exactly zero before comparing;
    # PCA output appears to be unstable around zero.
    for matrix in (X_trans, X_trans2):
        matrix[(matrix < R_TOL) & (matrix > -R_TOL)] = 0
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
def test_pipeline_transform():
    """Check a pipeline ending in a transformer: transform and inverse_transform."""
    X = load_iris().data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # fit().transform(), fit_transform() and the bare PCA must all agree.
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    for other in (X_trans2, X_trans3):
        assert_array_almost_equal(X_trans, other)

    # Going back through the pipeline must match going back through the PCA.
    assert_array_almost_equal(
        pipeline.inverse_transform(X_trans),
        pca.inverse_transform(X_trans))
class TargetEnsembler(object):
    """AND-ensemble of per-target binary classifiers.

    One pipeline is trained for every target whose on/off switch is truthy.
    The switches (``intrusion``, ``avoidance``, ``hypertension``, ...) are free
    names that are not defined in this class and are looked up when ``fit`` /
    ``predict`` run.  A disabled target contributes the neutral value ``1`` to
    the bitwise AND, so it never vetoes a positive prediction.

    Each fitted pipeline is stored as ``self.pipe_<name>`` (for example
    ``self.pipe_intrusion``).
    """

    # (switch name / attribute suffix, label column), in training order.
    _TARGETS = (
        ("intrusion", "intrusion_cutoff"),
        ("avoidance", "avoidance_cutoff"),
        ("hypertension", "hypertention_cutoff"),
        ("depression", "depression_cutoff"),
        ("only_avoidance", "only_avoidance_cutoff"),
        ("PCL_Strict3", "PCL_Strict3"),
        ("regression_cutoff_33", "regression_cutoff_33"),
        ("regression_cutoff_50", "regression_cutoff_50"),
        ("tred_cutoff", "tred_cutoff"),
    )

    # Left-to-right order in which the per-target predictions are AND-ed.
    _COMBINE_ORDER = (
        "hypertension", "avoidance", "intrusion", "depression",
        "only_avoidance", "PCL_Strict3", "regression_cutoff_33",
        "regression_cutoff_50", "tred_cutoff",
    )

    def __init__(self, features):
        """Store the list of feature columns handed to feature engineering."""
        self.features = features

    @staticmethod
    def _switches():
        """Return the current value of every per-target on/off switch."""
        return {
            "intrusion": intrusion,
            "avoidance": avoidance,
            "hypertension": hypertension,
            "depression": depression,
            "only_avoidance": only_avoidance,
            "PCL_Strict3": PCL_Strict3,
            "regression_cutoff_33": regression_cutoff_33,
            "regression_cutoff_50": regression_cutoff_50,
            "tred_cutoff": tred_cutoff,
        }

    @staticmethod
    def _make_pipeline(name):
        """Build a fresh, unfitted pipeline for the target called *name*."""
        # Lambdas keep construction lazy: only the requested pipeline is built.
        builders = {
            "intrusion": lambda: Pipeline(steps=[
                ('feature_selection', SelectFpr(alpha=0.05)),
                ('sampling', BorderlineSMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(n_estimators=300, max_depth=5))]),
            "avoidance": lambda: Pipeline(steps=[
                ('feature_selection', RFE(
                    estimator=XGBClassifier(scale_pos_weight=5.88, n_estimators=100),
                    n_features_to_select=20)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=300, max_depth=10))]),
            "hypertension": lambda: Pipeline(steps=[
                ('feature_selection', RFE(
                    estimator=XGBClassifier(n_estimators=100, scale_pos_weight=3.51),
                    n_features_to_select=20)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=100))]),
            "depression": lambda: Pipeline(steps=[
                ('feature_selection', SelectFdr(alpha=0.1)),
                ('sampling', SMOTE(k_neighbors=5)),
                ('classifier', RandomForestClassifier(n_estimators=100))]),
            "only_avoidance": lambda: Pipeline(steps=[
                ('feature_selection', RFE(
                    XGBClassifier(n_estimators=100, max_depth=3),
                    n_features_to_select=10)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=500, max_depth=10))]),
            "PCL_Strict3": lambda: Pipeline(steps=[
                ('feature_selection', SelectKBest(k=20)),
                ('sampling', SMOTE(k_neighbors=5)),
                ('classifier', XGBClassifier(max_depth=3, n_estimators=100))]),
            "regression_cutoff_33": lambda: Pipeline(steps=[
                ('feature_selection', SelectFpr(alpha=0.033)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', RandomForestClassifier(n_estimators=100, max_depth=5))]),
            "regression_cutoff_50": lambda: Pipeline(steps=[
                ('feature_selection', SelectKBest(k=10)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(max_depth=2, n_estimators=100))]),
            "tred_cutoff": lambda: Pipeline(steps=[
                ('feature_selection', SelectKBest(k=20)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(n_estimators=100, max_depth=2))]),
        }
        return builders[name]()

    def _engineer(self, frame, target):
        """Return the engineered feature matrix of *frame* for *target*."""
        return FeatureEngineering(frame[self.features], target).engineer_features().values

    @classmethod
    def _combine(cls, predictions):
        """AND the per-target predictions; a missing target counts as ``1``."""
        combined = predictions.get(cls._COMBINE_ORDER[0], 1)
        for name in cls._COMBINE_ORDER[1:]:
            combined = combined & predictions.get(name, 1)
        return combined

    def fit(self, X_train, y_train):
        """Cross-validate and fit every enabled target, then print training scores.

        params:
            X_train: frame with the feature columns and the per-target label
                columns (every label except ``PCL_Strict3`` is read from here)
            y_train: labels; indexed with ``"PCL_Strict3"`` for that target and
                used as a whole as the reference for the combined prediction
        """
        switches = self._switches()
        train_matrices = {}
        for name, target in self._TARGETS:
            if not switches[name]:
                continue
            X = self._engineer(X_train, target)
            # Only the final PCL_Strict3 label is taken from y_train.
            label_source = y_train if target == "PCL_Strict3" else X_train
            y = label_source[target].apply(lambda x: int(x))

            pipeline = self._make_pipeline(name)
            setattr(self, "pipe_" + name, pipeline)
            scores = cross_val_score(pipeline, X, y, scoring='f1', cv=StratifiedKFold(5))
            print(f"{name} {sum(scores)/len(scores)}")
            pipeline.fit(X, y)
            train_matrices[name] = X

        # Training-set predictions of every fitted model, AND-ed together.
        predictions = {
            name: getattr(self, "pipe_" + name).predict(X)
            for name, X in train_matrices.items()
        }
        y_pred = self._combine(predictions)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

    def predict(self, X_test):
        """Return the bitwise AND of the enabled models' predictions for *X_test*.

        Returns the plain integer ``1`` when every switch is off.
        """
        switches = self._switches()
        predictions = {}
        for name, target in self._TARGETS:
            if switches[name]:
                features = self._engineer(X_test, target)
                predictions[name] = getattr(self, "pipe_" + name).predict(features)
        return self._combine(predictions)
def test_pipeline_memory_sampler():
    """Check that a pipeline created with ``memory`` caches its fitted sampler step."""
    X, y = make_classification(
        n_samples=5000, n_features=20, n_informative=3, n_redundant=1,
        n_classes=2, n_clusters_per_class=1, weights=[0.1, 0.9],
        class_sep=2, flip_y=0, random_state=0)

    def assert_same_behaviour(reference, candidate, candidate_step):
        # Both pipelines must give identical outputs and identical sampler state.
        for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
            assert_array_equal(getattr(reference, method_name)(X),
                               getattr(candidate, method_name)(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Sampler + SVC, once without and once with caching.
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # The first fit memoizes the sampler.
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Timestamp recorded on the sampler of the cached pipeline.
        expected_ts = cached_pipe.named_steps['transf'].timestamp_

        assert_same_behaviour(pipe, cached_pipe, 'transf')
        # The sampler instance handed to the cached pipeline stays unfitted.
        assert not hasattr(transf, 'means_')

        # A second fit must be served from the cache: same timestamp.
        cached_pipe.fit(X, y)
        assert_same_behaviour(pipe, cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # A new pipeline with fresh estimators and a renamed step must still
        # hit the same cache entry.
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        assert_same_behaviour(pipe, cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def fit(self, X_train, y_train):
    """Fit one pipeline per target and, optionally, a stacked model on top of them.

    For every target in ``self.targets_list`` the matching entry of
    ``pipeline_per_target`` is cross-validated (f1, 5 stratified folds),
    fitted, and stored in ``self.trained_pipelines[target]``.  Unless
    ``self.use_and_func`` is set, the per-target predictions then become the
    feature columns of ``self.combined_model``, which is trained against the
    ``"PCL_Strict3"`` label.

    params:
        X_train: frame with the feature columns and the per-target label
            columns (every label except ``PCL_Strict3`` is read from here)
        y_train: frame holding the ``"PCL_Strict3"`` label
    """
    predictions_list = []
    # One seed per call, reused for every target: all feature matrices have the
    # same number of rows, so an identical seed selects identical hold-out rows.
    # Independent random splits would stack predictions made for different
    # samples next to each other.
    split_seed = np.random.randint(np.iinfo(np.int32).max)

    for target in self.targets_list:
        if self.use_feature_engineering:
            X = FeatureEngineering(X_train[self.features], target).engineer_features().values
        else:
            X = X_train[self.features].values

        # Only the final PCL_Strict3 label is taken from y_train.
        if target == "PCL_Strict3":
            y = y_train[target].apply(lambda x: int(x))
        else:
            y = X_train[target].apply(lambda x: int(x))

        pipeline = pipeline_per_target[target]
        scores = cross_val_score(pipeline, X, y, scoring='f1', cv=StratifiedKFold(5))
        print(f"{target} - {sum(scores)/len(scores)}")

        if self.train_on_partial_prediction:
            # Hold out 25% so the stacked model is trained on predictions for
            # rows the per-target model has not seen.
            combined_y = pd.DataFrame(y, columns=[target])
            if target != "PCL_Strict3":
                combined_y["PCL_Strict3"] = y_train["PCL_Strict3"].apply(lambda x: int(x))
            _X_train, _X_test, _y_train, _y_test = train_test_split(
                X, combined_y, test_size=0.25, random_state=split_seed)

            self.trained_pipelines[target] = pipeline.fit(_X_train, _y_train[target])
            y_pred = self.trained_pipelines[target].predict(_X_test)
            predictions_list.append(y_pred)
            print("test f1", target, f1_score(_y_test[target], y_pred))

            # Refit on all rows for later use; the held-out predictions above
            # are what feeds the stacked model.
            self.trained_pipelines[target] = pipeline.fit(X, y)
            y = _y_test["PCL_Strict3"]
        else:
            self.trained_pipelines[target] = pipeline.fit(X, y)
            predictions_list.append(self.trained_pipelines[target].predict(X))
            y = y_train["PCL_Strict3"]

        if self.check_on_test_set:
            # NOTE(review): y_test is computed but the metrics below score
            # against self.y_test, and for non-PCL targets the labels are read
            # from X_train rather than self.X_test.  This looks like a
            # copy/paste slip; left unchanged until the intent is confirmed.
            if target == "PCL_Strict3":
                y_test = self.y_test[target].apply(lambda x: int(x))
            else:
                y_test = X_train[target].apply(lambda x: int(x))

            if self.use_feature_engineering:
                X_test = FeatureEngineering(self.X_test[self.features], target).engineer_features().values
            else:
                X_test = self.X_test[self.features].values

            model = self.trained_pipelines[target]
            y_pred = model.predict(X_test)
            s_f = f1_score(self.y_test, y_pred)
            s_p = precision_score(self.y_test, y_pred)
            s_r = recall_score(self.y_test, y_pred)
            print(f"test f1 {target}", s_f)
            print(f"test recall {target}", s_r)
            print(f"test precision {target}", s_p)

    if not self.use_and_func:
        # Fixed positive-class weight; the original author left the class-ratio
        # alternative, (len(y) - sum(y)) / sum(y), disabled.
        c = 10
        pipe = Pipeline(steps=[
            ('feature_selection', RFE(XGBClassifier(n_estimators=10, scale_pos_weight=c))),
            ('clf', XGBClassifier(scale_pos_weight=c))])
        # One column per target, one row per sample.  Reshaping the
        # (targets, samples) array to (-1, targets) would mix values from
        # different samples into the same row.
        meta_features = np.column_stack([np.ravel(pred) for pred in predictions_list])
        self.combined_model = pipe.fit(meta_features, y)
class TargetEnsembler(object):
    """Work-in-progress ensemble of per-target classifiers plus a combined model.

    NOTE(review): as visible here, this variant never assigns the per-target
    ``self.pipe_*`` pipelines nor the ``X_*`` matrices that ``fit`` reads, so
    it presumably relies on code that is not part of this class body.
    """

    def __init__(self, features):
        """Store the list of feature columns handed to feature engineering."""
        self.features = features

    def fit(self, X_train, y_train):
        """Print training scores of the AND-ed predictions and fit ``self.pipe_combined``.

        NOTE(review): ``X_intrusion``, ``X_avoidance``, ... and
        ``X_regression`` are not defined anywhere in this method, and
        ``self.pipe_regression`` is not assigned anywhere in this class.
        Confirm where they are supposed to come from.
        """
        # Planned redesign (pseudo-code left by the author):
        # create list of targets
        # self.pipelines_list = []
        # self.preds = []
        # for i in targets :
        #   x. feature engineering (i)
        #   y = df[i]
        #   cv_scores (x, y, pipeline_per_target[i])
        #   model = pipeline_per_target[i].train(x, y)
        #   pipelines_list.append(model)
        #   preds.append(model.pred(x))
        # y = df[y]
        # combined_model = LogReg.train(preds, y)
        # print results....
        # def pred(X):
        #

        # A disabled target contributes the neutral value 1 to the AND below.
        if intrusion:
            y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        else:
            y_pred_intrusion = 1
        if avoidance:
            y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        else:
            y_pred_avoidance = 1
        if hypertension:
            y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        else:
            y_pred_hypertension = 1
        if depression:
            y_pred_depression = self.pipe_depression.predict(X_depression)
        else:
            y_pred_depression = 1
        if only_avoidance:
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance)
        else:
            y_pred_only_avoidance = 1
        if PCL_Strict3:
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1
        if regression_cutoff_33:
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1
        if regression_cutoff_50:
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1
        if tred_cutoff:
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        # Positive only when every enabled model predicts positive.
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance
                  & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

        # combined: the per-target predictions become extra columns of X_train
        # (X_train is modified in place) and feed a decision tree.
        y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        y_pred_regression = self.pipe_regression.predict(X_regression)
        X_train["y_pred_hypertension"] = y_pred_hypertension
        X_train["y_pred_avoidance"] = y_pred_avoidance
        X_train["y_pred_intrusion"] = y_pred_intrusion
        X_train["y_pred_regression"] = y_pred_regression
        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]
        X_combined = X_train[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values
        y_combined = y_train
        self.pipe_combined = Pipeline(steps=[
            ('classifier', DecisionTreeClassifier())])
        scores = cross_val_score(self.pipe_combined, X_combined, y_combined, scoring='precision', cv=StratifiedKFold(5))
        # NOTE(review): the label says "hypertension" but the score belongs to
        # the combined model; presumably a copy/paste leftover.
        print(f"hypertension {sum(scores)/5}")
        self.pipe_combined.fit(X_combined, y_combined)

    def predict(self, X_test):
        """Return the predictions of ``self.pipe_combined`` for *X_test*.

        The AND-ed per-target prediction is computed first but then
        overwritten by the combined model's output.

        NOTE(review): the ``y_pred_*`` columns selected from ``X_test`` are
        never written to it in this method, so they must already be present
        in the frame passed in. Verify against the caller.
        """
        # A disabled target contributes the neutral value 1 to the AND below.
        if intrusion:
            X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features], "intrusion_cutoff").engineer_features().values
            y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff)
        else:
            y_pred_intrusion = 1
        if avoidance:
            X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features], "avoidance_cutoff").engineer_features().values
            y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff)
        else:
            y_pred_avoidance = 1
        if hypertension:
            X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features], "hypertention_cutoff").engineer_features().values
            y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff)
        else:
            y_pred_hypertension = 1
        if depression:
            X_test_depression_cutoff = FeatureEngineering(X_test[self.features], "depression_cutoff").engineer_features().values
            y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff)
        else:
            y_pred_depression = 1
        if only_avoidance:
            X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features], "only_avoidance_cutoff").engineer_features().values
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff)
        else:
            y_pred_only_avoidance = 1
        if PCL_Strict3:
            X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1
        if regression_cutoff_33:
            X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features], "regression_cutoff_33").engineer_features().values
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1
        if regression_cutoff_50:
            X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features], "regression_cutoff_50").engineer_features().values
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1
        if tred_cutoff:
            X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance
                  & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff)

        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]
        X_combined = X_test[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values
        # Replaces the AND-ed prediction computed above.
        y_pred = self.pipe_combined.predict(X_combined)
        return y_pred
def test_set_pipeline_step_none():
    """Check that pipeline steps can be switched off by setting them to ``None``."""
    X = np.array([[1]])
    y = np.array([1])
    mult2, mult3, mult5 = Mult(mult=2), Mult(mult=3), Mult(mult=5)

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    def check_round_trip(pipe, expected):
        # Forward through the active steps, then back again.
        assert_array_equal([[expected]], pipe.fit_transform(X, y))
        assert_array_equal([expected], pipe.fit(X).predict(X))
        assert_array_equal(X, pipe.inverse_transform([[expected]]))

    # All three multipliers active.
    pipeline = make()
    check_round_trip(pipeline, 2 * 3 * 5)

    # Middle step disabled.
    pipeline.set_params(m3=None)
    check_round_trip(pipeline, 2 * 5)

    expected_params = {
        'steps': pipeline.steps,
        'm2': mult2,
        'm3': None,
        'last': mult5,
        'memory': None,
        'm2__mult': 2,
        'last__mult': 5,
    }
    assert pipeline.get_params(deep=True) == expected_params

    # First step disabled as well: only the final estimator remains.
    pipeline.set_params(m2=None)
    check_round_trip(pipeline, 5)

    # The remaining delegated methods must not hit an AttributeError on None.
    for method in ('predict_proba', 'predict_log_proba',
                   'decision_function', 'transform', 'score'):
        getattr(pipeline, method)(X)

    # Re-enable the first step.
    pipeline.set_params(m2=mult2)
    check_round_trip(pipeline, 2 * 5)

    # Final step disabled: mult2 and mult3 are active, no predictor is left.
    pipeline = make()
    pipeline.set_params(last=None)
    exp = 6
    pipeline.fit(X, y)
    pipeline.transform(X)
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    with raises(AttributeError, match="has no attribute 'predict'"):
        getattr(pipeline, 'predict')

    # A None step supplied at construction time behaves the same way.
    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
    check_round_trip(pipeline, 2 * 5)