def test_predict_with_predict_params():
    """Pipeline must forward predict-time keyword args to the final step."""
    estimator = DummyEstimatorParams()
    pipeline = Pipeline([("transf", Transf()), ("clf", estimator)])
    pipeline.fit(None, None)
    # the keyword should land on DummyEstimatorParams.predict
    pipeline.predict(X=None, got_attribute=True)
    assert pipeline.named_steps["clf"].got_attribute
def test_predict_with_predict_params():
    """Check that keyword arguments given to predict() reach the last step."""
    steps = [('transf', Transf()), ('clf', DummyEstimatorParams())]
    model = Pipeline(steps)
    model.fit(None, None)
    model.predict(X=None, got_attribute=True)
    final_step = model.named_steps['clf']
    # the dummy records the keyword it received
    assert final_step.got_attribute
def test_pipeline_methods_rus_pca_svm():
    """Exercise predict/proba/log-proba/score on a RUS + PCA + SVC pipeline."""
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    steps = [
        ("rus", RandomUnderSampler(random_state=0)),
        ("pca", PCA()),
        ("svc", SVC(gamma="scale", probability=True, random_state=0)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X, y)
    # each prediction method must run without error after fitting
    for method in ("predict", "predict_proba", "predict_log_proba"):
        getattr(pipeline, method)(X)
    pipeline.score(X, y)
def three_models_combined(self, intrusion_features, avoidance_features, hypertension_features):
    """Fit one classifier per symptom cluster (intrusion, avoidance,
    hypertension) and evaluate their conjunction on a held-out test split.

    :param intrusion_features: column names fed to the intrusion model.
    :param avoidance_features: column names fed to the avoidance model.
    :param hypertension_features: column names fed to the hypertension model.
    """
    # Drop rows where any of the three cutoff labels is missing.
    self.df = self.df[~self.df['intrusion_cutoff'].isna()]
    self.df = self.df[~self.df['avoidance_cutoff'].isna()]
    self.df = self.df[~self.df['hypertention_cutoff'].isna()]
    print("self.df.shape", self.df.shape)
    X = self.df
    Y = self.df[self.target]# strict
    # y_train/y_test carry the strict target plus all three cutoff labels.
    all_Y = [self.target, "intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"]
    X_train, X_test, y_train, y_test = train_test_split(X, self.df[all_Y], test_size=0.25, random_state = 8526566, stratify=Y)
    # intrusion
    X_intrusion = X_train[intrusion_features].values
    y_intrusion = y_train["intrusion_cutoff"].apply(lambda x: int(x))
    # NOTE(review): step is named 'rfe' but holds a BorderlineSMOTE resampler —
    # presumably a leftover name from an earlier version; confirm intent.
    pipe_intrusion = Pipeline(steps=[
        ('rfe', BorderlineSMOTE()),
        ('classifier', XGBClassifier(n_estimators=100, reg_alpha=1))])
    scores = cross_val_score(pipe_intrusion, X_intrusion, y_intrusion, scoring='precision', cv=StratifiedKFold(5))
    print(f"intrusion {sum(scores)/5}")
    pipe_intrusion.fit(X_intrusion, y_intrusion)
    # avoidance
    X_avoidance = X_train[avoidance_features].values
    y_avoidance = y_train["avoidance_cutoff"].apply(lambda x: int(x))
    pipe_avoidance = Pipeline(steps=[
        ('classifier', XGBClassifier(n_estimators=100, scale_pos_weight=3, reg_alpha=1))])
    scores = cross_val_score(pipe_avoidance, X_avoidance, y_avoidance, scoring='precision', cv=StratifiedKFold(5))
    print(f"avoidance {sum(scores)/5}")
    pipe_avoidance.fit(X_avoidance, y_avoidance)
    # hypertension
    X_hypertension = X_train[hypertension_features].values
    y_hypertention = y_train["hypertention_cutoff"].apply(lambda x: int(x))
    pipe_hypertension = Pipeline(steps=[
        ('classifier', BalancedBaggingClassifier(n_estimators=100))])
    scores = cross_val_score(pipe_hypertension, X_hypertension, y_hypertention, scoring='precision', cv=StratifiedKFold(5))
    print(f"hypertension {sum(scores)/5}")
    pipe_hypertension.fit(X_hypertension, y_hypertention)
    ## combine three classifiers
    X_test_hypertension = X_test[hypertension_features].values
    X_test_avoidance = X_test[avoidance_features].values
    X_test_intrusion = X_test[intrusion_features].values
    y_pred_hypertension = pipe_hypertension.predict(X_test_hypertension)
    y_pred_avoidance = pipe_avoidance.predict(X_test_avoidance)
    y_pred_intrusion = pipe_intrusion.predict(X_test_intrusion)
    # Conjunction: positive only when all three binary models agree.
    y_pred = (y_pred_hypertension * y_pred_avoidance * y_pred_intrusion)
    y_target = y_test["PCL_Strict3"].apply(lambda x: int(x))
    acc = accuracy_score(y_target, y_pred)
    f1 = f1_score(y_target, y_pred)
    recall = recall_score(y_target, y_pred)
    precision = precision_score(y_target, y_pred)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_methods_pca_svm():
    """Run the full prediction API of a whitened-PCA + SVC pipeline on iris."""
    iris = load_iris()
    X, y = iris.data, iris.target
    svc = SVC(gamma='scale', probability=True, random_state=0)
    reducer = PCA(svd_solver='full', n_components='mle', whiten=True)
    pipeline = Pipeline([('pca', reducer), ('svc', svc)])
    pipeline.fit(X, y)
    pipeline.predict(X)
    pipeline.predict_proba(X)
    pipeline.predict_log_proba(X)
    pipeline.score(X, y)
def test_pipeline_methods_anova():
    """Smoke-test predict/proba/log-proba/score for SelectKBest + logistic."""
    iris = load_iris()
    X, y = iris.data, iris.target
    feature_filter = SelectKBest(f_classif, k=2)
    logistic = LogisticRegression(solver="lbfgs", multi_class="auto")
    pipe = Pipeline([("anova", feature_filter), ("logistic", logistic)])
    pipe.fit(X, y)
    for attr in ("predict", "predict_proba", "predict_log_proba"):
        getattr(pipe, attr)(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Check the standard estimator methods on an anova + logistic pipeline."""
    data = load_iris()
    features, labels = data.data, data.target
    steps = [('anova', SelectKBest(f_classif, k=2)),
             ('logistic', LogisticRegression())]
    pipe = Pipeline(steps)
    pipe.fit(features, labels)
    pipe.predict(features)
    pipe.predict_proba(features)
    pipe.predict_log_proba(features)
    pipe.score(features, labels)
def test_pipeline_methods_pca_svm():
    """Exercise fit/predict/proba/log-proba/score on PCA + SVC."""
    data = load_iris()
    X, y = data.data, data.target
    pipe = Pipeline([('pca', PCA()),
                     ('svc', SVC(probability=True, random_state=0))])
    pipe.fit(X, y)
    for name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, name)(X)
    pipe.score(X, y)
def test_pipeline_methods_pca_svm():
    """Run every prediction method of a whitened-PCA + SVC pipeline."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    classifier = SVC(gamma="scale", probability=True, random_state=0)
    projection = PCA(svd_solver="full", n_components="mle", whiten=True)
    model = Pipeline([("pca", projection), ("svc", classifier)])
    model.fit(X, y)
    model.predict(X)
    model.predict_proba(X)
    model.predict_log_proba(X)
    model.score(X, y)
def illigal_genralization_checking(self, X_test, y_test):
    """Cross-validate precision on the training frame, then score an
    external test set with an XGBoost classifier."""
    train_x = self.df[self.features]
    train_y = self.df[self.target]
    X_test = X_test[self.features]
    y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))
    classifier = XGBClassifier(n_estimators=1000, scale_pos_weight=3,
                               reg_alpha=1)
    pipe = Pipeline(steps=[('classifier', classifier)])
    scores = cross_val_score(pipe, train_x, train_y, scoring='precision',
                             cv=StratifiedKFold(5))
    print(self.features)
    print("cross vl scores")
    print(sum(scores) / 5)
    pipe.fit(train_x, train_y.values)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_methods_preprocessing_svm():
    """Verify output shapes of every prediction method for scaler/PCA + SVC."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )
    preprocessors = [
        StandardScaler(),
        PCA(n_components=2, svd_solver="randomized", whiten=True),
    ]
    for prep in preprocessors:
        pipe = Pipeline([("preprocess", prep), ("svc", clf)])
        pipe.fit(X, y)
        # every prediction method must produce the expected shape
        assert pipe.predict(X).shape == (n_samples, )
        assert pipe.predict_proba(X).shape == (n_samples, n_classes)
        assert pipe.predict_log_proba(X).shape == (n_samples, n_classes)
        assert pipe.decision_function(X).shape == (n_samples, n_classes)
        pipe.score(X, y)
def Predict(data, mode):
    """Fit a TF-IDF + SVD/similarity + SVC model and predict relevance.

    :param data: (train, test) pair of DataFrames holding the preprocessed
        query / product-title text columns.
    :param mode: 'eda' for the plain pipeline, or 'sampling' to insert an
        SVMSMOTE resampling step before the classifier.
    :return: (submission, submission_probas) DataFrames — hard labels and
        class probabilities indexed by test id.
    :raises ValueError: if ``mode`` is unrecognised (the original fell
        through and crashed later with NameError on ``clf``).
    """
    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values
    train_query = list(
        train.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    train_title = list(
        train.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))
    test_query = list(
        test.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    test_title = list(
        test.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))
    # BUGFIX: a hand-rolled stop-word list was built here and immediately
    # overwritten by the NLTK-based union; the dead assignment was removed.
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))
    tfv = text.TfidfVectorizer(min_df=7, max_features=None,
                               strip_accents='unicode', analyzer='word',
                               token_pattern=r'\w{1,}', ngram_range=(1, 3),
                               use_idf=True, smooth_idf=True,
                               sublinear_tf=True, stop_words=stop_words)
    # Fit the vocabulary on training text only, then transform both splits;
    # query and title features are stacked side by side.
    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])
    sim = similarlity_stack()
    svd = TruncatedSVD(n_components=200)
    scl = StandardScaler(with_mean=False)
    svm = SVC(C=10, gamma="auto", kernel="rbf", class_weight=None,
              probability=True)
    features = FeatureUnion([('svd', svd), ('sim', sim)])
    if mode == 'eda':
        clf = Pipeline([('FeatureUnion', features),
                        ('scl', scl),
                        ('svm', svm)])
    elif mode == 'sampling':
        sampling = SVMSMOTE(svm_estimator=svm, k_neighbors=4)
        clf = Pipeline([('FeatureUnion', features),
                        ('scl', scl),
                        ('sampling', sampling),
                        ('svm', svm)])
    else:
        raise ValueError("mode must be 'eda' or 'sampling', got %r" % (mode,))
    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)
    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)
    return submission, submission_probas
def test_pipeline_methods_preprocessing_svm():
    """Check prediction-output shapes for scaler/PCA preprocessing + SVC."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')
    for prep in (StandardScaler(), PCA(n_components=2)):
        pipe = Pipeline([('preprocess', prep), ('svc', clf)])
        pipe.fit(X, y)
        # check shapes of the various prediction functions
        assert_equal(pipe.predict(X).shape, (n_samples, ))
        assert_equal(pipe.predict_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.predict_log_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.decision_function(X).shape, (n_samples, n_classes))
        pipe.score(X, y)
def test_pipeline_methods_anova_rus():
    """Smoke-test a RandomUnderSampler + SelectKBest + logistic pipeline."""
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    steps = [('rus', RandomUnderSampler(random_state=0)),
             ('anova', SelectKBest(f_classif, k=2)),
             ('logistic', LogisticRegression())]
    model = Pipeline(steps)
    model.fit(X, y)
    model.predict(X)
    model.predict_proba(X)
    model.predict_log_proba(X)
    model.score(X, y)
def test_pipeline_methods_preprocessing_svm():
    """Shapes of predict/proba/log-proba/decision_function must all match."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples, n_classes = X.shape[0], len(np.unique(y))
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')
    preprocessors = [StandardScaler(), PCA(n_components=2)]
    for step in preprocessors:
        pipe = Pipeline([('preprocess', step), ('svc', clf)])
        pipe.fit(X, y)
        assert_equal(pipe.predict(X).shape, (n_samples,))
        assert_equal(pipe.predict_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.predict_log_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.decision_function(X).shape, (n_samples, n_classes))
        pipe.score(X, y)
def test_pipeline_methods_rus_pca_svm():
    """Smoke-test a RandomUnderSampler + PCA + SVC pipeline."""
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    model = Pipeline([('rus', RandomUnderSampler(random_state=0)),
                      ('pca', PCA()),
                      ('svc', SVC(probability=True, random_state=0))])
    model.fit(X, y)
    model.predict(X)
    model.predict_proba(X)
    model.predict_log_proba(X)
    model.score(X, y)
def train_test_and_evaluate(seed, X_train, X_test, y_train, y_test, onehots, numericals, cv):
    """Build an over-/under-sampled preprocessing + logistic-regression
    pipeline, estimate gini via CV on the training data, then report gini
    on the test set and on a validation set.

    NOTE(review): `seed` is never used — the resamplers read the
    module-level `random_seed` instead; `X_val`/`y_val` also come from the
    enclosing scope. Confirm which seed/data are intended.
    """
    # define transformer for features that are one hot encoded and pre-select features using chi square
    selector = SelectKBest(chi2, k=15)
    ohc = OneHotEncoder(handle_unknown='ignore')
    onehot_transformer = Pipeline(steps=[('selector', selector), ('ohc', ohc)])
    # define transformer for numerical features, scale features and shrink space using pca
    scaler = StandardScaler()
    pca = PCA()
    numeric_transformer = Pipeline(steps=[('scaler', scaler), ('pca', pca)])
    # define resampling: oversample minority class first and undersample majority class afterwards
    over = RandomOverSampler(random_state=random_seed, sampling_strategy=0.1)
    under = RandomUnderSampler(random_state=random_seed, sampling_strategy=0.5)
    # combine steps into preprocessing and machine learning pipeline using logistic regression
    preprocessor = ColumnTransformer(
        transformers=[('numeric', numeric_transformer, numericals),
                      ('onehot', onehot_transformer, onehots)])
    pipe_model = Pipeline(
        steps=[('over', over), ('under', under),
               ('prep', preprocessor),
               ('classifier', LogisticRegression(max_iter=1000))])
    # Cross-validate model on training data to estimate performance
    # (gini = 2 * AUC - 1, computed from out-of-fold probabilities)
    cv_results = 2 * roc_auc_score(
        y_train,
        cross_val_predict(pipe_model, X_train, y_train, cv=cv,
                          method='predict_proba')[:, 1]) - 1
    print("Mean training gini after CV: {res}".format(res=cv_results.mean()))
    # Fit the model to training data and evaluate on test data and finally on evaluation data
    pipe_model.fit(X_train, y_train)
    y_true, y_pred = y_test, pipe_model.predict(X_test)
    gini = 2 * roc_auc_score(y_true, pipe_model.predict_proba(X_test)[:, 1]) - 1
    print("Gini score on test set: " + str(gini))
    y_true, y_pred = y_val, pipe_model.predict(X_val)
    gini = 2 * roc_auc_score(y_true, pipe_model.predict_proba(X_val)[:, 1]) - 1
    print("Gini score on validation set: " + str(gini))
def test_pipeline_fit_params():
    """Step-prefixed fit parameters must be routed to the matching step."""
    pipeline = Pipeline([('transf', TransfT()), ('clf', FitParamT())])
    pipeline.fit(X=None, y=None, clf__should_succeed=True)
    # the classifier step saw should_succeed and echoes it via predict
    assert_true(pipeline.predict(None))
    # the transformer's attributes must remain untouched by fit params
    transformer = pipeline.named_steps['transf']
    assert_true(transformer.a is None)
    assert_true(transformer.b is None)
class ImblearnRecalibrator(BaseEstimator, ClassifierMixin):
    """Recalibrate class probabilities biased by imblearn resampling.

    Wraps an estimator and a resampler into one pipeline so that
    fit / predict / predict_proba can be used directly, with
    ``predict_proba`` corrected for the artificial class balance introduced
    by the resampler. (Originally documented in Japanese; translated.)

    Note: resampling of imbalanced data targets classification quality;
    no claim is made about its effect on discrimination metrics.

    :param estimator: scikit-learn-API-compliant estimator object.
    :param resampler: any imblearn resampler object.
    :param post_minor_rate: desired minority share of the WHOLE resampled
        set. Default None. Use either this or ``alpha``.
    :param alpha: minority ratio RELATIVE to the pre-resampling data.
        Default 'auto'. Use either this or ``post_minor_rate``.
    """

    def __init__(self, estimator, resampler, alpha='auto', post_minor_rate=None):
        resampler = clone(resampler)
        if post_minor_rate is None and alpha is None:
            warnings.warn(
                'neither of `post_minor_rate` nor `alpha` are specified. Instead resampling stragegy specified in `resampler` object is used.'
            )
            # BUGFIX: the original never set `resampling_strategy` on this
            # path, so fit() crashed with AttributeError.
            self.resampling_strategy = None
        elif post_minor_rate and alpha:
            warnings.warn(
                'both of `post_minor_rate` and `alpha` are specified. the former is applied.'
            )
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif post_minor_rate:
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif alpha:
            self.alpha = alpha
            self.resampling_strategy = 'alpha'
            resampler.set_params(sampling_strategy=alpha)
        else:
            # BUGFIX: `raise ('initialized error')` raised a TypeError
            # ("exceptions must derive from BaseException"), masking the
            # intended message; raise a proper exception instead.
            raise ValueError('initialized error')
        self.estimator_ = Pipeline([('resampler', resampler),
                                    ('estimator', clone(estimator))])

    def fit(self, X, y):
        """Resample then fit; records the empirical minority rate of ``y``."""
        if self.resampling_strategy == 'posterior_rate':
            # translate the desired posterior rate into a sampling_strategy
            alpha = get_oversampling_rate(self.post_minor_rate)
            self.alpha = alpha
            self.estimator_['resampler'].set_params(sampling_strategy=alpha)
        self.estimator_.fit(X, y)
        # minority-class prevalence in the ORIGINAL data (assumes binary 0/1 y)
        self.minor_rate_ = np.min([y.mean(), 1 - y.mean()])
        return self

    def predict(self, X):
        """Predict labels with the fitted resampler + estimator pipeline."""
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        """Return probabilities recalibrated to undo the resampling bias.

        NOTE(review): requires ``self.alpha``, which is never set when
        neither `post_minor_rate` nor `alpha` was given — confirm the
        intended behavior on that path.
        """
        return calibrate_imbalanceness(self.estimator_.predict_proba(X),
                                       pos_rate=get_oversampling_power(
                                           self.alpha, self.minor_rate_))
def predict(test_set_name: list, pipeline: Pipeline) -> pd.DataFrame:
    """Predict on the named test set and persist the submission CSV.

    :param test_set_name: identifier(s) of the test set to load.
    :param pipeline: already-fitted pipeline used for inference.
    :return: the submission DataFrame that was written to disk.
    """
    features = model_pipeline_io.get_test_set(test_set_name)
    predictions = pipeline.predict(features)
    submission = pd.DataFrame(data=predictions)
    model_pipeline_io.save_submit_file(submission, "submission.csv")
    return submission
def test_pipeline_fit_params():
    """fit params must reach the right step; unknown ones must raise."""
    pipeline = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipeline.fit(X=None, y=None, clf__should_succeed=True)
    # the dummy classifier reports the received flag through predict()
    assert pipeline.predict(None)
    # transformer state must be untouched by routed fit params
    transf_step = pipeline.named_steps['transf']
    assert transf_step.a is None
    assert transf_step.b is None
    # unknown step parameters must surface as TypeError
    with raises(TypeError, match="unexpected keyword argument"):
        pipeline.fit(None, None, clf__bad=True)
def test_pipeline_fit_params():
    """Routed fit params succeed; parameters no step accepts raise."""
    pipeline = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipeline.fit(X=None, y=None, clf__should_succeed=True)
    # classifier echoes the flag through predict()
    assert pipeline.predict(None)
    # transformer state must be untouched by the fit params
    for attr in ("a", "b"):
        assert getattr(pipeline.named_steps["transf"], attr) is None
    # a parameter no step accepts must raise
    with raises(TypeError, match="unexpected keyword argument"):
        pipeline.fit(None, None, clf__bad=True)
def test_pipeline_memory_transformer():
    """A memory-cached pipeline must memoize the transformer fit and match
    an uncached pipeline exactly, including across step renames."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        # BUGFIX: ``Memory(cachedir=...)`` was deprecated and later removed
        # in joblib — pass the cache directory as ``location`` instead
        # (consistent with the other variant of this test in this file).
        memory = Memory(location=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)
        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # the cloned template transformer must remain unfitted
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # unchanged timestamp proves the cached fit result was reused
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)
        # Check that cached_pipe_2 and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_memory_transformer():
    """A memory-cached pipeline must memoize the transformer fit and match
    an uncached pipeline exactly, including across step renames."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)], memory=memory)
        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_)
        # the cloned template transformer must remain unfitted
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_)
        # unchanged timestamp proves the cached fit result was reused
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory)
        cached_pipe_2.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_fit_params():
    """Step-prefixed fit kwargs go to that step; unknown kwargs must raise."""
    pipeline = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipeline.fit(X=None, y=None, clf__should_succeed=True)
    # the classifier echoes the routed flag back through predict
    assert pipeline.predict(None)
    # transformer attributes are untouched by the fit params
    step = pipeline.named_steps['transf']
    assert step.a is None
    assert step.b is None
    # an invalid step parameter must raise with a precise message
    assert_raise_message(TypeError,
                         "fit() got an unexpected keyword argument 'bad'",
                         pipeline.fit, None, None, clf__bad=True)
def start_identification_twitter(user_list, text):
    """Identify the most likely author of ``text`` among ``user_list``
    Twitter accounts by training a LinearSVC on per-user stylometric
    profiles.

    :param user_list: candidate Twitter handles (empties are filtered out).
    :param text: sample text; must be at least 240 characters.
    :return: best-candidate handle, or -1 when ``text`` is too short.
    """
    if len(text) < 240:
        print('Text Length must be 240 characters. Currently:', len(text))
        return -1
    tw = twitScrape()
    user_profiles = []
    with open('../config/labels_twitter.json', 'r') as f:
        labels = json.load(f)
    # handles whose tweet CSVs already exist on disk (strip the '.csv')
    downloaded_users = [ user[:-4] for user in os.listdir(TWITTER_PATH) if user.endswith('.csv') ]
    user_list = [user for user in user_list if len(user) > 0]
    print('checking user_profiles for downloaded users')
    for user in user_list:
        if user in downloaded_users:
            # reuse the cached profile instead of re-scraping
            user_profiles.append(pd.read_csv(f'{TWITTER_PATH}{user}.csv'))
            continue
        r = tw.getIndivTweets(user)
        # negative return value signals a failed scrape; skip this user
        if r <= -1:
            continue
        # NOTE(review): the profile path uses '.txt' while the cache above is
        # read as '.csv', and the new CSV is written to a different folder
        # than TWITTER_PATH — confirm these paths are intentional.
        user_path = TWITTER_PATH + user + '.txt'
        ip = IdentProfile(user_path, labels['new_label'], email_size=240)
        labels['labels'][labels['new_label']] = user
        labels['new_label'] += 1
        df = ip.create_profile()
        df.to_csv(f'../corpora/twitter_corpus_csv/{user}.csv', index=False)
        user_profiles.append(df.copy())
        print('created new user profile for ', user)
    # persist the updated label book
    with open('../config/labels_twitter.json', 'w') as f:
        json.dump(labels, f)
    df = pd.concat(user_profiles, ignore_index=True)
    x, y = split_x_y(df, numpy=True)
    pipe = Pipeline([('MinMaxScaler', MinMaxScaler()), ('SVC', LinearSVC(C=0.5, penalty='l2', dual=True))])  # type: ignore
    print('Beginning Training')
    pipe.fit(x, y)
    # build a profile for the query text and classify it
    ip = IdentText('', text, email_size=240)
    text_df = ip.create_profile()
    x_test = text_df.to_numpy()
    prediction = pipe.predict(x_test)
    prediction = prediction[0]
    print('prediction', prediction)
    choice = labels['labels'][str(prediction)]
    print('Best candidate', choice)
    return choice
def main():
    """Train a SMOTE + logistic-regression pipeline, tune it with grid
    search, report classification metrics, and serialise the best model.
    """
    logger = logging.getLogger(__name__)
    # constant log messages need no f-string prefix (fixes F541)
    logger.info('Reading data')
    processed_df = pd.read_csv('../../data/processed/processed.csv')
    X = processed_df.drop('Class', axis=1).values
    y = processed_df['Class'].values
    X_train, X_test, y_train, y_test = tts(X, y, random_state=random_seed)
    logger.info('Constructing model pipeline')
    model = Pipeline(
        [
            ('sampling', SMOTE()),
            ('classification', baseline_classifiers['LogisiticRegression'])
        ]
    )
    logger.info('Constructing baseline model')
    model.fit(X_train, y_train)
    baseline_y_hat = model.predict(X_test)
    baseline_report = classification_report(y_test, baseline_y_hat)
    print(f'Classification report for Baseline model \n{baseline_report}')
    logger.info('Performing Gridsearch')
    gridsearch_cv = GridSearchCV(
        estimator=model,
        param_grid=LogisiticRegression_grid,
        cv=5,
        scoring=model_metrics,
        n_jobs=1,
        refit='F1',
        return_train_score=True
    )
    gridsearch_cv.fit(X_train, y_train)
    print(f'Best score (log-loss): {gridsearch_cv.best_score_}\nBest Parameters: {gridsearch_cv.best_params_}')
    gridsearch_y_hat = gridsearch_cv.predict(X_test)
    gridsearch_report = classification_report(y_test, gridsearch_y_hat)
    print(f'Classification report for tuned model \n{gridsearch_report}')
    joblib.dump(gridsearch_cv, f'../../models/{best_model_file_name}', compress=9)
    # lazy %-style args avoid formatting when the level is disabled
    logger.info('Serialised model as %s', best_model_file_name)
def train_and_evaluate_model(model, params, X_train, y_train, X_test, y_test):
    """Apply ``params`` to ``model``, wrap it with SMOTE resampling, fit on
    the training split, and compute test-set metrics.

    :param model: scikit-learn-compatible estimator (mutated via set_params).
    :param params: hyperparameter mapping applied to ``model``.
    :return: (accuracy, precision, recall, f1, roc_auc) on the test split.
    """
    # set all hyperparameters in one idiomatic call instead of one
    # set_params call per key
    model.set_params(**params)
    model_name = type(model).__name__  # used by the commented-out reporting
    # print(" ")
    # print("Training model {0}: ".format(model_name))
    model = Pipeline([('sampling', SMOTE(sampling_strategy='minority')),
                      ('model', model)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score_acc = accuracy_score(y_test, y_pred)
    score_precision = precision_score(y_test, y_pred)
    score_recall = recall_score(y_test, y_pred)
    score_f1 = f1_score(y_test, y_pred)
    score_roc = roc_auc_score(y_test, y_pred)
    # plot_confusion_matrix(y_test, y_pred, score_f1, model_name)
    return score_acc, score_precision, score_recall, score_f1, score_roc
def illigal_genralization_checking(self, X_test, y_test):
    """Report CV precision on the training frame, then test-set metrics
    from a freshly fitted XGBoost pipeline."""
    X = self.df[self.features]
    Y = self.df[self.target]
    X_test = X_test[self.features]
    pipe = Pipeline(steps=[
        ('classifier', XGBClassifier(n_estimators=1000,
                                     scale_pos_weight=3,
                                     reg_alpha=1)),
    ])
    y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))
    cv_scores = cross_val_score(pipe, X, Y, scoring='precision',
                                cv=StratifiedKFold(5))
    print(self.features)
    print("cross vl scores")
    print(sum(cv_scores) / 5)
    pipe.fit(X, Y.values)
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def output_test_smote(train, test):
    """Fit a SMOTE-balanced logistic model over text + sentiment features
    and write test predictions to ``output_smote.csv``."""
    feature_cols = np.array([
        'reviewText', 'summary', 'reviewSentiment', 'summarySentiment',
        'reviewSub', 'summarySub'
    ])

    def tfidf_branch(key):
        # text branch: select one column, count terms, then tf-idf weight
        return Pipe(steps=[('selector', Selector(key=key)),
                           ('cf', CountVectorizer(max_df=.95)),
                           ('tf', TfidfTransformer(sublinear_tf=True))])

    revTf = tfidf_branch('reviewText')
    sumTf = tfidf_branch('summary')
    # scale each sentiment/subjectivity column, tf-idf the two text columns
    combine = ColumnTransformer(transformers=[
        ('revSen', StandardScaler(), ['reviewSentiment']),
        ('sumSen', StandardScaler(), ['summarySentiment']),
        ('revSub', StandardScaler(), ['reviewSub']),
        ('sumSub', StandardScaler(), ['summarySub']),
        ('rev', revTf, ['reviewText']),
        ('sum', sumTf, ['summary']),
    ])
    model = Pipe(steps=[
        ('combine', combine),
        ('smote', SMOTE()),
        ('classifier', LogisticRegression(max_iter=500, warm_start=True,
                                          penalty='l2', C=.8)),
    ])
    print("Fitting")
    model.fit(train[feature_cols], train['label'])
    print("Model fit")
    preds = model.predict(test[feature_cols])
    # pair each asin with its predicted label and dump to CSV
    out = pd.DataFrame(np.dstack((test["asin"], preds))[0],
                       columns=["asin", "label"])
    out.to_csv("output_smote.csv")
def model_select():
    """For each balancer not yet evaluated, fit a PCA + balancer + classifier
    pipeline on the full development data and report confusion matrix,
    classification report, and ROC-AUC on the test split.

    NOTE(review): reads `balanceadores`, `nome`, `modelo`,
    `dados_completo_x/y`, `test_x/y` from enclosing scope — confirm they
    are defined before this runs.
    """
    for nome_balanceador, balanceador in balanceadores:
        # skip combinations that were already evaluated on disk
        if classificador_ja_executado(nome, nome_balanceador):
            continue
        else:
            print(balanceador)
            pipeline = Pipeline([('dimension', PCA(n_components=250)),
                                 ('balance', balanceador),
                                 ('clf', modelo)])
            print("# Rodando o algoritmo %s" % nome)
            print()
            np.set_printoptions(precision=4)
            pipeline.fit(dados_completo_x, dados_completo_y)
            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_pred = pipeline.predict(test_x)
            matriz_confusao = confusion_matrix(test_y, y_pred)
            nome_arquivo = nome + '_' + nome_balanceador + '_best_mucilage'
            # plot both the raw and the normalized confusion matrices
            plot_confusion_matrix(matriz_confusao, nome_arquivo, [1, 2, 3, 4], False,
                                  title='Confusion matrix' + nome + ' (best parameters)')
            plot_confusion_matrix(matriz_confusao, nome_arquivo, [1, 2, 3, 4], True,
                                  title='Confusion matrix ' + nome + ', normalized')
            print('Matriz de Confusão')
            print(matriz_confusao)
            print(classification_report(y_true=test_y, y_pred=y_pred, digits=4))
            # reuse y_pred for probabilities to feed the ROC-AUC helper
            y_pred = pipeline.predict_proba(test_x)
            roc_auc_aux(test_y, y_pred, nome, nome_balanceador)
            print()
            sys.stdout.flush()
def train_model(X_train, X_test, y_train, y_test,variablepath):
    """Cross-validate, fit, evaluate, and persist a random-forest pipeline.

    :param variablepath: directory to chdir into before saving the model.
    :return: the fitted pipeline (also pickled to ``RFC.sav``).
    """
    # define pipeline — only non-default hyperparameters are kept; the
    # original long literal list included ``min_impurity_split=None``,
    # which was removed in scikit-learn 1.0 and made this call crash.
    steps = [('model', RandomForestClassifier(criterion='entropy',
                                              max_depth=90,
                                              max_features=2,
                                              min_samples_split=5,
                                              n_estimators=16))]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=4, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='f1_micro', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('Cross Validated f1 score: %.3f' % score)
    pipeline.fit(X_train,y_train)
    evaluate(pipeline,X_test, y_test)
    os.chdir(variablepath)
    # save the model to disk; use a context manager so the file handle is
    # closed deterministically (the original leaked it)
    filename = 'RFC.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(pipeline, model_file)
    return pipeline
def resampling(X, Y, r):
    """Resample (X, Y) by removing Tomek links; ``r`` is accepted but unused."""
    sampler = TomekLinks()
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    return X_resampled, y_resampled


# Undersample, standardize, normalize, project with LDA, then classify
# with a class-balanced linear SVM.
pipeline = Pipeline([
    ('und', RandomUnderSampler()),
    ('standardize', preprocessing.StandardScaler()),
    ('normalizer', preprocessing.Normalizer()),
    ('lda', LinearDiscriminantAnalysis()),
    ('svm', LinearSVC(verbose=0, max_iter=3000, class_weight='balanced')),
])

# Sweep the SVM regularization strength and report dev-set results per value.
com_values = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10]
for c in com_values:
    pipeline.set_params(svm__C=c, und__random_state=42).fit(X_train, Y_train)
    y_p = pipeline.decision_function(X_dev)
    y_pred = pipeline.predict(X_dev)
    print("With:", c)
    print("Confusion matrix:\n", sk.metrics.confusion_matrix(Y_dev, y_pred))
    # Unweighted average recall over the two classes.
    one = sk.metrics.recall_score(Y_dev, y_pred, pos_label=0)
    two = sk.metrics.recall_score(Y_dev, y_pred, pos_label=1)
    print("UAR:", (one + two) / 2, "\n")
# NOTE(review): this snippet is truncated — the leading ``axis=1, inplace=True)``
# closes a DataFrame ``.drop(...)`` call whose opening lies outside this chunk.
# Flow: min-max scale the numeric columns, one-hot encode the categoricals,
# concatenate into one matrix, split train/test, SMOTE-oversample the training
# fold inside an imblearn Pipeline with KNN, then score micro-averaged F1.
axis=1, inplace=True) cols_to_encode = ['Departure', 'Arrival', 'month', 'day', 'year', 'season'] cols_to_scale = ['WeeksToDeparture', 'std_wtd', 'distance'] from sklearn.preprocessing import OneHotEncoder, MinMaxScaler scaler = MinMaxScaler() ohe = OneHotEncoder(categories='auto', sparse=False) scaled_cols = scaler.fit_transform(df_train[cols_to_scale]) encoded_cols = ohe.fit_transform(df_train[cols_to_encode]) processed_df = np.concatenate([scaled_cols, encoded_cols], axis=1) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(processed_df, y_train, test_size=0.25) y_train = np.ravel(y_train) from imblearn.over_sampling import SMOTE from sklearn.neighbors import KNeighborsClassifier pipeline = Pipeline([('ovs', SMOTE()), ('clf', KNeighborsClassifier())]) pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) from sklearn.metrics import f1_score score = f1_score(y_test, y_pred, average='micro')
# TargetEnsembler: trains one pipeline per PTSD-related target (intrusion,
# avoidance, hypertension, depression, only_avoidance, PCL_Strict3,
# regression_cutoff_33/50, tred_cutoff), each gated by a module-level boolean
# flag of the same name, then ANDs the per-target 0/1 predictions into a single
# strict label both at fit time (for a training-score printout) and in predict().
# Each branch: FeatureEngineering(...) builds the feature matrix, the target
# column is cast to int, a feature-selection(+sampling)+classifier pipeline is
# 5-fold CV-scored on f1 and then fit on the full training data.
# NOTE(review): disabled targets contribute the constant 1 to the AND, i.e. they
# are treated as "positive" — presumably intentional; confirm with the authors.
# NOTE(review): fit() scores against ``y_train`` directly — assumes y_train is
# the combined strict target; verify against the caller.
class TargetEnsembler(object): def __init__(self, features): self.features = features def fit(self, X_train, y_train): # intrusion if intrusion: X_intrusion = FeatureEngineering(X_train[self.features], "intrusion_cutoff").engineer_features().values y_intrusion = X_train["intrusion_cutoff"].apply(lambda x: int(x)) self.pipe_intrusion = Pipeline(steps=[ ('feature_selection', SelectFpr(alpha=0.05)), ('sampling', BorderlineSMOTE(k_neighbors=10)), ('classifier', XGBClassifier(n_estimators=300, max_depth=5))]) scores = cross_val_score(self.pipe_intrusion, X_intrusion, y_intrusion, scoring='f1', cv=StratifiedKFold(5)) print(f"intrusion {sum(scores)/5}") self.pipe_intrusion.fit(X_intrusion, y_intrusion) # avoidance if avoidance: X_avoidance = FeatureEngineering(X_train[self.features], "avoidance_cutoff").engineer_features().values y_avoidance = X_train["avoidance_cutoff"].apply(lambda x: int(x)) self.pipe_avoidance = Pipeline(steps=[ ('feature_selection', RFE(estimator=XGBClassifier(scale_pos_weight=5.88, n_estimators=100), n_features_to_select=20)), ('classifier', BalancedRandomForestClassifier(n_estimators=300, max_depth=10))]) scores = cross_val_score(self.pipe_avoidance, X_avoidance, y_avoidance, scoring='f1', cv=StratifiedKFold(5)) print(f"avoidance {sum(scores)/5}") self.pipe_avoidance.fit(X_avoidance, y_avoidance) # hypertension if hypertension: X_hypertension = FeatureEngineering(X_train[self.features], "hypertention_cutoff").engineer_features().values y_hypertention = X_train["hypertention_cutoff"].apply(lambda x: int(x)) self.pipe_hypertension = Pipeline(steps=[ ('feature_selection', RFE(estimator=XGBClassifier(n_estimators=100, scale_pos_weight=3.51), n_features_to_select=20)), ( 'sampling', SMOTE(k_neighbors=10)), ('classifier', BalancedRandomForestClassifier(n_estimators=100))]) scores = cross_val_score(self.pipe_hypertension, X_hypertension, y_hypertention, scoring='f1', cv=StratifiedKFold(5)) print(f"hypertension {sum(scores)/5}") 
self.pipe_hypertension.fit(X_hypertension, y_hypertention) # depression if depression: X_depression = FeatureEngineering(X_train[self.features], "depression_cutoff").engineer_features().values y_depression = X_train["depression_cutoff"].apply(lambda x: int(x)) self.pipe_depression = Pipeline(steps=[ ('feature_selection', SelectFdr(alpha=0.1)), ('sampling', SMOTE(k_neighbors=5)), ('classifier', RandomForestClassifier(n_estimators=100))]) scores = cross_val_score(self.pipe_depression, X_depression, y_depression, scoring='f1', cv=StratifiedKFold(5)) print(f"depression {sum(scores)/5}") self.pipe_depression.fit(X_depression, y_depression) # only_avoidance if only_avoidance: X_only_avoidance = FeatureEngineering(X_train[self.features], "only_avoidance_cutoff").engineer_features().values y_only_avoidance = X_train["only_avoidance_cutoff"].apply(lambda x: int(x)) self.pipe_only_avoidance = Pipeline(steps=[ ('feature_selection', RFE(XGBClassifier(n_estimators=100,max_depth=3), n_features_to_select=10)), ('classifier', BalancedRandomForestClassifier( n_estimators=500, max_depth=10))]) scores = cross_val_score(self.pipe_only_avoidance, X_only_avoidance, y_only_avoidance, scoring='f1', cv=StratifiedKFold(5)) print(f"only_avoidance {sum(scores)/5}") self.pipe_only_avoidance.fit(X_only_avoidance, y_only_avoidance) # pcl_strict3 if PCL_Strict3: X_PCL_Strict3 = FeatureEngineering(X_train[self.features], "PCL_Strict3").engineer_features().values y_PCL_Strict3 = y_train["PCL_Strict3"].apply(lambda x: int(x)) self.pipe_PCL_Strict3 = Pipeline(steps=[ ('feature_selection', SelectKBest(k=20)), ('sampling', SMOTE(k_neighbors=5)), ('classifier', XGBClassifier(max_depth=3, n_estimators=100))]) scores = cross_val_score(self.pipe_PCL_Strict3, X_PCL_Strict3, y_PCL_Strict3, scoring='f1', cv=StratifiedKFold(5)) print(f"PCL_Strict3 {sum(scores)/5}") self.pipe_PCL_Strict3.fit(X_PCL_Strict3, y_PCL_Strict3) # cutoff_33 if regression_cutoff_33: X_regression_cutoff_33 = 
FeatureEngineering(X_train[self.features], "regression_cutoff_33").engineer_features().values y_regression_cutoff_33 = X_train["regression_cutoff_33"].apply(lambda x: int(x)) self.pipe_regression_cutoff_33 = Pipeline(steps=[ ('feature_selection', SelectFpr(alpha=0.033)), ('sampling', SMOTE(k_neighbors=10)), ('classifier', RandomForestClassifier(n_estimators=100, max_depth=5))]) scores = cross_val_score(self.pipe_regression_cutoff_33, X_regression_cutoff_33, y_regression_cutoff_33, scoring='f1', cv=StratifiedKFold(5)) print(f"regression_cutoff_33 {sum(scores)/5}") self.pipe_regression_cutoff_33.fit(X_regression_cutoff_33, y_regression_cutoff_33) # cutoff 50 if regression_cutoff_50: X_regression_cutoff_50 = FeatureEngineering(X_train[self.features], "regression_cutoff_50").engineer_features().values y_regression_cutoff_50 = X_train["regression_cutoff_50"].apply(lambda x: int(x)) self.pipe_regression_cutoff_50 = Pipeline(steps=[ ('feature_selection', SelectKBest(k=10)), ('sampling', SMOTE(k_neighbors=10)), ('classifier', XGBClassifier(max_depth=2, n_estimators=100))]) scores = cross_val_score(self.pipe_regression_cutoff_50, X_regression_cutoff_50, y_regression_cutoff_50, scoring='f1', cv=StratifiedKFold(5)) print(f"regression_cutoff_50 {sum(scores)/5}") self.pipe_regression_cutoff_50.fit(X_regression_cutoff_50, y_regression_cutoff_50) # tred_cutoff if tred_cutoff: X_tred_cutoff = FeatureEngineering(X_train[self.features], "tred_cutoff").engineer_features().values y_tred_cutoff = X_train["tred_cutoff"].apply(lambda x: int(x)) self.pipe_tred_cutoff = Pipeline(steps=[ ('feature_selection', SelectKBest(k=20)), ('sampling', SMOTE(k_neighbors=10)), ('classifier', XGBClassifier(n_estimators=100, max_depth=2))]) scores = cross_val_score(self.pipe_tred_cutoff, X_tred_cutoff, y_tred_cutoff, scoring='f1', cv=StratifiedKFold(5)) print(f"tred_cutoff {sum(scores)/5}") self.pipe_tred_cutoff.fit(X_tred_cutoff, y_tred_cutoff) # target if intrusion: y_pred_intrusion = 
self.pipe_intrusion.predict(X_intrusion) else: y_pred_intrusion = 1 if avoidance: y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance) else: y_pred_avoidance = 1 if hypertension: y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension) else: y_pred_hypertension = 1 if depression: y_pred_depression = self.pipe_depression.predict(X_depression) else: y_pred_depression = 1 if only_avoidance: y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance) else: y_pred_only_avoidance = 1 if PCL_Strict3: y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3) else: y_pred_PCL_Strict3 = 1 if regression_cutoff_33: y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33) else: y_pred_regression_cutoff_33 = 1 if regression_cutoff_50: y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50) else: y_pred_regression_cutoff_50 = 1 if tred_cutoff: y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff) else: y_pred_tred_cutoff = 1 y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff) y_target = y_train acc = accuracy_score(y_target, y_pred) f1 = f1_score(y_target, y_pred) recall = recall_score(y_target, y_pred) precision = precision_score(y_target, y_pred) print("training scores") print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}") def predict(self, X_test): if intrusion: X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features], "intrusion_cutoff").engineer_features().values y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff) else: y_pred_intrusion = 1 if avoidance: X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features], "avoidance_cutoff").engineer_features().values y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff) 
else: y_pred_avoidance = 1 if hypertension: X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features], "hypertention_cutoff").engineer_features().values y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff) else: y_pred_hypertension = 1 if depression: X_test_depression_cutoff = FeatureEngineering(X_test[self.features], "depression_cutoff").engineer_features().values y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff) else: y_pred_depression = 1 if only_avoidance: X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features], "only_avoidance_cutoff").engineer_features().values y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff) else: y_pred_only_avoidance = 1 if PCL_Strict3: X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3) else: y_pred_PCL_Strict3 = 1 if regression_cutoff_33: X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features], "regression_cutoff_33").engineer_features().values y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33) else: y_pred_regression_cutoff_33 =1 if regression_cutoff_50: X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features], "regression_cutoff_50").engineer_features().values y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50) else: y_pred_regression_cutoff_50 = 1 if tred_cutoff: X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff) else: y_pred_tred_cutoff = 1 y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff) 
return y_pred
import time

# Build a GaussianNB pipeline: variance-threshold feature pruning,
# standardization, random oversampling, 1-component PCA, then the classifier.
selector1 = VarianceThreshold(threshold=3)
scaler1 = StandardScaler()
ros1 = RandomOverSampler()
pca1 = PCA(n_components=1)
gnb1 = GaussianNB()
Gnb_mic = Pipeline(steps=[('selector', selector1),
                          ('scaler', scaler1),
                          ('sampler', ros1),
                          ('pca', pca1),
                          ('gnb', gnb1)])

# Time the fit and the prediction separately.
time1 = time.time()
Gnb_mic.fit(X_train, y_train)
time2 = time.time()
Gnb_mic_pred = Gnb_mic.predict(X_test)
time3 = time.time()

# Micro/macro F1 come from precision_recall_fscore_support's third slot.
_, _, Gnb_f1_mic, _ = precision_recall_fscore_support(y_test, Gnb_mic_pred, average='micro')
_, _, f1_mac, _ = precision_recall_fscore_support(y_test, Gnb_mic_pred, average='macro')
print("f1-micro: ", Gnb_f1_mic)
print("f1-macro: ", f1_mac)
print("Accuracy: ", accuracy_score(y_test, Gnb_mic_pred))

# And we get the following confusion matrix:
# In[ ]:
# Verifies that an imblearn Pipeline with a sampler step honours joblib
# ``memory=`` caching: a cached pipeline must yield predictions/scores
# identical to an uncached clone, must reuse the cached transformer on a
# second fit (timestamp_ unchanged, means_ equal), must not leak fitted
# state onto the original ``transf`` object, and must hit the same cache
# even when the step is renamed in a new pipeline. The assertion order
# matters (fit before reading timestamp_), so the code is left untouched.
def test_pipeline_memory_sampler(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummySampler() 
cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir)
def test_pipeline_memory_sampler():
    """Check joblib ``memory=`` caching of a sampler step in a Pipeline.

    A cached pipeline must match an uncached clone, reuse the cached
    transformer on refit (``timestamp_`` unchanged), leave the original
    ``transf`` object unfitted, and hit the same cache under a renamed step.
    """
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3,
        n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1,
        n_samples=5000, random_state=0)
    cachedir = mkdtemp()
    try:
        # Pass the cache location positionally: the ``cachedir=`` keyword is
        # deprecated/removed in modern joblib (renamed ``location``), and the
        # sibling test in this file already uses the positional form.
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # Fitted state must not leak back onto the original estimator object.
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # Same timestamp => the cached fit result was reused, not recomputed.
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
# PennyModel wraps a sklearn classifier behind an imblearn pipeline
# (impute -> one-hot -> RandomUnderSampler -> classifier) for penny-auction
# data, and post-hoc recalibrates predicted probabilities to undo the
# undersampling (see calibrate_probabilties). The exact prior/posterior
# algebra in calibrate_probabilties and the fitted-population bookkeeping in
# internal_fit are order-sensitive, so the code is left byte-identical.
# NOTE(review): transform_no_copy mutates its argument in place by design;
# transform() is the copying wrapper.
# NOTE(review): get_feature_scores indexes self.pipeline.steps[2][1] — assumes
# the classifier is always the third step; confirm if steps ever change.
class PennyModel: """ The Model for the penny auction. Takes a sklearn classifier and fits the model after transformation. Attributes: model (SklearnClassifier): The model for the regression user_scaler (bool): Whether or not to scale the data first sampling_ratio (float): The ratio of the minority class to the majority class numeric_features (list(str)): The numerical features of the model cateogorical_features (list(str)): The categorical features of the model """ def __init__(self, model, use_scaler=False, sampling_ratio=1): """ Parameters: Returns: """ self.model = model self.sampling_ratio = sampling_ratio self.use_scaler = use_scaler self.categorical_features = [ 'cardtype', 'limited_allowed', 'is_locked', 'is_bidomatic', 'is_bidomatic0', 'is_bidomatic1', 'is_bidomatic2', 'is_bidomatic3' ] self.numeric_features = [ 'bid', 'cashvalue', 'bidvalue', 'prevusers', 'bids_so_far0', 'perc_to_bin0', 'bom_bids_so_far0', 'bom_streak0', 'prev_is_new_user0', 'prev_auction_count0', 'prev_overbid0', 'prev_giveup_one0', 'prev_give_before_six0', 'prev_wins0', 'prev_bids0', 'prev_bom_bids0', 'distance1', 'bids_so_far1', 'perc_to_bin1', 'bom_bids_so_far1', 'bom_streak1', 'prev_is_new_user1', 'prev_auction_count1', 'prev_overbid1', 'prev_giveup_one1', 'prev_give_before_six1', 'prev_wins1', 'prev_bids1', 'prev_bom_bids1', 'distance2', 'bids_so_far2', 'perc_to_bin2', 'bom_bids_so_far2', 'bom_streak2', 'prev_is_new_user2', 'prev_auction_count2', 'prev_overbid2', 'prev_giveup_one2', 'prev_give_before_six2', 'prev_wins2', 'prev_bids2', 'prev_bom_bids2', 'distance3', 'bids_so_far3', 'perc_to_bin3', 'bom_bids_so_far3', 'bom_streak3', 'prev_is_new_user3', 'prev_auction_count3', 'prev_overbid3', 'prev_giveup_one3', 'prev_give_before_six3', 'prev_wins3', 'prev_bids3', 'prev_bom_bids3', 'is_weekend', 'time_of_day' ] def get_features_as_string(self): """ Returns all the features of the model. 
Parameters: Returns: """ return ",".join(self.categorical_features + self.numeric_features) def get_column_names_from_ColumnTransformer(self, column_transformer): """ Parameters: Returns: """ col_name = [] for transformer_in_columns in column_transformer.transformers_[: -1]: #the last transformer is ColumnTransformer's 'remainder' raw_col_name = transformer_in_columns[2] if isinstance(transformer_in_columns[1], Pipeline): transformer = transformer_in_columns[1].steps[-1][1] else: transformer = transformer_in_columns[1] try: names = transformer.get_feature_names( self.categorical_features) except AttributeError: # if no 'get_feature_names' function, use raw column name names = raw_col_name if isinstance(names, np.ndarray): # eg. col_name += names.tolist() elif isinstance(names, list): col_name += names elif isinstance(names, str): col_name.append(names) return col_name def transform(self, X): """ Parameters: Returns: """ rX = X.copy() return self.transform_no_copy(rX) def transform_no_copy(self, X): """ Parameters: Returns: """ #rX = X.copy() #print ("2. 
Transforming data") X.is_bidomatic0 = X.is_bidomatic0.astype(str) X.is_bidomatic1 = X.is_bidomatic1.astype(str) X.is_bidomatic2 = X.is_bidomatic2.astype(str) X.is_bidomatic3 = X.is_bidomatic3.astype(str) X["fee"] = [ 0 if x == 0 else (1 if x < 50 else 1.99) for x in X["cardvalue"] ] X["time_of_day"] = [x.hour for x in X["auctiontime"]] X["is_weekend"] = [x.weekday() >= 6 for x in X["auctiontime"]] return X def internal_fit(self, X, y): """ Fits self.model Parameters: Returns: """ self.train_pop = X.shape[0] self.target_pop = sum(y) self.sampled_train_pop = self.target_pop / self.sampling_ratio + self.target_pop self.sampled_target_pop = self.target_pop numeric_transformer = Pipeline_imb( steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-1)) # ('scaler', StandardScaler()) ]) categorical_transformer = Pipeline_imb( steps=[('imputer', SimpleImputer(strategy='constant', fill_value='unknown')), ('onehot', OneHotEncoder(handle_unknown='error', drop='first'))]) preprocessor = ColumnTransformer( transformers=[('num', numeric_transformer, self.numeric_features), ('cat', categorical_transformer, self.categorical_features)]) steps = [('preprocessor', preprocessor)] steps.append( ('sampler', RandomUnderSampler(sampling_strategy=self.sampling_ratio))) steps.append(('classifier', self.model)) self.pipeline = Pipeline_imb(steps=steps) print("4. Fitting model") self.pipeline.fit(X, y) def fit_already_transformed(self, X, y): """ fits X if it's already been transformed. Parameters: Returns: """ self.internal_fit(X, y) def fit_transform(self, X, y): """ fits and transforms X. Parameters: Returns: """ self.transform_no_copy(X) self.internal_fit(X, y) def pickle(self, filename): """ Writes this class as a pickle file to filename Parameters: Returns: """ print("5. Pickling model as penny_auction.pickle") pickle.dump(self, open(filename, "wb")) def predict_proba(self, X): """ Returns the predicted probabilities that the auction will end, in the UNDERSAMPLED data set. 
Parameters: Returns: """ return self.pipeline.predict_proba(self.transform(X)) def predict_proba_calibrated(self, X): """ Returns the probabilities from the model AFTER accounting for the undersampling. Parameters: Returns: """ return self.calibrate_probabilties(self.predict_proba(X)) def predict(self, X): """ Calls predict on the model to get binary whether or not the auction will end. Parameters: Returns: """ return self.pipeline.predict(self.transform(X)) def get_feature_scores(self): """ Returns the feature importances from the model Parameters: Returns: """ return pd.Series(self.pipeline.steps[2][1].feature_importances_, index=self.get_column_names_from_ColumnTransformer( self.pipeline.named_steps['preprocessor'])) def calibrate_probabilties(self, data): """ Recalibrates the probabilities to account for the undersampling. So if the model says 20%, it will comeout as something like 1.2% Parameters: Returns: """ calibrated_data = \ ((data * (self.target_pop / self.train_pop) / (self.sampled_target_pop / self.sampled_train_pop)) / (( (1 - data) * (1 - self.target_pop / self.train_pop) / (1 - self.sampled_target_pop / self.sampled_train_pop) ) + ( data * (self.target_pop / self.train_pop) / (self.sampled_target_pop / self.sampled_train_pop) ))) return calibrated_data def get_actual_and_potential_profits(self, X, y): """ returns the actual and potential profits over X Parameters: Returns: """ potential_profits = (X.cashvalue - X.fee - X.bid / 100) - .4 actual_profits = y * (X.cashvalue - X.fee - X.bid / 100) - .4 return potential_profits, actual_profits def get_score(self, X, y): """ Returns the expected profit over the set X Parameters: Returns: """ cprobs = self.predict_proba_calibrated(X)[:, 1] pp, ap = self.get_actual_and_potential_profits(X, y) expected_value = np.multiply(cprobs, pp) - (1 - cprobs) * .4 return sum(ap[expected_value > 0])
def train_model(transactions_details):
    """Train and evaluate a fraud classifier on transaction data.

    Builds a preprocess (impute+scale numericals, one-hot categoricals) ->
    SMOTE -> RandomForest pipeline, fits it on a 80/20 split, prints
    classification metrics, plots ROC and precision-recall curves, and
    returns the fitted pipeline.
    """
    X = transactions_details.drop(columns='fraudster')
    y = transactions_details['fraudster'].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    categorical_cols = [
        'currency', 'transaction_state', 'type', 'source', 'entry_method',
        'is_crypto', 'merchant_country', 'phone_country', 'user_country',
        'kyc'
    ]
    numerical_cols = [
        'failed_sign_in_attempts', 'age', 'diff_in_days', 'amount_usd'
    ]
    # Pre-processing pipeline: feature scaling + one-hot encoding.
    preprocess = make_column_transformer(
        (make_pipeline(SimpleImputer(), StandardScaler()), numerical_cols),
        (OneHotEncoder(handle_unknown='ignore'), categorical_cols))
    # SMOTE runs only during fit, so the test fold stays untouched.
    model = Pipeline([('preprocess', preprocess),
                      ('sampling', SMOTE(random_state=42)),
                      ('classification', RandomForestClassifier())])
    model.fit(X_train, y_train)

    # Hard labels for thresholded metrics.
    y_pred = model.predict(X_test)
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification report:\n', classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

    # Positive-class scores for ranking metrics. The original fed the hard
    # 0/1 predictions to roc_curve / precision_recall_curve /
    # average_precision_score, which yields degenerate two-point curves;
    # these metrics are defined over continuous scores.
    probs = model.predict_proba(X_test)
    scores = probs[:, 1]
    false_positive_rate, true_positive_rate, threshold = roc_curve(
        y_test, scores)
    roc_auc = roc_auc_score(y_test, scores)
    print('ROC AUC Score:', roc_auc)
    precision, recall, thresholds = precision_recall_curve(y_test, scores)
    average_precision = average_precision_score(y_test, scores)

    # Plot the ROC and precision-recall curves.
    plot_roc_curve(false_positive_rate, true_positive_rate, roc_auc)
    plot_pr_curve(recall, precision, average_precision)
    return model
# Second TargetEnsembler variant: per-target pipelines whose 0/1 outputs are
# ANDed into a strict label, then stacked with four raw features into a
# DecisionTree combiner (self.pipe_combined).
# NOTE(review): fit() predicts with self.pipe_intrusion / pipe_avoidance /
# pipe_hypertension / pipe_regression and uses X_intrusion etc., but none of
# those pipelines or matrices are created or trained anywhere in this class —
# as written, fit() would raise AttributeError/NameError unless they are
# injected externally; confirm intended wiring before use.
# NOTE(review): predict() builds ``preds`` as a list of column NAMES that are
# never added to X_test, so X_test[... + preds] would raise KeyError; the
# per-target predictions computed just above it are discarded. Looks like an
# unfinished port of the fit-time stacking — left untouched pending clarification.
class TargetEnsembler(object): def __init__(self, features): self.features = features def fit(self, X_train, y_train): # create list of targets # self.pipelines_list = [] # self.preds = [] # for i in targets : # x. feature engineering (i) # y = df[i] # cv_scores (x, y, pipeline_per_target[i]) # model = pipeline_per_target[i].train(x, y) # pipelines_list.append(model) # preds.append(model.pred(x)) # y = df[y] # combined_model = LogReg.train(preds, y) # print results.... # def pred(X): # if intrusion: y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion) else: y_pred_intrusion = 1 if avoidance: y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance) else: y_pred_avoidance = 1 if hypertension: y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension) else: y_pred_hypertension = 1 if depression: y_pred_depression = self.pipe_depression.predict(X_depression) else: y_pred_depression = 1 if only_avoidance: y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance) else: y_pred_only_avoidance = 1 if PCL_Strict3: y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3) else: y_pred_PCL_Strict3 = 1 if regression_cutoff_33: y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33) else: y_pred_regression_cutoff_33 = 1 if regression_cutoff_50: y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50) else: y_pred_regression_cutoff_50 = 1 if tred_cutoff: y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff) else: y_pred_tred_cutoff = 1 y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff) y_target = y_train acc = accuracy_score(y_target, y_pred) f1 = f1_score(y_target, y_pred) recall = recall_score(y_target, y_pred) precision = precision_score(y_target, y_pred) print("training scores") 
print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}") # combined y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension) y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance) y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion) y_pred_regression = self.pipe_regression.predict(X_regression) X_train["y_pred_hypertension"] = y_pred_hypertension X_train["y_pred_avoidance"] = y_pred_avoidance X_train["y_pred_intrusion"] = y_pred_intrusion X_train["y_pred_regression"] = y_pred_regression preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"] X_combined = X_train[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values y_combined = y_train self.pipe_combined = Pipeline(steps=[ ('classifier', DecisionTreeClassifier())]) scores = cross_val_score(self.pipe_combined, X_combined, y_combined, scoring='precision', cv=StratifiedKFold(5)) print(f"hypertension {sum(scores)/5}") self.pipe_combined.fit(X_combined, y_combined) def predict(self, X_test): if intrusion: X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features], "intrusion_cutoff").engineer_features().values y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff) else: y_pred_intrusion = 1 if avoidance: X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features], "avoidance_cutoff").engineer_features().values y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff) else: y_pred_avoidance = 1 if hypertension: X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features], "hypertention_cutoff").engineer_features().values y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff) else: y_pred_hypertension = 1 if depression: X_test_depression_cutoff = FeatureEngineering(X_test[self.features], "depression_cutoff").engineer_features().values y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff) else: y_pred_depression = 1 if 
only_avoidance: X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features], "only_avoidance_cutoff").engineer_features().values y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff) else: y_pred_only_avoidance = 1 if PCL_Strict3: X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3) else: y_pred_PCL_Strict3 = 1 if regression_cutoff_33: X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features], "regression_cutoff_33").engineer_features().values y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33) else: y_pred_regression_cutoff_33 = 1 if regression_cutoff_50: X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features], "regression_cutoff_50").engineer_features().values y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50) else: y_pred_regression_cutoff_50 = 1 if tred_cutoff: X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff) else: y_pred_tred_cutoff = 1 y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression & y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 & y_pred_regression_cutoff_50 & y_pred_tred_cutoff) preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"] X_combined = X_test[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values y_pred = self.pipe_combined.predict(X_combined) return y_pred