def test_predict_with_predict_params():
    """Pipeline must forward **predict_params to the final estimator's predict."""
    estimator = DummyEstimatorParams()
    pipe = Pipeline([("transf", Transf()), ("clf", estimator)])
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)
    # the keyword argument must have reached the last step
    assert pipe.named_steps["clf"].got_attribute
def test_predict_with_predict_params():
    # Verify that keyword arguments passed to Pipeline.predict are handed
    # through to the final estimator when predict is invoked.
    steps = [('transf', Transf()), ('clf', DummyEstimatorParams())]
    pipe = Pipeline(steps)
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)
    final_step = pipe.named_steps['clf']
    assert final_step.got_attribute
def test_pipeline_methods_rus_pca_svm():
    """Smoke-test every prediction method of a RUS + PCA + SVC pipeline."""
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    pipe = Pipeline([
        ("rus", RandomUnderSampler(random_state=0)),
        ("pca", PCA()),
        ("svc", SVC(gamma="scale", probability=True, random_state=0)),
    ])
    pipe.fit(X, y)
    # no value checks -- just make sure each API call runs without error
    for call in (pipe.predict, pipe.predict_proba, pipe.predict_log_proba):
        call(X)
    pipe.score(X, y)
# Exemple #4
    def three_models_combined(self, intrusion_features, avoidance_features, hypertension_features):
        """Train three separate classifiers (intrusion / avoidance /
        hypertension cut-offs) and combine their test predictions by
        elementwise product (logical AND on 0/1 labels) into one final label.

        :param intrusion_features: feature columns for the intrusion model
        :param avoidance_features: feature columns for the avoidance model
        :param hypertension_features: feature columns for the hypertension model
        """

        # drop rows missing any of the three target cut-off columns
        self.df = self.df[~self.df['intrusion_cutoff'].isna()]
        self.df = self.df[~self.df['avoidance_cutoff'].isna()]
        self.df = self.df[~self.df['hypertention_cutoff'].isna()]
        print("self.df.shape", self.df.shape)
        X = self.df
        Y = self.df[self.target]# strict
        # keep all four targets in y_train/y_test so each sub-model picks its own
        all_Y = [self.target, "intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"]


        X_train, X_test, y_train, y_test = train_test_split(X, self.df[all_Y], test_size=0.25, random_state = 8526566, stratify=Y)

        # intrusion: SMOTE oversampling + XGBoost
        # NOTE(review): the step is named 'rfe' but actually holds BorderlineSMOTE
        X_intrusion = X_train[intrusion_features].values
        y_intrusion = y_train["intrusion_cutoff"].apply(lambda x: int(x))
        pipe_intrusion = Pipeline(steps=[
            ('rfe', BorderlineSMOTE()),
            ('classifier', XGBClassifier(n_estimators=100, reg_alpha=1))])
        scores = cross_val_score(pipe_intrusion, X_intrusion, y_intrusion, scoring='precision', cv=StratifiedKFold(5))
        print(f"intrusion {sum(scores)/5}")  # mean precision over the 5 folds
        pipe_intrusion.fit(X_intrusion, y_intrusion)

        # avoidance: XGBoost with class weighting instead of resampling
        X_avoidance = X_train[avoidance_features].values
        y_avoidance = y_train["avoidance_cutoff"].apply(lambda x: int(x))
        pipe_avoidance = Pipeline(steps=[
            ('classifier', XGBClassifier(n_estimators=100, scale_pos_weight=3, reg_alpha=1))])
        scores = cross_val_score(pipe_avoidance, X_avoidance, y_avoidance, scoring='precision', cv=StratifiedKFold(5))
        print(f"avoidance {sum(scores)/5}")
        pipe_avoidance.fit(X_avoidance, y_avoidance)


        # hypertension: balanced bagging handles the class imbalance
        X_hypertension = X_train[hypertension_features].values
        y_hypertention = y_train["hypertention_cutoff"].apply(lambda x: int(x))
        pipe_hypertension = Pipeline(steps=[
            ('classifier', BalancedBaggingClassifier(n_estimators=100))])
        scores = cross_val_score(pipe_hypertension, X_hypertension, y_hypertention, scoring='precision', cv=StratifiedKFold(5))
        print(f"hypertension {sum(scores)/5}")
        pipe_hypertension.fit(X_hypertension, y_hypertention)

        ## combine three classifiers: positive only when ALL three agree
        X_test_hypertension = X_test[hypertension_features].values
        X_test_avoidance = X_test[avoidance_features].values
        X_test_intrusion = X_test[intrusion_features].values

        y_pred_hypertension = pipe_hypertension.predict(X_test_hypertension)
        y_pred_avoidance = pipe_avoidance.predict(X_test_avoidance)
        y_pred_intrusion = pipe_intrusion.predict(X_test_intrusion)
        # elementwise product of 0/1 labels == logical AND
        y_pred = (y_pred_hypertension * y_pred_avoidance * y_pred_intrusion)

        # NOTE(review): hard-coded "PCL_Strict3" -- presumably equals self.target; confirm
        y_target = y_test["PCL_Strict3"].apply(lambda x: int(x))

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_methods_pca_svm():
    """Exercise fit and every prediction method of a PCA('mle') + SVC pipeline."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    reducer = PCA(svd_solver='full', n_components='mle', whiten=True)
    classifier = SVC(gamma='scale', probability=True, random_state=0)
    pipe = Pipeline([('pca', reducer), ('svc', classifier)])
    pipe.fit(X, y)
    # each prediction-related method must run end to end
    for name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, name)(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Smoke-test a SelectKBest(f_classif) + LogisticRegression pipeline."""
    data = load_iris()
    X, y = data.data, data.target
    pipe = Pipeline([
        ("anova", SelectKBest(f_classif, k=2)),
        ("logistic", LogisticRegression(solver="lbfgs", multi_class="auto")),
    ])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
# Exemple #7
def test_pipeline_methods_anova():
    # ANOVA feature selection followed by logistic regression: make sure
    # the entire prediction API runs end to end on iris.
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    anova = SelectKBest(f_classif, k=2)
    logistic = LogisticRegression()
    pipe = Pipeline([('anova', anova), ('logistic', logistic)])
    pipe.fit(X, y)
    for method in (pipe.predict, pipe.predict_proba, pipe.predict_log_proba):
        method(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Run every prediction method of an anova + logistic pipeline on iris."""
    iris = load_iris()
    pipe = Pipeline([('anova', SelectKBest(f_classif, k=2)),
                     ('logistic', LogisticRegression())])
    pipe.fit(iris.data, iris.target)
    pipe.predict(iris.data)
    pipe.predict_proba(iris.data)
    pipe.predict_log_proba(iris.data)
    pipe.score(iris.data, iris.target)
def test_pipeline_methods_pca_svm():
    # PCA + probabilistic SVC: each prediction method should run on iris.
    iris = load_iris()
    X, y = iris.data, iris.target
    pipe = Pipeline([('pca', PCA()),
                     ('svc', SVC(probability=True, random_state=0))])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
# Exemple #10
def test_pipeline_methods_pca_svm():
    """Smoke-test the full prediction API of a default-PCA + SVC pipeline."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    classifier = SVC(probability=True, random_state=0)
    pipe = Pipeline([('pca', PCA()), ('svc', classifier)])
    pipe.fit(X, y)
    # exercise each method; values themselves are not checked here
    for name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, name)(X)
    pipe.score(X, y)
def test_pipeline_methods_pca_svm():
    """Fit PCA('mle') + SVC and call each prediction-related method once."""
    iris = load_iris()
    X, y = iris.data, iris.target
    steps = [
        ("pca", PCA(svd_solver="full", n_components="mle", whiten=True)),
        ("svc", SVC(gamma="scale", probability=True, random_state=0)),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
# Exemple #12
    def illigal_genralization_checking(self, X_test, y_test):
        """Cross-validate an XGBoost pipeline on the training frame, then
        evaluate the fitted model on an external (X_test, y_test) split.

        NOTE(review): despite the generic name, y_test is re-derived from
        the 'intrusion_cutoff' column, so this checks that target only.

        :param X_test: external test frame; restricted to self.features
        :param y_test: external test frame holding 'intrusion_cutoff'
        """

        X = self.df[self.features]
        X_test = X_test[self.features]  # keep the same feature columns as training
        Y = self.df[self.target]
        pipe = Pipeline(
            steps=[('classifier',
                    XGBClassifier(
                        n_estimators=1000, scale_pos_weight=3, reg_alpha=1))])
        # binary target as plain ints
        y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))
        scores = cross_val_score(pipe,
                                 X,
                                 Y,
                                 scoring='precision',
                                 cv=StratifiedKFold(5))
        print(self.features)
        print("cross vl scores")
        print(sum(scores) / 5)  # mean precision across the 5 folds
        pipe.fit(X, Y.values)
        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_methods_preprocessing_svm():
    """Check the output shapes of each pipeline prediction method for two
    preprocessors (StandardScaler and a randomized, whitened PCA)."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )
    preprocessors = (
        StandardScaler(),
        PCA(n_components=2, svd_solver="randomized", whiten=True),
    )

    for prep in preprocessors:
        pipe = Pipeline([("preprocess", prep), ("svc", clf)])
        pipe.fit(X, y)

        # each prediction function must return the expected shape
        assert pipe.predict(X).shape == (n_samples, )
        assert pipe.predict_proba(X).shape == (n_samples, n_classes)
        assert pipe.predict_log_proba(X).shape == (n_samples, n_classes)
        assert pipe.decision_function(X).shape == (n_samples, n_classes)
        pipe.score(X, y)
def Predict(data, mode):
    """Fit a TF-IDF + (SVD & similarity features) + SVC pipeline on the
    training split and predict search relevance for the test split.

    Parameters
    ----------
    data : tuple of (train, test) DataFrames with preprocessed
        'query_preprocessed' / 'product_title_preprocessed' columns,
        an 'id' column on test, and 'median_relevance' labels on train.
    mode : str
        'eda' for the plain pipeline, 'sampling' to insert SVMSMOTE
        resampling before the classifier.

    Returns
    -------
    (submission, submission_probas) : predicted labels keyed by test id,
        and the per-class probabilities indexed by test id.

    Raises
    ------
    ValueError : if mode is neither 'eda' nor 'sampling'.
    """
    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    # flatten the preprocessed text columns into plain lists of strings
    train_query = list(
        train.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    train_title = list(
        train.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    test_query = list(
        test.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    test_title = list(
        test.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    # BUG FIX: a first stop-word set (HTML/number tokens) was built and then
    # immediately overwritten by the next assignment; the dead statement is
    # removed. Only sklearn's English list united with NLTK's is used.
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(min_df=7, max_features=None,
                               strip_accents='unicode', analyzer='word',
                               token_pattern=r'\w{1,}', ngram_range=(1, 3),
                               use_idf=True, smooth_idf=True,
                               sublinear_tf=True, stop_words=stop_words)

    # learn the vocabulary on train text only, then transform both splits
    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    # shared components for both modes
    svd = TruncatedSVD(n_components=200)
    scl = StandardScaler(with_mean=False)
    svm = SVC(C=10,
              gamma="auto",
              kernel="rbf",
              class_weight=None,
              probability=True)
    if mode == 'eda':
        clf = Pipeline([('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
                        ('scl', scl),
                        ('svm', svm)])
    elif mode == 'sampling':
        sampling = SVMSMOTE(svm_estimator=svm, k_neighbors=4)
        clf = Pipeline([('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
                        ('scl', scl),
                        ('sampling', sampling),
                        ('svm', svm)])
    else:
        # BUG FIX: an unknown mode previously fell through and crashed later
        # with NameError on `clf`; fail fast with a clear message instead.
        raise ValueError("mode must be 'eda' or 'sampling', got %r" % (mode,))

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)

    return submission, submission_probas
# Exemple #15
def test_pipeline_methods_preprocessing_svm():
    # Shape checks for each prediction method, run with two different
    # preprocessors in front of a probabilistic SVC.
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [StandardScaler(), PCA(n_components=2)]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)
        assert_equal(pipe.predict(X).shape, (n_samples, ))
        assert_equal(pipe.predict_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.predict_log_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.decision_function(X).shape, (n_samples, n_classes))
        pipe.score(X, y)
# Exemple #16
def test_pipeline_methods_anova_rus():
    # RandomUnderSampler + ANOVA selection + LogisticRegression on an
    # imbalanced synthetic problem; every prediction method must run.
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    pipe = Pipeline([('rus', RandomUnderSampler(random_state=0)),
                     ('anova', SelectKBest(f_classif, k=2)),
                     ('logistic', LogisticRegression())])
    pipe.fit(X, y)
    for method in (pipe.predict, pipe.predict_proba, pipe.predict_log_proba):
        method(X)
    pipe.score(X, y)
def test_pipeline_methods_preprocessing_svm():
    """Verify prediction-method output shapes with scaler and PCA fronts."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples, n_classes = X.shape[0], len(np.unique(y))
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')
    fronts = (StandardScaler(), PCA(n_components=2))

    for front in fronts:
        pipe = Pipeline([('preprocess', front), ('svc', clf)])
        pipe.fit(X, y)

        # 1-D label vector, 2-D probability/log-probability/decision matrices
        assert_equal(pipe.predict(X).shape, (n_samples,))
        assert_equal(pipe.predict_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.predict_log_proba(X).shape, (n_samples, n_classes))
        assert_equal(pipe.decision_function(X).shape, (n_samples, n_classes))
        pipe.score(X, y)
def test_pipeline_methods_anova_rus():
    """Smoke-test a rus + anova + logistic pipeline on imbalanced data."""
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    sampler = RandomUnderSampler(random_state=0)
    selector = SelectKBest(f_classif, k=2)
    estimator = LogisticRegression()
    pipe = Pipeline([('rus', sampler), ('anova', selector),
                     ('logistic', estimator)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_rus_pca_svm():
    """Undersample, reduce with PCA, classify with SVC; smoke-test the API."""
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    sampler = RandomUnderSampler(random_state=0)
    reducer = PCA()
    classifier = SVC(probability=True, random_state=0)
    pipe = Pipeline([('rus', sampler), ('pca', reducer), ('svc', classifier)])
    pipe.fit(X, y)
    for name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, name)(X)
    pipe.score(X, y)
def train_test_and_evaluate(seed, X_train, X_test, y_train, y_test, onehots,
                            numericals, cv):
    """Build an over/under-sampled preprocessing + logistic-regression
    pipeline, cross-validate on training data, then report Gini scores on
    the test and validation sets.

    Parameters
    ----------
    seed : random seed for the resamplers.
    X_train, X_test, y_train, y_test : train/test split of the data.
    onehots : column names to one-hot encode (chi2-preselected, k=15).
    numericals : column names to standardize and compress with PCA.
    cv : CV splitter (or fold count) for the out-of-fold estimate.
    """
    # one-hot branch: pre-select 15 features by chi-square, then encode
    selector = SelectKBest(chi2, k=15)
    ohc = OneHotEncoder(handle_unknown='ignore')
    onehot_transformer = Pipeline(steps=[('selector', selector), ('ohc', ohc)])

    # numeric branch: scale features and shrink the space using PCA
    scaler = StandardScaler()
    pca = PCA()
    numeric_transformer = Pipeline(steps=[('scaler', scaler), ('pca', pca)])

    # resampling: oversample the minority class first, then undersample the
    # majority class down to a 2:1 ratio.
    # BUG FIX: the `seed` parameter was previously ignored in favour of a
    # module-level `random_seed` global.
    over = RandomOverSampler(random_state=seed, sampling_strategy=0.1)
    under = RandomUnderSampler(random_state=seed, sampling_strategy=0.5)

    # combine both branches into a single preprocessing step
    preprocessor = ColumnTransformer(
        transformers=[('numeric', numeric_transformer, numericals),
                      ('onehot', onehot_transformer, onehots)])

    pipe_model = Pipeline(
        steps=[('over', over), ('under', under),
               ('prep', preprocessor),
               ('classifier', LogisticRegression(max_iter=1000))])

    # Gini = 2 * AUC - 1, estimated from out-of-fold predicted probabilities
    cv_results = 2 * roc_auc_score(
        y_train,
        cross_val_predict(
            pipe_model, X_train, y_train, cv=cv,
            method='predict_proba')[:, 1]) - 1
    # BUG FIX: cv_results is a scalar -- calling .mean() on it fails when it
    # is a plain float; print it directly.
    print("Mean training gini after CV: {res}".format(res=cv_results))

    # refit on the full training data and evaluate on the held-out test set
    pipe_model.fit(X_train, y_train)
    gini = 2 * roc_auc_score(y_test,
                             pipe_model.predict_proba(X_test)[:, 1]) - 1
    print("Gini score on test set: " + str(gini))

    # NOTE(review): X_val / y_val are not parameters -- they are assumed to
    # be module-level globals; consider passing them in explicitly.
    gini = 2 * roc_auc_score(y_val, pipe_model.predict_proba(X_val)[:, 1]) - 1
    print("Gini score on validation set: " + str(gini))
def test_pipeline_fit_params():
    # Fit parameters prefixed with the step name must reach that step.
    pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # the classifier saw should_succeed=True, so predict returns True
    assert_true(pipe.predict(None))
    # the transformer's own parameters must remain untouched
    transf = pipe.named_steps['transf']
    assert_true(transf.a is None)
    assert_true(transf.b is None)
# Exemple #22
def test_pipeline_fit_params():
    """Pipeline.fit must route clf__-prefixed params to the classifier only."""
    steps = [('transf', TransfT()), ('clf', FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert_true(pipe.predict(None))
    # and transformer params should not be changed
    assert_true(pipe.named_steps['transf'].a is None)
    assert_true(pipe.named_steps['transf'].b is None)
class ImblearnRecalibrator(BaseEstimator, ClassifierMixin):
    """Recalibrate class probabilities biased by imblearn resampling.

    Wraps an estimator and an imblearn resampler into a Pipeline so the
    recalibration logic does not have to be rewritten each time; following
    the scikit-learn design, specify the estimator, resampler and sampling
    ratio, then simply fit & predict / predict_proba.

    Note: resampling imbalanced data targets classification performance;
    there is no guarantee it helps discrimination metrics.

    :param estimator: scikit-learn API compliant estimator object
    :param resampler: any imblearn resampler object
    :param post_minor_rate: minority share of ALL samples AFTER resampling.
        Default is None. Use either this or ``alpha``, not both.
    :param alpha: posterior minority ratio relative to the PRE-resampling
        data. Default is 'auto'. Use either this or ``post_minor_rate``.
    """
    def __init__(self,
                 estimator,
                 resampler,
                 alpha='auto',
                 post_minor_rate=None):
        resampler = clone(resampler)
        if post_minor_rate is None and alpha is None:
            # NOTE(review): on this path resampling_strategy is never set,
            # so fit() will raise AttributeError -- confirm intended.
            warnings.warn(
                'neither of `post_minor_rate` nor `alpha` are specified. Instead resampling stragegy specified in `resampler` object is used.'
            )
        elif post_minor_rate and alpha:
            warnings.warn(
                'both of `post_minor_rate` and `alpha` are specified. the former is applied.'
            )
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif post_minor_rate:
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif alpha:
            self.alpha = alpha
            self.resampling_strategy = 'alpha'
            resampler.set_params(sampling_strategy=alpha)
        else:
            # BUG FIX: previously `raise ('initialized error')` raised a str,
            # which itself fails with "TypeError: exceptions must derive from
            # BaseException". Raise a proper exception instead.
            raise ValueError('initialization error')
        self.estimator_ = Pipeline([('resampler', resampler),
                                    ('estimator', clone(estimator))])

    def fit(self, X, y):
        """Fit the resampler + estimator pipeline on (X, y)."""
        if self.resampling_strategy == 'posterior_rate':
            # translate the desired post-resampling rate into a sampling ratio
            alpha = get_oversampling_rate(self.post_minor_rate)
            self.alpha = alpha
            self.estimator_['resampler'].set_params(sampling_strategy=alpha)
        self.estimator_.fit(X, y)
        # empirical minority rate of the ORIGINAL (pre-resampling) data
        self.minor_rate_ = np.min([y.mean(), 1 - y.mean()])
        return self

    def predict(self, X):
        """Predict class labels (labels need no recalibration)."""
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        """Predict probabilities, corrected for the resampling bias."""
        return calibrate_imbalanceness(self.estimator_.predict_proba(X),
                                       pos_rate=get_oversampling_power(
                                           self.alpha, self.minor_rate_))
def predict(test_set_name: list, pipeline: Pipeline) -> pd.DataFrame:
    """Run the trained *pipeline* on the named test set, save the result as
    the submission file, and return the submission DataFrame.
    """
    features = model_pipeline_io.get_test_set(test_set_name)
    predictions = pipeline.predict(features)
    submission = pd.DataFrame(data=predictions)
    # persist before returning so callers always get a saved artifact
    model_pipeline_io.save_submit_file(submission, "submission.csv")
    return submission
def test_pipeline_fit_params():
    """Pipeline.fit routes step-prefixed params and rejects unknown ones."""
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # FitParamT returns True once told to succeed
    assert pipe.predict(None)
    # the transformer's own params stay untouched
    step = pipe.named_steps['transf']
    assert step.a is None
    assert step.b is None
    # a parameter the classifier's fit() does not accept must raise
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
def test_pipeline_fit_params():
    # Step-prefixed fit params reach the right step; unknown ones error out.
    steps = [("transf", Transf()), ("clf", FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps["transf"].a is None
    assert pipe.named_steps["transf"].b is None
    # invalid parameters should raise an error message
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
def test_pipeline_memory_transformer():
    """Pipeline(memory=...) must cache the fitted transformer: results match
    an uncached pipeline, a refit hits the cache (timestamp unchanged), and
    the cache is shared even when the step name differs."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        # NOTE(review): the `cachedir=` kwarg is deprecated in newer joblib
        # (use `location=`) -- confirm the pinned joblib version.
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the tranformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # the original transf object must not have been fitted in place
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # unchanged timestamp proves the refit came from the cache
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)  # always remove the on-disk cache
def test_pipeline_memory_transformer():
    """Duplicate of the cached-transformer test using positional
    Memory(cachedir) and SVC(gamma='scale'): cached and uncached pipelines
    agree, refits hit the cache, and renamed steps share the cache entry."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the tranformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # the original transf object must not have been fitted in place
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # unchanged timestamp proves the refit was served from the cache
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)  # always remove the on-disk cache
# Exemple #29
def test_pipeline_fit_params():
    """Step-prefixed fit params reach the right step; unknown ones error."""
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # transformer params must remain untouched
    assert pipe.named_steps['transf'].a is None
    assert pipe.named_steps['transf'].b is None
    # invalid parameters should raise an error message
    assert_raise_message(
        TypeError,
        "fit() got an unexpected keyword argument 'bad'",
        pipe.fit, None, None, clf__bad=True)
# Exemple #30
def start_identification_twitter(user_list, text):
    """Identify which user in `user_list` most likely wrote `text` by
    training a LinearSVC on per-user tweet profiles.

    :param user_list: candidate Twitter handles (empty strings are dropped)
    :param text: writing sample; must be at least 240 characters
    :return: the best-matching handle, or -1 if the text is too short
    """
    if len(text) < 240:
        print('Text Length must be 240 characters. Currently:', len(text))
        return -1
    tw = twitScrape()
    user_profiles = []
    # label bookkeeping persisted across runs in a JSON file
    with open('../config/labels_twitter.json', 'r') as f:
        labels = json.load(f)
    # handles that already have a cached CSV profile on disk
    downloaded_users = [
        user[:-4] for user in os.listdir(TWITTER_PATH) if user.endswith('.csv')
    ]
    user_list = [user for user in user_list if len(user) > 0]
    print('checking user_profiles for downloaded users')
    for user in user_list:
        if user in downloaded_users:
            # reuse the cached profile instead of re-scraping
            user_profiles.append(pd.read_csv(f'{TWITTER_PATH}{user}.csv'))
            continue
        r = tw.getIndivTweets(user)
        if r <= -1:
            # scrape failed for this user; skip them
            continue

        # NOTE(review): profile is built from a .txt path while the cache
        # check above looks for .csv files -- confirm both exist on disk
        user_path = TWITTER_PATH + user + '.txt'
        ip = IdentProfile(user_path, labels['new_label'], email_size=240)
        # NOTE(review): key written as int here, read back as str below --
        # works only because the JSON round trip stringifies keys
        labels['labels'][labels['new_label']] = user
        labels['new_label'] += 1
        df = ip.create_profile()
        df.to_csv(f'../corpora/twitter_corpus_csv/{user}.csv', index=False)
        user_profiles.append(df.copy())
        print('created new user profile for ', user)
    # persist the updated label map
    with open('../config/labels_twitter.json', 'w') as f:
        json.dump(labels, f)
    df = pd.concat(user_profiles, ignore_index=True)
    x, y = split_x_y(df, numpy=True)
    pipe = Pipeline([('MinMaxScaler', MinMaxScaler()),
                     ('SVC', LinearSVC(C=0.5, penalty='l2',
                                       dual=True))])  # type: ignore
    print('Beginning Training')
    pipe.fit(x, y)
    # build a feature profile for the anonymous text and classify it
    ip = IdentText('', text, email_size=240)
    text_df = ip.create_profile()
    x_test = text_df.to_numpy()
    prediction = pipe.predict(x_test)
    prediction = prediction[0]
    print('prediction', prediction)
    choice = labels['labels'][str(prediction)]
    print('Best candidate', choice)
    return choice
# Exemple #31
def main():
    """Train a logistic regression, an attempt to be 'production' grade.

    Reads the processed dataset, fits a baseline SMOTE + logistic-regression
    pipeline, grid-searches its hyperparameters (refit on F1), prints both
    classification reports, and serialises the tuned search object.
    """
    logger = logging.getLogger(__name__)

    logger.info('Reading data')
    processed_df = pd.read_csv('../../data/processed/processed.csv')
    X = processed_df.drop('Class', axis=1).values
    y = processed_df['Class'].values
    X_train, X_test, y_train, y_test = tts(X, y, random_state=random_seed)

    logger.info('Constructing model pipeline')
    # SMOTE runs inside the pipeline, so resampling only touches training folds
    model = Pipeline(
        [
            ('sampling', SMOTE()),
            ('classification', baseline_classifiers['LogisiticRegression'])
        ]
    )

    logger.info('Constructing baseline model')
    model.fit(X_train, y_train)
    baseline_report = classification_report(y_test, model.predict(X_test))
    print(f'Classification report for Baseline model \n{baseline_report}')

    logger.info('Performing Gridsearch')
    gridsearch_cv = GridSearchCV(
        estimator=model,
        param_grid=LogisiticRegression_grid,
        cv=5,
        scoring=model_metrics,
        n_jobs=1,
        refit='F1',  # best_estimator_ is refit on the F1 metric
        return_train_score=True
    )
    gridsearch_cv.fit(X_train, y_train)
    print(f'Best score (log-loss): {gridsearch_cv.best_score_}\nBest Parameters: {gridsearch_cv.best_params_}')
    gridsearch_report = classification_report(y_test, gridsearch_cv.predict(X_test))
    print(f'Classification report for tuned model \n{gridsearch_report}')

    joblib.dump(gridsearch_cv, f'../../models/{best_model_file_name}', compress=9)
    logger.info(f'Serialised model as {best_model_file_name}')
def train_and_evaluate_model(model, params, X_train, y_train, X_test, y_test):
    """Configure *model* with *params*, train it behind SMOTE, and score it.

    Parameters are applied in one ``set_params(**params)`` call (the original
    looped per key for no benefit). The estimator is wrapped in a pipeline
    that oversamples the minority class on the training data only.

    Returns a tuple: (accuracy, precision, recall, f1, roc_auc) on the test set.
    """
    model.set_params(**params)
    pipeline = Pipeline([('sampling', SMOTE(sampling_strategy='minority')),
                         ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score_acc = accuracy_score(y_test, y_pred)
    score_precision = precision_score(y_test, y_pred)
    score_recall = recall_score(y_test, y_pred)
    score_f1 = f1_score(y_test, y_pred)
    score_roc = roc_auc_score(y_test, y_pred)
    return score_acc, score_precision, score_recall, score_f1, score_roc
Exemple #33
0
    def illigal_genralization_checking(self, X_test, y_test):
        """Report cross-validated precision on the training data, then test-set metrics.

        NOTE(review): y_test is reduced to its "intrusion_cutoff" column — presumably
        this check targets the intrusion sub-label; confirm against the caller.
        """
        X = self.df[self.features]
        Y = self.df[self.target]
        X_test = X_test[self.features]
        y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))

        pipe = Pipeline(steps=[
            ('classifier',
             XGBClassifier(n_estimators=1000, scale_pos_weight=3, reg_alpha=1))])

        # 5-fold stratified CV precision on the training frame
        cv_scores = cross_val_score(pipe, X, Y, scoring='precision',
                                    cv=StratifiedKFold(5))
        print(self.features)
        print("cross vl scores")
        print(sum(cv_scores)/5)

        # final fit on all training rows, then held-out evaluation
        pipe.fit(X, Y.values)
        y_pred = pipe.predict(X_test)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def output_test_smote(train, test):
    """Fit a SMOTE-balanced logistic-regression model and write test predictions.

    Text columns go through tf-idf branches, sentiment/subjectivity columns are
    standardised, and predictions for *test* are written to output_smote.csv.
    """
    features = np.array([
        'reviewText', 'summary', 'reviewSentiment', 'summarySentiment',
        'reviewSub', 'summarySub'
    ])

    def _tfidf_branch(key):
        # count -> tf-idf pipeline over a single text column
        return Pipe(steps=[
            ('selector', Selector(key=key)),
            ('cf', CountVectorizer(max_df=.95)),
            ('tf', TfidfTransformer(sublinear_tf=True)),
        ])

    revTf = _tfidf_branch('reviewText')
    sumTf = _tfidf_branch('summary')

    # per-column transformers: scale the numeric signals, vectorize the text
    combine = ColumnTransformer(transformers=[
        ('revSen', StandardScaler(), ['reviewSentiment']),
        ('sumSen', StandardScaler(), ['summarySentiment']),
        ('revSub', StandardScaler(), ['reviewSub']),
        ('sumSub', StandardScaler(), ['summarySub']),
        ('rev', revTf, ['reviewText']),
        ('sum', sumTf, ['summary']),
    ])

    # final pipeline: features -> SMOTE oversampling -> logistic regression
    model = Pipe(steps=[
        ('combine', combine),
        ('smote', SMOTE()),
        ('classifier', LogisticRegression(max_iter=500, warm_start=True,
                                          penalty='l2', C=.8)),
    ])

    print("Fitting")
    model.fit(train[features], train['label'])
    print("Model fit")
    preds = model.predict(test[features])

    # pair each asin with its predicted label and dump to CSV
    out = pd.DataFrame(np.dstack((test["asin"], preds))[0],
                       columns=["asin", "label"])
    out.to_csv("output_smote.csv")
Exemple #35
0
def model_select():
    """For each not-yet-run balancer, fit PCA + balancer + classifier and report metrics.

    Relies on module-level globals: balanceadores, nome, modelo, the full and
    test datasets, and the plotting/ROC helpers.
    """
    for nome_balanceador, balanceador in balanceadores:
        # skip combinations that were already evaluated
        if classificador_ja_executado(nome, nome_balanceador):
            continue

        print(balanceador)
        pipeline = Pipeline([('dimension', PCA(n_components=250)),
                             ('balance', balanceador),
                             ('clf', modelo)])
        print("# Rodando o algoritmo %s" % nome)
        print()

        np.set_printoptions(precision=4)
        pipeline.fit(dados_completo_x, dados_completo_y)

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_pred = pipeline.predict(test_x)
        matriz_confusao = confusion_matrix(test_y, y_pred)
        nome_arquivo = nome + '_' + nome_balanceador + '_best_mucilage'
        # raw and row-normalised confusion-matrix plots
        plot_confusion_matrix(matriz_confusao, nome_arquivo, [1, 2, 3, 4],
                              False,
                              title='Confusion matrix' + nome +
                              ' (best parameters)')
        plot_confusion_matrix(matriz_confusao, nome_arquivo, [1, 2, 3, 4],
                              True,
                              title='Confusion matrix ' + nome +
                              ', normalized')
        print('Matriz de Confusão')
        print(matriz_confusao)
        print(classification_report(y_true=test_y, y_pred=y_pred, digits=4))
        # class probabilities feed the ROC/AUC helper
        probabilidades = pipeline.predict_proba(test_x)
        roc_auc_aux(test_y, probabilidades, nome, nome_balanceador)
        print()
        sys.stdout.flush()
def train_model(X_train, X_test, y_train, y_test, variablepath):
    """Cross-validate, fit, evaluate and persist a RandomForest pipeline.

    Reports the repeated-stratified-CV f1_micro on the training data, fits the
    pipeline, evaluates it on the test split, and pickles the fitted pipeline
    as RFC.sav inside *variablepath* (note: changes the working directory).

    Returns the fitted pipeline.
    """
    steps = [('model', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                          criterion='entropy', max_depth=90, max_features=2,
                          max_leaf_nodes=None, max_samples=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=5,
                          min_weight_fraction_leaf=0.0, n_estimators=16,
                          n_jobs=None, oob_score=False, random_state=None,
                          verbose=0, warm_start=False))]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline with repeated stratified CV before the final fit
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=4, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='f1_micro', cv=cv, n_jobs=-1)
    print('Cross Validated f1 score: %.3f' % mean(scores))
    pipeline.fit(X_train, y_train)
    evaluate(pipeline, X_test, y_test)
    os.chdir(variablepath)
    # save the model to disk; the original leaked the file handle by passing
    # open(...) directly to pickle.dump — use a context manager instead
    filename = 'RFC.sav'
    with open(filename, 'wb') as fh:
        pickle.dump(pipeline, fh)
    return pipeline
Exemple #37
0
def resampling(X, Y, r):
    """Clean the dataset by removing Tomek links and return the resampled pair.

    The parameter *r* is accepted for interface compatibility but unused.
    """
    tomek = TomekLinks()  # undersamples by dropping Tomek-link majority samples
    X_resampled, y_resampled = tomek.fit_resample(X, Y)
    return X_resampled, y_resampled


# pipeline: undersample, standardize, normalize, LDA-project, then linear SVM
pipeline = Pipeline([
    ('und', RandomUnderSampler()),
    #('power', preprocessing.PowerTransformer()),
    ('standardize', preprocessing.StandardScaler()),
    ('normalizer', preprocessing.Normalizer()),
    ('lda', LinearDiscriminantAnalysis()),
    #('logistic', sk.linear_model.SGDClassifier(loss="hinge", eta0=1, learning_rate="constant", penalty='l2'))
    ('svm', LinearSVC(verbose=0, max_iter=3000, class_weight='balanced')),
])

# sweep the SVM regularization strength and report per-class recall (UAR)
com_values = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10]
for c in com_values:
    pipeline.set_params(svm__C=c, und__random_state=42).fit(X_train, Y_train)
    # clf = CalibratedClassifierCV(base_estimator=pipeline, cv=10).fit(X,Y)
    y_p = pipeline.decision_function(X_dev)
    y_pred = pipeline.predict(X_dev)
    print("With:", c)
    print("Confusion matrix:\n", sk.metrics.confusion_matrix(Y_dev, y_pred))
    recall_neg = sk.metrics.recall_score(Y_dev, y_pred, pos_label=0)
    recall_pos = sk.metrics.recall_score(Y_dev, y_pred, pos_label=1)
    print("UAR:", (recall_neg + recall_pos) / 2, "\n")
Exemple #38
0
    axis=1,
    inplace=True)

# Columns to one-hot encode (categoricals) vs. min-max scale (numerics).
cols_to_encode = ['Departure', 'Arrival', 'month', 'day', 'year', 'season']
cols_to_scale = ['WeeksToDeparture', 'std_wtd', 'distance']

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

scaler = MinMaxScaler()
ohe = OneHotEncoder(categories='auto', sparse=False)

scaled_cols = scaler.fit_transform(df_train[cols_to_scale])
encoded_cols = ohe.fit_transform(df_train[cols_to_encode])

# Final design matrix: scaled numeric columns followed by one-hot columns.
processed_df = np.concatenate([scaled_cols, encoded_cols], axis=1)

from sklearn.model_selection import train_test_split
# NOTE(review): y_train is both the split input (full target) and rebound to
# the training slice on the same line — intentional shadowing, keep in mind.
X_train, X_test, y_train, y_test = train_test_split(processed_df,
                                                    y_train,
                                                    test_size=0.25)
y_train = np.ravel(y_train)

from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier

# Oversample the minority class with SMOTE, then classify with k-NN.
pipeline = Pipeline([('ovs', SMOTE()), ('clf', KNeighborsClassifier())])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

from sklearn.metrics import f1_score
score = f1_score(y_test, y_pred, average='micro')
class TargetEnsembler(object):
    """Ensemble of per-sub-target pipelines whose binary predictions are AND-combined.

    Each sub-target enabled by a module-level flag (intrusion, avoidance,
    hypertension, depression, only_avoidance, PCL_Strict3,
    regression_cutoff_33, regression_cutoff_50, tred_cutoff) gets its own
    feature-engineering + feature-selection (+ optional sampling) + classifier
    pipeline. The final prediction is 1 only when every enabled pipeline
    predicts 1 (disabled sub-targets contribute the neutral value 1).
    """

    def __init__(self, features):
        # Feature column names passed to FeatureEngineering for every sub-pipeline.
        self.features = features

    def fit(self, X_train, y_train):
        """Fit one pipeline per enabled sub-target, printing its 5-fold CV f1 first.

        NOTE(review): all labels except PCL_Strict3 are read from X_train
        columns; PCL_Strict3 comes from y_train — verify this asymmetry is
        intentional. Finishes by printing training-set scores of the combined
        AND-prediction against y_train.
        """

        # intrusion
        if intrusion:
            X_intrusion = FeatureEngineering(X_train[self.features], "intrusion_cutoff").engineer_features().values
            y_intrusion = X_train["intrusion_cutoff"].apply(lambda x: int(x))

            self.pipe_intrusion = Pipeline(steps=[
                ('feature_selection', SelectFpr(alpha=0.05)),
                ('sampling', BorderlineSMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(n_estimators=300, max_depth=5))])

            scores = cross_val_score(self.pipe_intrusion, X_intrusion, y_intrusion, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"intrusion {sum(scores)/5}")
            self.pipe_intrusion.fit(X_intrusion, y_intrusion)

        # avoidance
        if avoidance:
            X_avoidance = FeatureEngineering(X_train[self.features], "avoidance_cutoff").engineer_features().values
            y_avoidance = X_train["avoidance_cutoff"].apply(lambda x: int(x))

            self.pipe_avoidance = Pipeline(steps=[
                ('feature_selection',  RFE(estimator=XGBClassifier(scale_pos_weight=5.88, n_estimators=100),
                                           n_features_to_select=20)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=300, max_depth=10))])

            scores = cross_val_score(self.pipe_avoidance, X_avoidance, y_avoidance, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"avoidance {sum(scores)/5}")
            self.pipe_avoidance.fit(X_avoidance, y_avoidance)

        # hypertension
        if hypertension:
            X_hypertension = FeatureEngineering(X_train[self.features], "hypertention_cutoff").engineer_features().values
            y_hypertention = X_train["hypertention_cutoff"].apply(lambda x: int(x))

            self.pipe_hypertension = Pipeline(steps=[
                ('feature_selection',  RFE(estimator=XGBClassifier(n_estimators=100, scale_pos_weight=3.51),
                                           n_features_to_select=20)),
                ( 'sampling', SMOTE(k_neighbors=10)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=100))])

            scores = cross_val_score(self.pipe_hypertension, X_hypertension, y_hypertention, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"hypertension {sum(scores)/5}")
            self.pipe_hypertension.fit(X_hypertension, y_hypertention)

        # depression
        if depression:
            X_depression = FeatureEngineering(X_train[self.features], "depression_cutoff").engineer_features().values
            y_depression = X_train["depression_cutoff"].apply(lambda x: int(x))

            self.pipe_depression = Pipeline(steps=[
                ('feature_selection', SelectFdr(alpha=0.1)),
                ('sampling', SMOTE(k_neighbors=5)),
                ('classifier', RandomForestClassifier(n_estimators=100))])

            scores = cross_val_score(self.pipe_depression, X_depression, y_depression, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"depression {sum(scores)/5}")
            self.pipe_depression.fit(X_depression, y_depression)

        # only_avoidance
        if only_avoidance:
            X_only_avoidance = FeatureEngineering(X_train[self.features], "only_avoidance_cutoff").engineer_features().values
            y_only_avoidance = X_train["only_avoidance_cutoff"].apply(lambda x: int(x))

            self.pipe_only_avoidance = Pipeline(steps=[
                ('feature_selection', RFE(XGBClassifier(n_estimators=100,max_depth=3), n_features_to_select=10)),
                ('classifier', BalancedRandomForestClassifier( n_estimators=500, max_depth=10))])

            scores = cross_val_score(self.pipe_only_avoidance, X_only_avoidance,
                                     y_only_avoidance, scoring='f1', cv=StratifiedKFold(5))
            print(f"only_avoidance {sum(scores)/5}")
            self.pipe_only_avoidance.fit(X_only_avoidance, y_only_avoidance)

        # pcl_strict3
        if PCL_Strict3:
            X_PCL_Strict3 = FeatureEngineering(X_train[self.features], "PCL_Strict3").engineer_features().values
            # NOTE(review): label comes from y_train here, unlike the other targets.
            y_PCL_Strict3 = y_train["PCL_Strict3"].apply(lambda x: int(x))

            self.pipe_PCL_Strict3 = Pipeline(steps=[
                ('feature_selection', SelectKBest(k=20)),
                ('sampling', SMOTE(k_neighbors=5)),
                ('classifier', XGBClassifier(max_depth=3, n_estimators=100))])

            scores = cross_val_score(self.pipe_PCL_Strict3, X_PCL_Strict3,
                                     y_PCL_Strict3, scoring='f1', cv=StratifiedKFold(5))
            print(f"PCL_Strict3 {sum(scores)/5}")
            self.pipe_PCL_Strict3.fit(X_PCL_Strict3, y_PCL_Strict3)


        # cutoff_33
        if regression_cutoff_33:
            X_regression_cutoff_33 = FeatureEngineering(X_train[self.features],
                                                        "regression_cutoff_33").engineer_features().values
            y_regression_cutoff_33 = X_train["regression_cutoff_33"].apply(lambda x: int(x))

            self.pipe_regression_cutoff_33 = Pipeline(steps=[
                ('feature_selection', SelectFpr(alpha=0.033)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', RandomForestClassifier(n_estimators=100, max_depth=5))])

            scores = cross_val_score(self.pipe_regression_cutoff_33, X_regression_cutoff_33,
                                     y_regression_cutoff_33, scoring='f1', cv=StratifiedKFold(5))
            print(f"regression_cutoff_33 {sum(scores)/5}")
            self.pipe_regression_cutoff_33.fit(X_regression_cutoff_33, y_regression_cutoff_33)

        # cutoff 50
        if regression_cutoff_50:
            X_regression_cutoff_50 = FeatureEngineering(X_train[self.features], "regression_cutoff_50").engineer_features().values
            y_regression_cutoff_50 = X_train["regression_cutoff_50"].apply(lambda x: int(x))

            self.pipe_regression_cutoff_50 = Pipeline(steps=[
                ('feature_selection', SelectKBest(k=10)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(max_depth=2, n_estimators=100))])

            scores = cross_val_score(self.pipe_regression_cutoff_50, X_regression_cutoff_50,
                                     y_regression_cutoff_50, scoring='f1', cv=StratifiedKFold(5))
            print(f"regression_cutoff_50 {sum(scores)/5}")
            self.pipe_regression_cutoff_50.fit(X_regression_cutoff_50, y_regression_cutoff_50)

        # tred_cutoff
        if tred_cutoff:
            X_tred_cutoff = FeatureEngineering(X_train[self.features], "tred_cutoff").engineer_features().values
            y_tred_cutoff = X_train["tred_cutoff"].apply(lambda x: int(x))

            self.pipe_tred_cutoff = Pipeline(steps=[
                ('feature_selection', SelectKBest(k=20)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(n_estimators=100, max_depth=2))])

            scores = cross_val_score(self.pipe_tred_cutoff, X_tred_cutoff, y_tred_cutoff, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"tred_cutoff {sum(scores)/5}")
            self.pipe_tred_cutoff.fit(X_tred_cutoff, y_tred_cutoff)

        # target: collect each enabled pipeline's training-set prediction,
        # using 1 as the neutral element for disabled sub-targets
        if intrusion:
            y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        else:
            y_pred_intrusion = 1

        if avoidance:
            y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        else: y_pred_avoidance = 1

        if hypertension:
            y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        else: y_pred_hypertension = 1

        if depression:
            y_pred_depression = self.pipe_depression.predict(X_depression)
        else: y_pred_depression = 1

        if only_avoidance:
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance)
        else: y_pred_only_avoidance = 1

        if PCL_Strict3:
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3)
        else: y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33)
        else: y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50)
        else: y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff)
        else: y_pred_tred_cutoff = 1

        # bitwise AND: positive only when every enabled sub-target is positive
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

    def predict(self, X_test):
        """Return the AND-combined binary prediction of all enabled sub-pipelines.

        Mirrors fit(): features are re-engineered per sub-target, each fitted
        pipeline predicts, and disabled sub-targets contribute the neutral 1.
        """

        if intrusion:
            X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features],
                                                         "intrusion_cutoff").engineer_features().values
            y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff)
        else: y_pred_intrusion = 1

        if avoidance:
            X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                         "avoidance_cutoff").engineer_features().values
            y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff)
        else: y_pred_avoidance = 1

        if hypertension:
            X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features],
                                                            "hypertention_cutoff").engineer_features().values
            y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff)
        else: y_pred_hypertension = 1

        if depression:
            X_test_depression_cutoff = FeatureEngineering(X_test[self.features],
                                                          "depression_cutoff").engineer_features().values
            y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff)
        else: y_pred_depression = 1

        if only_avoidance:
            X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                              "only_avoidance_cutoff").engineer_features().values

            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff)
        else: y_pred_only_avoidance = 1

        if PCL_Strict3:
            X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3)
        else: y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_33").engineer_features().values
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33)
        else: y_pred_regression_cutoff_33 =1

        if regression_cutoff_50:
            X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_50").engineer_features().values
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50)
        else: y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff)
        else: y_pred_tred_cutoff = 1

        # bitwise AND across all enabled sub-target predictions
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)

        return y_pred
Exemple #40
0

import time

# Gaussian Naive Bayes pipeline:
# variance filter -> standardize -> random oversampling -> 1-component PCA -> GNB
selector1 = VarianceThreshold(threshold=3)
scaler1 = StandardScaler()
ros1 = RandomOverSampler()
pca1 = PCA(n_components=1)
gnb1 = GaussianNB()

Gnb_mic = Pipeline(steps=[
    ('selector', selector1),
    ('scaler', scaler1),
    ('sampler', ros1),
    ('pca', pca1),
    ('gnb', gnb1),
])

# time the fit and the prediction separately
time1 = time.time()
Gnb_mic.fit(X_train, y_train)
time2 = time.time()
Gnb_mic_pred = Gnb_mic.predict(X_test)
time3 = time.time()

_, _, Gnb_f1_mic, _ = precision_recall_fscore_support(y_test, Gnb_mic_pred, average='micro')
_, _, f1_mac, _ = precision_recall_fscore_support(y_test, Gnb_mic_pred, average='macro')

print("f1-micro: ", Gnb_f1_mic)
print("f1-macro: ", f1_mac)
print("Accuracy: ", accuracy_score(y_test, Gnb_mic_pred))



# And we get the following confusion matrix:

# In[ ]:
def test_pipeline_memory_sampler():
    """Pipeline(memory=...) memoizes the sampler: cached and uncached pipelines agree.

    Fits a cached and an uncached pipeline, checks identical outputs, then
    verifies via the sampler's timestamp_ that refits (even under a renamed
    step) are served from the joblib cache.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        # the original (uncached) transf instance must stay unfitted
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        # unchanged timestamp proves the refit came from the cache
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_memory_sampler():
    """Pipeline(memory=...) memoizes the sampler (older variant of the same test).

    NOTE(review): duplicate definition — it shadows the earlier function of the
    same name, and uses the deprecated Memory(cachedir=...) keyword signature.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # the original (uncached) transf instance must stay unfitted
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        # unchanged timestamp proves the refit came from the cache
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Exemple #43
0
class PennyModel:
    """
    The Model for the penny auction. Takes a sklearn classifier and fits the
    model after transformation, undersampling the majority class.

    Attributes:
        model (SklearnClassifier): The underlying classifier.
        use_scaler (bool): Whether or not to scale the data first.
            NOTE(review): currently unused — the scaler step in internal_fit
            is commented out; confirm whether scaling should be wired to it.
        sampling_ratio (float): The ratio of the minority class to the
            majority class used by RandomUnderSampler.
        numeric_features (list(str)): The numerical features of the model.
        categorical_features (list(str)): The categorical features of the model.
    """
    def __init__(self, model, use_scaler=False, sampling_ratio=1):
        """
        Store the classifier and sampling configuration, and declare the
        fixed feature-column lists expected in the input DataFrame.

        Parameters:
            model: sklearn-style classifier to wrap.
            use_scaler (bool): whether to scale numeric features
                (see class NOTE — not currently applied).
            sampling_ratio (float): minority/majority ratio passed to
                RandomUnderSampler during fitting.
        Returns:
            None
        """
        self.model = model
        self.sampling_ratio = sampling_ratio
        self.use_scaler = use_scaler
        # Columns treated as categorical (one-hot encoded during fit).
        # The 0..3 suffixes presumably index the most recent auction
        # history slots — TODO confirm against the data-prep code.
        self.categorical_features = [
            'cardtype', 'limited_allowed', 'is_locked', 'is_bidomatic',
            'is_bidomatic0', 'is_bidomatic1', 'is_bidomatic2', 'is_bidomatic3'
        ]
        # Columns treated as numeric (imputed with -1 during fit).
        self.numeric_features = [
            'bid', 'cashvalue', 'bidvalue', 'prevusers', 'bids_so_far0',
            'perc_to_bin0', 'bom_bids_so_far0', 'bom_streak0',
            'prev_is_new_user0', 'prev_auction_count0', 'prev_overbid0',
            'prev_giveup_one0', 'prev_give_before_six0', 'prev_wins0',
            'prev_bids0', 'prev_bom_bids0', 'distance1', 'bids_so_far1',
            'perc_to_bin1', 'bom_bids_so_far1', 'bom_streak1',
            'prev_is_new_user1', 'prev_auction_count1', 'prev_overbid1',
            'prev_giveup_one1', 'prev_give_before_six1', 'prev_wins1',
            'prev_bids1', 'prev_bom_bids1', 'distance2', 'bids_so_far2',
            'perc_to_bin2', 'bom_bids_so_far2', 'bom_streak2',
            'prev_is_new_user2', 'prev_auction_count2', 'prev_overbid2',
            'prev_giveup_one2', 'prev_give_before_six2', 'prev_wins2',
            'prev_bids2', 'prev_bom_bids2', 'distance3', 'bids_so_far3',
            'perc_to_bin3', 'bom_bids_so_far3', 'bom_streak3',
            'prev_is_new_user3', 'prev_auction_count3', 'prev_overbid3',
            'prev_giveup_one3', 'prev_give_before_six3', 'prev_wins3',
            'prev_bids3', 'prev_bom_bids3', 'is_weekend', 'time_of_day'
        ]

    def get_features_as_string(self):
        """
        Return all feature names (categorical then numeric) as a single
        comma-separated string.

        Parameters:
            None
        Returns:
            str: comma-joined feature column names.
        """
        return ",".join(self.categorical_features + self.numeric_features)

    def get_column_names_from_ColumnTransformer(self, column_transformer):
        """
        Recover the post-transformation output column names from a fitted
        ColumnTransformer (e.g. the expanded one-hot column names).

        Parameters:
            column_transformer: a fitted sklearn ColumnTransformer.
        Returns:
            list(str): output column names, in transformer order.
        """
        col_name = []
        for transformer_in_columns in column_transformer.transformers_[:
                                                                       -1]:  #the last transformer is ColumnTransformer's 'remainder'
            raw_col_name = transformer_in_columns[2]
            # For a Pipeline, the name-generating transformer is its last step
            # (e.g. the OneHotEncoder at the end of categorical_transformer).
            if isinstance(transformer_in_columns[1], Pipeline):
                transformer = transformer_in_columns[1].steps[-1][1]
            else:
                transformer = transformer_in_columns[1]
            try:
                names = transformer.get_feature_names(
                    self.categorical_features)
            except AttributeError:  # if no 'get_feature_names' function, use raw column name
                names = raw_col_name
            # Normalize to a flat list of strings regardless of what the
            # transformer returned (ndarray, list, or single string).
            if isinstance(names, np.ndarray):  # e.g. OneHotEncoder output
                col_name += names.tolist()
            elif isinstance(names, list):
                col_name += names
            elif isinstance(names, str):
                col_name.append(names)
        return col_name

    def transform(self, X):
        """
        Feature-engineer a copy of X, leaving the caller's frame untouched.

        Parameters:
            X (DataFrame): raw auction rows.
        Returns:
            DataFrame: a new, transformed copy of X.
        """

        rX = X.copy()
        return self.transform_no_copy(rX)

    def transform_no_copy(self, X):
        """
        Feature-engineer X in place (mutates the argument) and return it.

        Adds 'fee', 'time_of_day' and 'is_weekend' columns and coerces the
        is_bidomatic0..3 flags to strings so they one-hot encode cleanly.

        Parameters:
            X (DataFrame): raw auction rows; modified in place.
        Returns:
            DataFrame: the same object, transformed.
        """

        # Stringify so the OneHotEncoder treats these as categorical levels.
        X.is_bidomatic0 = X.is_bidomatic0.astype(str)
        X.is_bidomatic1 = X.is_bidomatic1.astype(str)
        X.is_bidomatic2 = X.is_bidomatic2.astype(str)
        X.is_bidomatic3 = X.is_bidomatic3.astype(str)

        # Tiered card fee: free cards cost 0, cards under 50 cost 1,
        # otherwise 1.99. NOTE(review): units (dollars?) — confirm.
        X["fee"] = [
            0 if x == 0 else (1 if x < 50 else 1.99) for x in X["cardvalue"]
        ]
        X["time_of_day"] = [x.hour for x in X["auctiontime"]]
        # NOTE(review): weekday() >= 6 is True only for Sunday (Mon=0);
        # if "weekend" should include Saturday this wants >= 5 — confirm.
        X["is_weekend"] = [x.weekday() >= 6 for x in X["auctiontime"]]
        return X

    def internal_fit(self, X, y):
        """
        Build the preprocessing + undersampling + classifier pipeline and
        fit self.model on (X, y). Also records the population counts needed
        later by calibrate_probabilties.

        Parameters:
            X (DataFrame): transformed feature rows.
            y (array-like): binary target (1 = auction ends).
        Returns:
            None
        """
        # Population bookkeeping used to undo the undersampling bias:
        # after RandomUnderSampler(sampling_strategy=ratio), the majority
        # class is reduced to target_pop / ratio, so the sampled training
        # set is target_pop / ratio + target_pop rows.
        self.train_pop = X.shape[0]
        self.target_pop = sum(y)
        self.sampled_train_pop = self.target_pop / self.sampling_ratio + self.target_pop
        self.sampled_target_pop = self.target_pop

        # Numerics: impute missing with the sentinel -1.
        # NOTE(review): scaler step is commented out, so use_scaler has no
        # effect here — confirm whether it should toggle this step.
        numeric_transformer = Pipeline_imb(
            steps=[('imputer',
                    SimpleImputer(strategy='constant', fill_value=-1))
                   #     ('scaler', StandardScaler())
                   ])
        # Categoricals: impute missing with 'unknown', then one-hot encode,
        # dropping the first level of each feature.
        categorical_transformer = Pipeline_imb(
            steps=[('imputer',
                    SimpleImputer(strategy='constant', fill_value='unknown')),
                   ('onehot',
                    OneHotEncoder(handle_unknown='error', drop='first'))])
        preprocessor = ColumnTransformer(
            transformers=[('num', numeric_transformer, self.numeric_features),
                          ('cat', categorical_transformer,
                           self.categorical_features)])
        # imblearn pipeline so the sampler runs only at fit time.
        steps = [('preprocessor', preprocessor)]
        steps.append(
            ('sampler',
             RandomUnderSampler(sampling_strategy=self.sampling_ratio)))
        steps.append(('classifier', self.model))

        self.pipeline = Pipeline_imb(steps=steps)

        print("4. Fitting model")
        self.pipeline.fit(X, y)

    def fit_already_transformed(self, X, y):
        """
        Fit on X that has already been feature-engineered (skips transform).

        Parameters:
            X (DataFrame): already-transformed feature rows.
            y (array-like): binary target.
        Returns:
            None
        """
        self.internal_fit(X, y)

    def fit_transform(self, X, y):
        """
        Feature-engineer X in place, then fit the pipeline on it.

        Parameters:
            X (DataFrame): raw feature rows; modified in place.
            y (array-like): binary target.
        Returns:
            None
        """
        self.transform_no_copy(X)
        self.internal_fit(X, y)

    def pickle(self, filename):
        """
        Serialize this whole model object to `filename` with pickle.

        NOTE(review): the log line hardcodes "penny_auction.pickle" even
        though the file actually written is `filename` — confirm intent.

        Parameters:
            filename (str): path to write the pickle to.
        Returns:
            None
        """
        print("5. Pickling model as penny_auction.pickle")
        pickle.dump(self, open(filename, "wb"))

    def predict_proba(self, X):
        """
        Return the predicted probabilities that the auction will end, in the
        UNDERSAMPLED data set (i.e. biased by the sampling; see
        predict_proba_calibrated for corrected probabilities).

        Parameters:
            X (DataFrame): raw feature rows (transformed on a copy).
        Returns:
            ndarray: shape (n, 2) class-probability array from the pipeline.
        """
        return self.pipeline.predict_proba(self.transform(X))

    def predict_proba_calibrated(self, X):
        """
        Return probabilities from the model AFTER correcting for the
        undersampling bias.

        Parameters:
            X (DataFrame): raw feature rows.
        Returns:
            ndarray: calibrated class-probability array.
        """
        return self.calibrate_probabilties(self.predict_proba(X))

    def predict(self, X):
        """
        Binary prediction of whether or not the auction will end.

        Parameters:
            X (DataFrame): raw feature rows (transformed on a copy).
        Returns:
            ndarray: 0/1 predictions from the pipeline.
        """
        return self.pipeline.predict(self.transform(X))

    def get_feature_scores(self):
        """
        Return the classifier's feature importances, indexed by the
        post-transformation column names.

        Parameters:
            None
        Returns:
            pd.Series: importance per output column.
        """
        # steps[2] is the 'classifier' step (after preprocessor and sampler);
        # assumes the wrapped model exposes feature_importances_.
        return pd.Series(self.pipeline.steps[2][1].feature_importances_,
                         index=self.get_column_names_from_ColumnTransformer(
                             self.pipeline.named_steps['preprocessor']))

    def calibrate_probabilties(self, data):
        """
        Recalibrate probabilities to account for the undersampling, so a
        model output of e.g. 20% maps back to the true base rate
        (something like 1.2%).

        Uses the recorded original vs. sampled positive rates to rescale the
        odds — presumably the standard prior-correction formula for
        resampled training data; verify against the derivation.

        Parameters:
            data (ndarray): raw probabilities from the pipeline.
        Returns:
            ndarray: calibrated probabilities, same shape as `data`.
        """

        calibrated_data = \
        ((data * (self.target_pop / self.train_pop) / (self.sampled_target_pop / self.sampled_train_pop)) /
        ((
            (1 - data) * (1 - self.target_pop / self.train_pop) / (1 - self.sampled_target_pop / self.sampled_train_pop)
        ) +
        (
            data * (self.target_pop / self.train_pop) / (self.sampled_target_pop / self.sampled_train_pop)
        )))
        return calibrated_data

    def get_actual_and_potential_profits(self, X, y):
        """
        Return the per-row potential and actual profits over X.

        NOTE(review): the .4 (bid cost?) is subtracted outside the y-multiply
        in actual_profits but appears inside neither expression consistently —
        confirm the intended parenthesization.

        Parameters:
            X (DataFrame): transformed rows with cashvalue/fee/bid columns.
            y (array-like): binary outcomes (1 = win).
        Returns:
            (Series, Series): potential_profits, actual_profits.
        """
        potential_profits = (X.cashvalue - X.fee - X.bid / 100) - .4
        actual_profits = y * (X.cashvalue - X.fee - X.bid / 100) - .4
        return potential_profits, actual_profits

    def get_score(self, X, y):
        """
        Return the realized profit over the rows of X where the model's
        expected value of bidding is positive.

        Parameters:
            X (DataFrame): raw feature rows.
            y (array-like): binary outcomes.
        Returns:
            float: sum of actual profits on rows the model would bid on.
        """
        cprobs = self.predict_proba_calibrated(X)[:, 1]
        pp, ap = self.get_actual_and_potential_profits(X, y)
        # Expected value: win-prob * potential profit minus the .4 cost of
        # a losing bid.
        expected_value = np.multiply(cprobs, pp) - (1 - cprobs) * .4
        return sum(ap[expected_value > 0])
Exemple #44
0
def train_model(transactions_details):
    """
    Train and evaluate a fraud-detection pipeline on the given transactions.

    Splits the data 80/20, builds an imblearn pipeline
    (impute+scale numerics, one-hot categoricals -> SMOTE oversampling ->
    RandomForest), prints evaluation metrics, and plots ROC and PR curves.

    Parameters:
        transactions_details (DataFrame): rows with a binary 'fraudster'
            target column plus the feature columns listed below.
    Returns:
        Pipeline: the fitted model pipeline.
    """
    X = transactions_details.drop(columns='fraudster')
    y = transactions_details['fraudster'].copy()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    categorical_cols = [
        'currency', 'transaction_state', 'type', 'source', 'entry_method',
        'is_crypto', 'merchant_country', 'phone_country', 'user_country', 'kyc'
    ]
    numerical_cols = [
        'failed_sign_in_attempts', 'age', 'diff_in_days', 'amount_usd'
    ]

    # Pre-processing pipeline: impute + scale numerics, one-hot encode
    # categoricals (unknown categories at predict time are ignored).
    preprocess = make_column_transformer(
        (make_pipeline(SimpleImputer(), StandardScaler()), numerical_cols),
        (OneHotEncoder(handle_unknown='ignore'), categorical_cols))

    # imblearn Pipeline so SMOTE only resamples during fit, never at predict.
    model = Pipeline([('preprocess', preprocess),
                      ('sampling', SMOTE(random_state=42)),
                      ('classification', RandomForestClassifier())])

    # Fit model
    model.fit(X_train, y_train)

    # Hard-label predictions for the thresholded metrics.
    y_pred = model.predict(X_test)

    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification report:\n', classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

    # BUG FIX: ROC and precision-recall curves require continuous scores
    # (per sklearn's roc_curve/precision_recall_curve `y_score` parameter);
    # feeding the hard 0/1 predictions produced degenerate 3-point curves.
    # Use the positive-class probabilities throughout.
    probs = model.predict_proba(X_test)
    scores = probs[:, 1]

    # Create true and false positive rates
    false_positive_rate, true_positive_rate, threshold = roc_curve(
        y_test, scores)

    # Calculate Area Under the Receiver Operating Characteristic Curve
    roc_auc = roc_auc_score(y_test, scores)
    print('ROC AUC Score:', roc_auc)

    # Obtain precision and recall
    precision, recall, thresholds = precision_recall_curve(y_test, scores)

    # Calculate average precision
    average_precision = average_precision_score(y_test, scores)

    # Plot the roc curve
    plot_roc_curve(false_positive_rate, true_positive_rate, roc_auc)

    # Plot recall precision curve
    plot_pr_curve(recall, precision, average_precision)

    return model
class TargetEnsembler(object):
    """
    Ensemble that ANDs the predictions of several per-target pipelines and
    feeds them, plus a few raw PCL/PHQ columns, into a combined
    DecisionTreeClassifier.

    NOTE(review): this class reads many names not defined in view —
    the flags (intrusion, avoidance, hypertension, depression,
    only_avoidance, PCL_Strict3, regression_cutoff_33, regression_cutoff_50,
    tred_cutoff), the per-target matrices (X_intrusion, X_avoidance, ...)
    and the fitted pipelines (self.pipe_intrusion, self.pipe_avoidance, ...).
    They are presumably module-level globals / attributes set elsewhere in
    the file — verify before reuse.
    """

    def __init__(self, features):
        # Feature column names used to build each per-target design matrix.
        self.features = features

    def fit(self, X_train, y_train):
        """
        Score the per-target pipelines on the training data, print combined
        training metrics, then fit the combined decision-tree model on the
        per-target predictions plus selected raw columns.

        Parameters:
            X_train (DataFrame): training features; mutated — y_pred_*
                columns are appended below.
            y_train (array-like): binary training target.
        Returns:
            None
        """

        # create list of targets

        # self.pipelines_list = []
        # self.preds = []
        # for i in targets :
        #  x. feature engineering (i)
        # y = df[i]
        # cv_scores  (x, y, pipeline_per_target[i])
        # model = pipeline_per_target[i].train(x, y)
        # pipelines_list.append(model)
        # preds.append(model.pred(x))

        # y = df[y]
        # combined_model = LogReg.train(preds, y)
        # print results....

        # def pred(X):
        #
        # Each flag gates whether that target's pipeline contributes;
        # a disabled target contributes the neutral value 1 so it does not
        # affect the AND-combination below.
        if intrusion:
            y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        else:
            y_pred_intrusion = 1

        if avoidance:
            y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        else:
            y_pred_avoidance = 1

        if hypertension:
            y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        else:
            y_pred_hypertension = 1

        if depression:
            y_pred_depression = self.pipe_depression.predict(X_depression)
        else:
            y_pred_depression = 1

        if only_avoidance:
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance)
        else:
            y_pred_only_avoidance = 1

        if PCL_Strict3:
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        # Elementwise AND of all enabled targets: positive only when every
        # enabled pipeline predicts positive.
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)
        y_target = y_train

        # Training-set metrics of the AND-combined rule (not the tree below).
        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

        # combined: stack four per-target predictions as new columns and
        # train a decision tree on them plus selected raw questionnaire items.
        y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        y_pred_regression = self.pipe_regression.predict(X_regression)

        X_train["y_pred_hypertension"] = y_pred_hypertension
        X_train["y_pred_avoidance"] = y_pred_avoidance
        X_train["y_pred_intrusion"] = y_pred_intrusion
        X_train["y_pred_regression"] = y_pred_regression
        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]

        X_combined = X_train[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values
        y_combined = y_train
        self.pipe_combined = Pipeline(steps=[
            ('classifier', DecisionTreeClassifier())])
        # NOTE(review): the printout label says "hypertension" but these are
        # 5-fold CV precision scores of the combined model — confirm wording.
        scores = cross_val_score(self.pipe_combined, X_combined, y_combined, scoring='precision', cv=StratifiedKFold(5))
        print(f"hypertension {sum(scores)/5}")
        self.pipe_combined.fit(X_combined, y_combined)

    def predict(self, X_test):
        """
        Predict with the combined model on X_test.

        NOTE(review): the per-target predictions computed below feed only the
        intermediate AND (which is immediately overwritten), and the final
        X_test[... + preds] lookup expects y_pred_* COLUMNS that are never
        added to X_test (fit adds them to X_train only) — this will raise a
        KeyError as written; confirm the intended data flow.

        Parameters:
            X_test (DataFrame): test features.
        Returns:
            ndarray: combined-model predictions.
        """

        if intrusion:
            X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features],
                                                         "intrusion_cutoff").engineer_features().values
            y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff)
        else:
            y_pred_intrusion = 1

        if avoidance:
            X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                         "avoidance_cutoff").engineer_features().values
            y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff)
        else:
            y_pred_avoidance = 1

        if hypertension:
            X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features],
                                                            "hypertention_cutoff").engineer_features().values
            y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff)
        else:
            y_pred_hypertension = 1

        if depression:
            X_test_depression_cutoff = FeatureEngineering(X_test[self.features],
                                                          "depression_cutoff").engineer_features().values
            y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff)
        else:
            y_pred_depression = 1

        if only_avoidance:
            X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                              "only_avoidance_cutoff").engineer_features().values

            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff)
        else:
            y_pred_only_avoidance = 1

        if PCL_Strict3:
            X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_33").engineer_features().values
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_50").engineer_features().values
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        # AND-combined rule (overwritten by the combined model's prediction
        # below — see the method NOTE).
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)

        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]

        X_combined = X_test[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values

        y_pred = self.pipe_combined.predict(X_combined)
        return y_pred