def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)
    assert_array_almost_equal(y_trans, y_trans2)
    assert_array_almost_equal(y_trans, y_trans3)

    pca = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus)])

    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(y_trans, y_trans2)
def test_predict_with_predict_params():
    # tests that Pipeline passes predict_params to the final estimator
    # when predict is invoked
    pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())])
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)
    assert pipe.named_steps['clf'].got_attribute
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    pipe = assert_raises(TypeError, Pipeline, [('svc', IncorrectT)])
    # Smoke test with only an estimator
    clf = T()
    pipe = Pipeline([('svc', clf)])
    assert_equal(
        pipe.get_params(deep=True),
        dict(
            svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
def test_fit_predict_with_intermediate_fit_params():
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())])
    pipe.fit_predict(
        X=None, y=None, transf__should_get_this=True, clf__should_succeed=True)
    assert pipe.named_steps['transf'].fit_params['should_get_this']
    assert pipe.named_steps['clf'].successful
    assert 'should_succeed' not in pipe.named_steps['transf'].fit_params
def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
    pipe = Pipeline([('cls', LinearRegression())])
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake='nope')

    # nested model check
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake__estimator='nope')
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert_true(pipe.predict(None))
    # and transformer params should not be changed
    assert_true(pipe.named_steps['transf'].a is None)
    assert_true(pipe.named_steps['transf'].b is None)
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
def test_pipeline_fit_transform():
    # Test whether pipeline works with a transformer missing fit_transform
    iris = load_iris()
    X = iris.data
    y = iris.target
    transft = TransfT()
    pipeline = Pipeline([('mock', transft)])

    # test fit_transform:
    X_trans = pipeline.fit_transform(X, y)
    X_trans2 = transft.fit(X, y).transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline(
        [('transf', DummyTransf()), ('svc', SVC(gamma='scale'))],
        memory=memory)
    error_regex = ("string or have the same interface as")
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(X, y)
def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
                           memory=memory)
    error_regex = ("'memory' should either be a string or a joblib.Memory"
                   " instance, got 'memory=1' instead.")
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(X, y)
def test_pipeline_sample_transform():
    # Test whether pipeline works with a sampler placed between two
    # transformers, and that pipeline.transform works in that configuration.
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pca = PCA()
    pca2 = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus), ('pca2', pca2)])

    pipeline.fit(X, y).transform(X)
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([('scaler', scaler), ('Kmeans', km)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    iris = load_iris()
    X = iris.data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
# Example #14
    def illigal_genralization_checking(self, X_test, y_test):

        X = self.df[self.features]
        X_test = X_test[self.features]
        Y = self.df[self.target]
        pipe = Pipeline(steps=[('classifier', XGBClassifier(n_estimators=1000, scale_pos_weight=3, reg_alpha=1))])
        y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))
        scores = cross_val_score(pipe, X, Y, scoring='precision', cv=StratifiedKFold(5))
        print(self.features)
        print("cross vl scores")
        print(sum(scores)/5)
        pipe.fit(X, Y.values)
        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(X, sample_weight=np.array([2, 3]))
def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(X_trans, X_trans3, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans3, rtol=R_TOL)

    pca = PCA()
    pipeline = Pipeline([('pca', PCA()),
                         ('rus', rus)])

    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    # Round values that are numerically close to zero down to exactly zero;
    # PCA can introduce tiny floating-point noise there.
    X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0
    X_trans2[np.bitwise_and(X_trans2 < R_TOL, X_trans2 > -R_TOL)] = 0
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps['transf'].a is None
    assert pipe.named_steps['transf'].b is None
    # invalid parameters should raise an error message
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
# pre-process them accordingly.
categorical_features = X.loc[:, X.dtypes == 'object'].columns
numeric_features = X.loc[:, (X.dtypes == 'float64') |
                         (X.columns == 'age')].columns
indicator_features = X.loc[:, (X.dtypes == 'int64') &
                           (X.columns != 'age')].columns

# For numeric features we first imputed missing data using the median. We chose
# the median over the mean because the data is not yet normalized; it may be
# skewed and contain outliers, which would bias a mean estimate.
# After imputing missing data, we applied a quantile transformer. This
# transformation is robust to outliers and maps the variables to a normal
# distribution with a similar range.

numeric_transformer = Pipeline(steps=[(
    'imputer', SimpleImputer(strategy='median')
), ('scaler',
    QuantileTransformer(output_distribution='normal', random_state=0))])

# For categorical features we imputed missing data with the most frequent value
# of the column. After that we encoded these variables with the Bayesian
# LeaveOneOutEncoder, chosen because our categorical variables have high cardinality.

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('leaveoneout', LeaveOneOutEncoder(return_df=False))])

# For indicator variables we imputed missing data with 0, as they only take
# the values 0 and 1 (1 meaning the event occurred).
indicator_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])
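
# A minimal sketch of how the three per-type pipelines above could be combined
# with a ColumnTransformer before a downstream estimator. The 'preprocessor'
# name and the LogisticRegression classifier are assumptions for illustration,
# not part of the original snippet.
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('ind', indicator_transformer, indicator_features),
])

# Hypothetical full model: per-column-group preprocessing followed by a classifier.
model = Pipeline(steps=[('preprocess', preprocessor),
                        ('clf', LogisticRegression(max_iter=1000))])
# model.fit(X_train, y_train) would fit the imputers/encoders per column group.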
    def train(self, data: pd.DataFrame) -> Pipeline:
        """
        Return the best fitted estimator, that is, the one that maximizes the average_precision
        :yield: the cross-validation report for every classifier
        :return: an estimator of type Pipeline
        """

        if not self.classifiers:
            raise RuntimeError(
                "A classifier is missing. Please set DefectPredictor.classifiers = ['choice'] to select a classifier for training."
            )

        X, y = prepare_training_data(data)
        releases = X.group.tolist()
        X = X.drop(['group'], axis=1)

        scoring = dict(roc_auc='roc_auc',
                       average_precision='average_precision',
                       accuracy='accuracy',
                       balanced_accuracy='balanced_accuracy',
                       precision='precision',
                       recall='recall',
                       f1='f1',
                       mcc=make_scorer(matthews_corrcoef))

        for classifier in self.classifiers:
            estimator = classifiers_map[classifier]

            pipe = Pipeline([
                ('variance',
                 VarianceThreshold(threshold=0)),  # Remove constant features
                (
                    'balancing', None
                ),  # To balance the training data See search_params['balancing'] below)
                (
                    'normalization', None
                ),  # To scale (and center) data. See search_params['normalization'] below
                # TODO feature_selection here
                ('classification', estimator)
            ])

            search_params = search_params_map[classifier]

            if self.balancers:
                search_params['balancing'] = self.balancers

            if self.normalizers:
                search_params['normalization'] = self.normalizers

            search = RandomizedSearchCV(pipe,
                                        search_params,
                                        cv=walk_forward_release(
                                            X, y, releases),
                                        scoring=scoring,
                                        refit='average_precision',
                                        verbose=self._verbose)

            search.fit(X, y)

            # Add additional metadata to the cv_results
            search.cv_results_['best_index_'] = search.best_index_

            buffer = io.StringIO()
            pd.DataFrame(search.cv_results_).to_json(buffer,
                                                     orient='table',
                                                     index=False)
            self.cv_report_map[classifier] = json.loads(buffer.getvalue())

            # Get the highest average_precision for this randomized search
            local_best_average_precision = search.cv_results_[
                'mean_test_average_precision'][search.best_index_]

            if (not self.best_estimator) or (
                    local_best_average_precision >
                    self.best_estimator_average_precision):
                self.cv_report_map['best_classifier'] = classifier
                self.best_estimator = search.best_estimator_
                selected_features_indices = self.best_estimator.named_steps[
                    'variance'].fit(X).get_support(indices=True)
                self.selected_features = X.iloc[:,
                                                selected_features_indices].columns.tolist(
                                                )

        return self.best_estimator
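
# A self-contained sketch of the step-swapping pattern used in train() above:
# the 'balancing' and 'normalization' placeholder steps are filled in by the
# hyper-parameter search, which receives candidate estimators as parameter
# values. The synthetic data, the candidate samplers/scaler, and the decision
# tree are assumptions for illustration; the original code draws its candidates
# from classifiers_map and the balancers/normalizers attributes.
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=0)

demo_pipe = Pipeline([('variance', VarianceThreshold(threshold=0)),
                      ('balancing', None),
                      ('normalization', None),
                      ('classification', DecisionTreeClassifier(random_state=0))])

demo_params = {
    'balancing': [None, RandomUnderSampler(random_state=0), SMOTE(random_state=0)],
    'normalization': [None, StandardScaler()],
    'classification__max_depth': [3, 5, None],
}

demo_search = RandomizedSearchCV(demo_pipe, demo_params, n_iter=5, cv=3,
                                 scoring='average_precision', random_state=0)
demo_search.fit(X_demo, y_demo)
print(demo_search.best_params_)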
def test_pipeline_methods_rus_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    # Test with PCA + SVC
    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_set_pipeline_step_passthrough(passthrough):
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=passthrough)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    expected_params = {
        "steps": pipeline.steps,
        "m2": mult2,
        "m3": passthrough,
        "last": mult5,
        "memory": None,
        "m2__mult": 2,
        "last__mult": 5,
        "verbose": False,
    }
    assert pipeline.get_params(deep=True) == expected_params

    pipeline.set_params(m2=passthrough)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = [
        "predict_proba",
        "predict_log_proba",
        "decision_function",
        "transform",
        "score",
    ]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=passthrough)
    # mult2 and mult3 are active
    exp = 6
    pipeline.fit(X, y)
    pipeline.transform(X)
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    with raises(AttributeError, match="has no attribute 'predict'"):
        getattr(pipeline, "predict")

    # Check 'passthrough' step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = 'Last step of Pipeline should implement fit. .*NoFit.*'
    with raises(TypeError, match=error_regex):
        Pipeline([('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = 'implement fit and transform or sample'
    with raises(TypeError, match=error_regex):
        Pipeline([('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps['svc'] is not pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
# Example #26
__all__ = ["rf_pipeline", "xgb_pipeline"]

# pipeline base steps definition
base_steps = [
    (
        "filter_dep",
        CategorySelector(variable=cfg.ZONE_VAR, category=cfg.SELECTED_DEP),
    ),
    (
        "add_lags",
        LagTransformer(
            date_column=cfg.DATE_VAR,
            zone_column=cfg.ZONE_VAR,
            columns=cfg.LAG_ERA5T_VARS,
        ),
    ),
    ("imputer", Imputer(columns=cfg.MODEL_ERA5T_VARS, strategy="median")),
    ("binarize_target", TargetDiscretizer(discretizer=discretizer)),
    ("subset_features", FeatureSubsetter(columns=cfg.MODEL_ERA5T_VARS)),
]

# Add estimator to base step lists
xgb_steps = [*base_steps, ("xgboost", XGBClassifier(**cfg.XGB_PARAMS))]
rf_steps = [
    *base_steps, ("random_forest", RandomForestClassifier(**cfg.RF_PARAMS))
]

# Define sklearn / imblearn pipelines
xgb_pipeline = Pipeline(xgb_steps)
rf_pipeline = Pipeline(rf_steps)
# Example #27
    print("Before", counter)
    x_train = np.reshape(x_train, [x_train.shape[0], 40 * 40 * 3])
    steps = []
    if (smote_val != -1):
        print("Applying SMOTE with value", smote_val)
        smote = SMOTE(sampling_strategy=smote_val)
        steps.append(('o', smote))
    if (oversample_val != -1):
        print("Applying oversampling with value", oversample_val)
        oversample = RandomOverSampler(sampling_strategy=oversample_val)
        steps.append(('over', oversample))  # distinct step name, avoids clashing with the SMOTE step
    if (undersample_val != -1):
        print("Applying undersampling with value", undersample_val)
        undersample = RandomUnderSampler(sampling_strategy=undersample_val)
        steps.append(('u', undersample))
    pipeline = Pipeline(steps=steps)
    #x_train, y_train = pipeline.fit_resample(x_train, y_train)
    counter = Counter(y_train)
    print("After", counter)
    x_train = np.reshape(x_train, [x_train.shape[0], 40, 40, 3])

    # initialize output bias
    neg, pos = np.bincount(y_train)
    output_bias = np.log(pos / neg)
    output_bias = keras.initializers.Constant(output_bias)
    print("Positive Class Counter:", pos)
    print("Negative Class Counter:", neg)

    # output weights
    weight_for_0 = (1 / neg) * (neg + pos) / 2.0
    weight_for_1 = (1 / pos) * (neg + pos) / 2.0
# Example #28
import numpy as np
from sklearn.svm import SVC
from hyperopt import hp
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import OneSidedSelection

from config import random_seed
from utils.python_utils import quniform_int

steps = [('undersampler', OneSidedSelection(random_state=random_seed)),
         ('SVC',
          SVC(C=1, kernel='linear', random_state=random_seed,
              probability=True))]
model = Pipeline(steps=steps)

params_space = {'SVC__C': hp.quniform('C', 1, 100, 5)}
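
# A minimal sketch of consuming params_space with hyperopt (the original source
# does not show this part). The synthetic data, the 3-fold CV, and the small
# max_evals are assumptions for illustration. fmin minimizes, so the objective
# returns the negated mean ROC AUC of the pipeline defined above.
from hyperopt import fmin, tpe
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1],
                                     random_state=random_seed)

def objective(params):
    # params maps 'SVC__C' to a sampled value, matching the step name above
    model.set_params(**params)
    return -cross_val_score(model, X_demo, y_demo, cv=3, scoring='roc_auc').mean()

best = fmin(fn=objective, space=params_space, algo=tpe.suggest, max_evals=10)
print(best)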
# Example #29
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Normalizer

classifier = SVC()
#classifier = MLPClassifier()
#classifier =  DecisionTreeClassifier()
#classifier = KNeighborsClassifier()

resampler = SMOTE(random_state=22)

estimator = Pipeline([
    #('Normalizer', Normalizer()),
    #('resample', resampler),
    #('feature_selection', SelectKBest(f_classif, k = 5)),
    ('classification', classifier)
])

kfoldcv = StratifiedKFold(n_splits=4)
mean_acc = cross_val_score(estimator=estimator, X=X, y=y, cv=kfoldcv)
#permutation_test_score(estimator=estimator, X=X, y=y, cv=kfoldcv)

results = cross_validate(estimator=estimator,
                         X=X,
                         y=y,
                         cv=kfoldcv,
                         return_estimator=True)
best_estimator = results['estimator'][-1]

#X_test = X = np.array([alphas[0]+alphas[1], betas[0]+betas[1], thetas[0]+thetas[1]]).T
# Example #30
                          n_estimators=150,
                          min_samples_leaf=10,
                          criterion='mse',
                          max_features=None,
                          max_depth=None),
}

print(
    "Execution               Model                 RMSE   R2     Pearson Correlation / p-value Spearman Correlation / p-value"
)
# For each base learner...
for learner_name, learner in base_learners.items():

    # build pipeline
    steps = [('scale', StandardScaler()), ('learner', learner)]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(exec_training_features, exec_training_target)

    # prediction
    predicted = pipeline.predict(exec_test_features)

    # evaluation
    r2 = r2_score(exec_test_target, predicted)
    rmse = mean_squared_error(exec_test_target, predicted)
    pearson_corr, pvalue = stats.pearsonr(exec_test_target, predicted)
    spearmanr, spr_pvalue = stats.spearmanr(exec_test_target, predicted)

    output.write("{},{},{:.3f},{:.3f},{:.3f},{:.3f}\n".format(
        EXECUTION_NAME, learner_name, rmse, r2, pearson_corr, spearmanr))
    print("{}     {:20s}  {:.3f}  {:.3f}  {:.3f}/{:.5f}  {:.3f}/{:.5f}".format(
        EXECUTION_NAME, learner_name, rmse, r2, pearson_corr, pvalue,
# Example #31
# df = df.drop(columns=["Num1"])
imp.fit(df)
vars = df.columns[range(len(df.columns) - 1)]
df = imp.transform(df)
# df = pd.DataFrame(vals, columns=vars)
# df = df[["WBC0", "Plt0", "Mg0", "Age", "Ca0", "BMI", "Na0", "P0", "HB0", "AST0", "PH0", "ALT0", "CRP0_Quantitative", "HeartFailure0", "Nausea0", "WeaknessFatigue0", "Cough0", "K0", "PR0", "Cr0",
#          "COVID19_outcome"]]
X = np.round(df[:, range(0, df.shape[1] - 1)])
Y = np.round(df[:, targetIndex])

# remove label -1
mask = Y != -1
Y = Y[mask]
X = X[mask, :]

base_estimator = Pipeline([('scaler', StandardScaler()),
                           ('model', LogisticRegression(penalty='l2'))])

selector = StabilitySelection(base_estimator=base_estimator,
                              lambda_name='model__C',
                              lambda_grid=np.logspace(-5, -1, 50)).fit(X, Y)
fig, ax = plot_stability_path(selector, vars=vars)
fig.show()

selected_variables = selector.get_support(indices=True)
selected_scores = selector.stability_scores_.max(axis=1)

pd.DataFrame({
    "selectedVars": vars[selected_variables],
    "score": selected_scores[selected_variables]
}).to_excel("stabilityFeatureSelection.xlsx")
# print(selector.get_support(indices=True))
# Example #32
    def score(self, pipeline: Pipeline, dataset: Dataset, class_names: List[str] = None):
        """
        Computes scores for the metrics provided in the Scorer constructor. If y_true is multiclass, the
        scorer uses macro averaging for precision/recall/f1. If y_true is multilabel, macro averaging is used as well.

        Parameters
        ----------
        pipeline: Pipeline
            Complete pipeline including features pipeline and classifier.
        dataset: Dataset
            Dataset containing x and y pd.DataFrames
            For Multiclass the shape of y_true should be 1-D (NOT one-hot encoded).
            For Multilabel the shape should be n-dimensional (where n is number of classes).
        class_names: List of strings, optional
            If given, the scores for separate classes will be displayed with appropriate names.

        Returns
        -------
        metrics: Dict
            Dictionary with metrics' names as keys and scores as values
        """

        x, y_true = dataset.x, dataset.y.to_numpy()

        if len(y_true.shape) == 1:
            y_true = y_true.reshape(-1, 1)

        # run inference for probabilities
        probabilities = pipeline.predict_proba(x)

        # check if the output of inference is a list (sklearn models often output probabilities in this form
        # in case of multilabel task). If yes, convert to a single array.
        if isinstance(probabilities, list):
            probabilities = convert_list_probas_to_array(probabilities)

        # check task type based on the true and predicted arrays.
        self.task = check_task(y_true, probabilities)

        # turn probabilities into predictions with chosen threshold
        if self.task in ['binary', 'multiclass']:
            predictions = np.argmax(probabilities, axis=-1)
        else:
            predictions = np.where(probabilities >= self.threshold, 1, 0)

        # assign number of classes based on given array
        self.n_classes = probabilities.shape[-1]

        # assign names of classes
        self.class_names = class_names if class_names else [f'class_{i}' for i in range(self.n_classes)]

        # check if any of ['precision', 'recall', 'f1', 'accuracy'] are in the metrics.
        # if yes generate classification report - it calculates all of these metrics.
        # it does not calculate accuracy for multilabel problem so additional check is done in such case.
        if [metric for metric in ['precision', 'recall', 'f1', 'accuracy'] if metric in self.metrics]:
            self.scores_dict.update(classification_report(y_true, predictions,
                                                          target_names=self.class_names, output_dict=True))
            if self.task == 'multilabel':
                self.scores_dict['accuracy'] = accuracy_score(y_true, predictions)

        if 'auc' in self.metrics:
            self.fpr, self.tpr, self.roc_auc_dict = calculate_roc_auc(y_true, probabilities,
                                                                      self.class_names, self.task)
            for key, value in self.roc_auc_dict.items():
                self.scores_dict[key]['auc'] = value

        if self.report:
            print(pd.DataFrame(self.scores_dict).transpose())

        return self._get_metrics()
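
# A small self-contained illustration of the probability-to-prediction rule used
# in score() above (the 0.5 cut-off stands in for self.threshold, which is an
# assumption about the constructor default).
import numpy as np

# Multiclass: one row of class probabilities per sample; argmax picks the class.
probas_multiclass = np.array([[0.7, 0.2, 0.1],
                              [0.1, 0.8, 0.1],
                              [0.2, 0.3, 0.5]])
print(np.argmax(probas_multiclass, axis=-1))        # -> [0 1 2]

# Multilabel: each column is an independent label, thresholded separately.
probas_multilabel = np.array([[0.9, 0.4],
                              [0.2, 0.6]])
print(np.where(probas_multilabel >= 0.5, 1, 0))     # -> [[1 0] [0 1]]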
def RepeatedSampling(X_train, Y_train, X_test, Y_test, classifier, sampling, sampling_params, no_seeds):
    """
    Repeated sampling experiments for algorithms with randomized sampling.
    
    Considered Performance Criteria:
    - F2-Score
    - Balanced Accuracy
    - Precision
    - Recall
    
    Inputs:
    X_train = features of the training data (must be a pd.DataFrame)
    Y_train = outcome of the training data (must be a pd.Series)

    X_test = features of the test data (must be a pd.DataFrame)
    Y_test = outcome of the test data (must be a pd.Series)
    
    classifier = model, e.g. BaggingClassifier()
    
    sampling = sampling object, e.g. RandomOverSampler()
    sampling_params = parameters for the sampling object
    
    no_seeds = number of experiments to be executed/number of different seeds to be considered.
    """  
    
    seeds = random.sample(range(1, 10000), no_seeds)
    i=0
    
    test_frames = []

    for seed in seeds:
        sampling_new = sampling(**sampling_params, random_state = seeds[i])
        pipe = Pipeline(steps=[('s', sampling_new), ('m', classifier)])
        
        test_performance = pd.DataFrame(np.zeros((1,4)), columns = list(['F2-Score', 'bacc', 'Precision', 'Recall']))
    
        pipe.fit(X_train, Y_train)
        Y_pred = pipe.predict(X_test)

        f2              = metrics.fbeta_score(Y_test, Y_pred, beta = 2)
        bacc            = metrics.balanced_accuracy_score(Y_test, Y_pred)
        precision       = metrics.precision_score(Y_test, Y_pred)
        recall          = metrics.recall_score(Y_test, Y_pred)
    
        test_performance.iloc[0,0] = f2
        test_performance.iloc[0,1] = bacc
        test_performance.iloc[0,2] = precision
        test_performance.iloc[0,3] = recall
        
        test_frames.append(test_performance)
        i = i+1
        
    test_table = pd.concat(test_frames)
    
    f2_vals = test_table['F2-Score']
    bacc_vals = test_table['bacc']
    precision_vals = test_table['Precision']
    recall_vals = test_table['Recall']
    
    final_performance = pd.DataFrame(np.zeros((1,8)), columns = list(['F2-Score MEAN', 'F2-Score STD', 'bacc MEAN', 'bacc STD', 'Precision MEAN', 'Precision STD', 'Recall MEAN', 'Recall STD']))
    
    final_performance.iloc[0,0] = np.mean(f2_vals)
    final_performance.iloc[0,2] = np.mean(bacc_vals)
    final_performance.iloc[0,4] = np.mean(precision_vals)
    final_performance.iloc[0,6] = np.mean(recall_vals)
    
    final_performance.iloc[0,1] = np.std(f2_vals)
    final_performance.iloc[0,3] = np.std(bacc_vals)
    final_performance.iloc[0,5] = np.std(precision_vals)
    final_performance.iloc[0,7] = np.std(recall_vals)
    
    final_performance = round(final_performance, 3)
    
    return final_performance
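
# A minimal usage sketch for RepeatedSampling above; the synthetic data, the
# BaggingClassifier, and the RandomOverSampler arguments are assumptions for
# illustration, not taken from the original source.
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

X_demo, y_demo = make_classification(n_samples=2000, weights=[0.9, 0.1],
                                     random_state=0)
X_tr, X_te, Y_tr, Y_te = train_test_split(pd.DataFrame(X_demo), pd.Series(y_demo),
                                          stratify=y_demo, random_state=0)

summary = RepeatedSampling(X_tr, Y_tr, X_te, Y_te,
                           classifier=BaggingClassifier(random_state=0),
                           sampling=RandomOverSampler,
                           sampling_params={'sampling_strategy': 'minority'},
                           no_seeds=5)
print(summary)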
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X))
        assert_array_equal(
            pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)
        )
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_methods_rus_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([('rus', rus), ('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
# Example #36
def main():

    # 'LR', 'DT', 'SVC', 'LSTM', 'NN', # 'MLP', 'CNN', 'LSTM', 'ConvLSTM', 'CNNLSTM', 'EncodeDecodeLSTMs'
    models = ['RF']
    targets = ['DOcategory', 'pHcategory', 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'window_nuggets': 'window_nuggets', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'std_test_score': 'std_test_score', 'mean_test_score': 'mean_test_score', 'params': 'params', 'bestscore': 'bestscore', 'F1_0': 'F1_0', 'F1_1': 'F1_1', 'P_0': 'P_0', 'P_1': 'P_1', 'R_0': 'R_0', 'R_1': 'R_1', 'acc0_1': 'acc0_1', 'F1_0_1': 'F1_0_1', 'F1_all': 'F1_all', 'fbeta': 'fbeta', 'imfeatures': 'imfeatures'}
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'window_nuggets': 'window_nuggets', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'std_test_score': 'std_test_score', 'mean_test_score': 'mean_test_score', 'params': 'params', 'bestscore': 'bestscore', 'mape': 'mape', 'me': 'me', 'mae': 'mae', 'mpe': 'mpe', 'rmse': 'rmse',  'R2': 'R2', 'imfeatures': 'imfeatures'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directory+resultFileName,
                            index=False, header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'OrgData'

            for n_steps in [1, 3, 6, 12]:  #
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    files = [f for f in os.listdir(path) if f.endswith(
                        '.csv') and f.startswith(sondefilename)]
                    file = files[0]
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)

                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    print(train_X_grid[0:1])

                    if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                        train_y_grid = to_categorical(train_y_grid, 3)
                    if model_name == 'LSTM' or model_name == 'NN':
                        n_job = 1

                    start_time = time.time()
                    model = func.algofind(model_name, input_dim, n_steps, cat)

                    if cat == 1:
                        metric = make_scorer(f2_measure)
                    else:
                        metric = make_scorer(R2_measure)

                    # cat_ix = train_X_grid[:, 7:]
                    # print(cat_ix[0:2])
                    # num_ix = train_X_grid[:, : 7]
                    # print(num_ix[0:2])
                    # one hot encode categorical, normalize numerical
                    # ct = ColumnTransformer(
                    #     [('c', OneHotEncoder(), cat_ix), ('n', StandardScaler(), num_ix)])

                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])

                    else:  # model_name == 'LSTM' or model_name == 'NN':
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])

                    # else:
                    #     pipeline = Pipeline(
                    #         steps=[('transforms', ct), ('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)

                    if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                        gs = RandomizedSearchCV(
                            estimator=pipeline, param_distributions=func.param_grid['param_grid_'+model_name+str(cat)], n_iter=20, cv=custom_cv, verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid,
                                     model__class_weight={0: 1, 1: 50, 2: 100})
                    elif cat == 0 and (model_name == 'LSTM' or model_name == 'NN'):
                        gs = RandomizedSearchCV(
                            estimator=pipeline, param_distributions=func.param_grid['param_grid_'+model_name+str(cat)], n_iter=20, cv=custom_cv, verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid)
                    else:
                        gs = RandomizedSearchCV(
                            estimator=pipeline, param_distributions=func.param_grid['param_grid_'+model_name+str(cat)], n_iter=20, scoring=metric, cv=custom_cv, verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    i = 1
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        predictions = clf.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                            test_y = argmax(test_y, axis=1)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory+fpath+'.jpg')

                        plt.close()

                        data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)

                        df = pd.DataFrame(data=data)

                        df.to_csv(directory+fpath, index=False)

                        if cat == 1:
                            data = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath, 'std_test_score': [test_std], 'mean_test_score': [test_Score], 'params': [clf.best_params_], 'bestscore': [clf.best_score_], 'F1_0': cm0[0], 'F1_1': cm0[1], 'P_0': cm0[2], 'P_1': cm0[3], 'R_0': cm0[4], 'R_1': cm0[5], 'acc0_1': cm0[6], 'F1_0_1': cm0[7], 'F1_all': cm0[8], 'fbeta': [cm0[9]], 'imfeatures': [clf.best_estimator_]}
                        elif cat == 0:
                            data = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath, 'std_test_score': [test_std], 'mean_test_score': [test_Score], 'params': [clf.best_params_], 'bestscore': [clf.best_score_], 'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2], 'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5], 'imfeatures': [clf.best_estimator_]}

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory+resultFileName,
                                  index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                        i = i+1
# Example #37
def main():
    import time
    start = time.time()
    with open('./pkl/X.pkl', 'rb') as fh:  # Load data set
        X = dill.load(fh)
    with open('./pkl/y.pkl', 'rb') as fh:
        y = dill.load(fh)
    scaler = Normalizer()
    smote_etomek = SMOTETomek(ratio='auto')
    cachedir = mkdtemp()
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    classifier = XGBClassifier()

    # A parameter grid for XGBoost
    params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [1, 3, 4, 5, 10],
    }
    pipeline = Pipeline([
        ('scaler', scaler),
        ('smt', smote_etomek),
        ('clf', classifier),
    ],
                        memory=cachedir)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[
            test_index]  # make training and test set
        y_train, y_test = y[train_index], y[test_index]

        clf = dasksearchCV(classifier,
                           params,
                           n_jobs=8,
                           cv=3,
                           scoring='roc_auc',
                           refit=True)

        clf.fit(X_train, y_train)
        print(clf.best_params_)
        print(clf.best_score_)
        best_parameters, score = clf.best_params_, clf.best_score_
        print('Raw AUC score:', score)
        for param_name in sorted(best_parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))
        classifier = XGBClassifier(**best_parameters, n_jobs=-1)
        plot_cross_validation(
            cv, X_train, y_train,
            pipeline)  # do 5 fold stratified cross-validation
        clf = pipeline.fit(X_train, y_train)  #

        print(classifier.get_params())
        expected = y_test
        predicted = clf.predict(X_test)  # test performance on test set
        plot_confusion_matrix(confusion_matrix(expected, predicted),
                              classes=["Non-Zika", "Zika"])
    print(time.time() - start)
    from sklearn import metrics
    print("Classification report for classifier %s:\n%s\n" %
          (clf, metrics.classification_report(expected, predicted)))
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    expected_params = {'steps': pipeline.steps,
                       'm2': mult2,
                       'm3': None,
                       'last': mult5,
                       'memory': None,
                       'm2__mult': 2,
                       'last__mult': 5}
    assert pipeline.get_params(deep=True) == expected_params

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ['predict_proba', 'predict_log_proba',
                     'decision_function', 'transform', 'score']
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    pipeline.fit(X, y)
    pipeline.transform(X)
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    with raises(AttributeError, match="has no attribute 'predict'"):
        getattr(pipeline, 'predict')

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    scores.iloc[0, 3] = np.std(scores_temp['test_bacc'])
    scores.iloc[0, 4] = np.mean(scores_temp['test_Precision'])
    scores.iloc[0, 5] = np.std(scores_temp['test_Precision'])
    scores.iloc[0, 6] = np.mean(scores_temp['test_Recall'])
    scores.iloc[0, 7] = np.std(scores_temp['test_Recall'])

    scores = np.round(scores, 3)

    return scores


############### DEFINE COMBINATIONS VIA PIPELINES (FOR LATEX TABLE) ###############

models = []

pip_BAG0 = Pipeline(steps=[('m', model_BAG)])
models.append(pip_BAG0)
pip_WBAG = Pipeline(steps=[('m', model_WBAG)])
models.append(pip_WBAG)

pip_BAGROS = Pipeline(steps=[('s', ROS1), ('m', model_BAG)])
models.append(pip_BAGROS)
pip_BAGSMOTE = Pipeline(steps=[('s', SMOTE1), ('m', model_BAG)])
models.append(pip_BAGSMOTE)
pip_BAGADASYN = Pipeline(steps=[('s', ADASYN1), ('m', model_BAG)])
models.append(pip_BAGADASYN)

pip_BAGRUS = Pipeline(steps=[('s', RUS1), ('m', model_BAG)])
models.append(pip_BAGRUS)
pip_BAGENN = Pipeline(steps=[('s', ENN1), ('m', model_BAG)])
models.append(pip_BAGENN)
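
# Hypothetical usage of the `models` list above for the LaTeX table: score each
# pipeline with an evaluation helper such as the `evaluate_model` sketch earlier
# (that helper, and the training data `X`, `y`, are assumptions and are not
# defined in this excerpt), then concatenate the one-row tables:
#
#     results = pd.concat([evaluate_model(pipe, X, y, name=str(i))
#                          for i, pipe in enumerate(models)],
#                         ignore_index=True)
#     print(results.to_latex(index=False))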
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X))
        assert_array_equal(
            pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)
        )
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
    def build_model_pipeline(self):
        from imblearn.pipeline import Pipeline
        model = Pipeline([('clf', LightGbmClassifier.model)])
        return model
def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)
    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([('mock', transf1)])
    assert pipeline.named_steps['mock'] is transf1

    # Directly setting attr
    pipeline.steps = [('mock2', transf2)]
    assert 'mock' not in pipeline.named_steps
    assert pipeline.named_steps['mock2'] is transf2
    assert [('mock2', transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[('mock', transf1)])
    assert [('mock', transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [('mock', transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[('junk', ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
    def fit(self, X_train, y_train):

        # create list of targets

        # self.pipelines_list = []
        # self.preds = []
        # for i in targets :
        #  x. feature engineering (i)
        # y = df[i]
        # cv_scores  (x, y, pipeline_per_target[i])
        # model = pipeline_per_target[i].train(x, y)
        # pipelines_list.append(model)
        # preds.append(model.pred(x))

        # y = df[y]
        # combined_model = LogReg.train(preds, y)
        # print results....

        # def pred(X):
        #
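        # NOTE: the boolean flags (intrusion, avoidance, ...) and the per-target
        # feature matrices (X_intrusion, X_avoidance, ...) used below are assumed
        # to be prepared earlier in the original module; they are not defined in
        # this excerpt.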
        if intrusion:
            y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        else:
            y_pred_intrusion = 1

        if avoidance:
            y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        else:
            y_pred_avoidance = 1

        if hypertension:
            y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        else:
            y_pred_hypertension = 1

        if depression:
            y_pred_depression = self.pipe_depression.predict(X_depression)
        else:
            y_pred_depression = 1

        if only_avoidance:
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance)
        else:
            y_pred_only_avoidance = 1

        if PCL_Strict3:
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

        # combined
        y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        y_pred_regression = self.pipe_regression.predict(X_regression)

        X_train["y_pred_hypertension"] = y_pred_hypertension
        X_train["y_pred_avoidance"] = y_pred_avoidance
        X_train["y_pred_intrusion"] = y_pred_intrusion
        X_train["y_pred_regression"] = y_pred_regression
        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]

        X_combined = X_train[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values
        y_combined = y_train
        self.pipe_combined = Pipeline(steps=[
            ('classifier', DecisionTreeClassifier())])
        scores = cross_val_score(self.pipe_combined, X_combined, y_combined, scoring='precision', cv=StratifiedKFold(5))
        print(f"hypertension {sum(scores)/5}")
        self.pipe_combined.fit(X_combined, y_combined)
X = pd.DataFrame(X, columns=['X1', 'X2', 'Run Date'])
cv = None
n_jobs = 1
# BUILD THE TRANSFORMATION PORTION OF THE ML PIPELINE
preprocess = PreProcessPipeline(
    imputer_method=imputer_method,
    normalize_method=normalize_method,
    resample_method=resample_method,
    cols_to_remove=X.columns.to_list().index('Run Date'))

steps = preprocess.get_steps()

# GET THE ESTIMATOR OBJECT AND APPEND IT AS THE FINAL STEP IN THE PIPELINE
model = get_classifier_model(model_name, params=None)
steps.append(('model', model))

print(steps)

# INITIALIZE THE PIPELINE
pipe = Pipeline(steps)
clf = CalibratedClassifierCV(pipe, cv=cv, method='isotonic', n_jobs=n_jobs)

print(clf)

clf.fit(X, y)

joblib.dump(clf, 'test_pipeline.joblib')

clf = joblib.load('test_pipeline.joblib')
clf.predict(X)
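
# `PreProcessPipeline` and `get_classifier_model` above are project-specific and
# are not defined in this excerpt. A minimal sketch of the kind of step list
# `get_steps()` could return (step names and estimators are illustrative
# assumptions only):
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


def get_steps_sketch():
    # (name, estimator) tuples forming the transformation part of the pipeline;
    # the classifier step is appended afterwards, as in the script above.
    return [('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('resample', SMOTE(random_state=0))]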
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(gamma='scale', probability=True, random_state=0)
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression(solver="lbfgs")
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("rus", rus), ("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_param_error():
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(
        ValueError,
        match="Pipeline.fit does not accept the sample_weight parameter",
    ):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])


parameter_grid_test_verbose = (
    (est, pattern, method)
    for (est, pattern), method in itertools.product(
        [
            (
                Pipeline([("transf", Transf()), ("clf", FitParamT())]),
                r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline([("transf", Transf()), ("noop", None), ("clf", FitParamT())]),
                r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n"
                r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n"
                r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$",
            ),
            (
                Pipeline(
                    [
                        ("transf", Transf()),
                        ("noop", "passthrough"),
                        ("clf", FitParamT()),
def f(colname, criteria):
    task = Classify()
    X, y = task.process_data(colname, data)
    print(X.columns, X.shape)
    task.summary()
    model_rf = Pipeline([
        ('random', RandomOverSampler(random_state=1)),
        # ('sampling', SMOTE()),
        ('classification', RandomForestClassifier(random_state=1))
    ])
    parameters_rf = {
        'classification__max_depth': [2, 5, 10],
        'classification__n_estimators': [50, 100, 200, 500, 1000],
        'classification__min_samples_split': [2]
    }

    model_log = Pipeline([
        ('random', RandomOverSampler(random_state=1)),
        # ('sampling', SMOTE()),
        ('classification',
         LogisticRegression(multi_class='ovr',
                            random_state=1,
                            fit_intercept=False))
    ])
    parameters_log = [{
        'classification__penalty': ['l2'],
        'classification__solver': ['lbfgs'],
        'classification__C': list(range(1, 11))
    }, {
        'classification__penalty': ['l1'],
        'classification__solver': ['liblinear'],
        'classification__C': list(range(1, 11))
    }]
    model_gdbt = Pipeline([
        ('random', RandomOverSampler(random_state=1)),
        # ('sampling', SMOTE()),
        ('classification', boosting(random_state=1))
    ])
    parameters_gdbt = {
        'classification__max_depth': [2, 5],
        'classification__n_estimators': [100, 500, 1000],
        'classification__min_samples_split': [2, 4, 6],
        'classification__learning_rate': [0.01, 0.1]
    }
    # X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=30, stratify=y)

    clf_log, best_parameters, best_score_log = task.classify(
        parameters_log, model_log,
        criteria)  #training_process(X, y, parameters_log, model_log)
    clf_rf, best_parameters, best_score_rf = task.classify(
        parameters_rf, model_rf,
        criteria)  # training_process(X, y, parameters_rf, model_rf)
    # clf_gdbt, best_parameters, best_score_gdbt = task.classify(parameters_gdbt, model_gdbt, criteria) # training_process(X, y, parameters_gdbt, model_gdbt)

    # prediction
    y_rf, y_log = clf_rf.predict(X), clf_log.predict(X)
    # y_gdbt = clf_gdbt.predict(X)
    res = {
        # 'gdbt': {'best_score': best_score_gdbt, 'balanced_acc': balanced_accuracy_score(y, y_gdbt), 'acc': accuracy_score(y, y_gdbt)},
        'rf': {
            'best_score': best_score_rf,
            'balanced_acc': balanced_accuracy_score(y, y_rf),
            'acc': accuracy_score(y, y_rf)
        },
        'log': {
            'best_score': best_score_log,
            'balanced_acc': balanced_accuracy_score(y, y_log),
            'acc': accuracy_score(y, y_log)
        }
    }
    '''
    clf_log, best_parameters, best_score_log, bacc_log, acc_log = task.classify(parameters_log, model_log, criteria) #training_process(X, y, parameters_log, model_log)
    clf_rf, best_parameters, best_score_rf, bacc_rf, acc_rf = task.classify(parameters_rf, model_rf, criteria) # training_process(X, y, parameters_rf, model_rf)
    clf_gdbt, best_parameters, best_score_gdbt, bacc_gdbt, acc_gdbt = task.classify(parameters_gdbt, model_gdbt, criteria) # training_process(X, y, parameters_gdbt, model_gdbt)
    res = {
        'gdbt': {'best_score': best_score_gdbt, 
               'balanced_acc': bacc_gdbt,
               'acc': acc_gdbt},
        'rf': {'best_score': best_score_rf, 
               'balanced_acc': bacc_rf, 
               'acc': acc_rf},
        'log': {'best_score': best_score_log, 
               'balanced_acc': bacc_log, 
               'acc': acc_log}
          }
    '''
    # print(best_score_gdbt, best_score_rf, best_score_log)
    return clf_log, clf_rf, res  #clf_gdbt, res
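
# `Classify.classify` above is project-specific and not shown in this excerpt.
# A minimal sketch of what such a method could do (hypothetical class; it
# assumes the processed data is stored on `self.X` / `self.y` and that
# `criteria` is a scikit-learn scoring name):
from sklearn.model_selection import GridSearchCV, StratifiedKFold


class ClassifySketch:
    def __init__(self, X, y):
        self.X, self.y = X, y

    def classify(self, parameters, model, criteria):
        # Grid-search the (sampler + classifier) pipeline and refit on the best
        # parameters according to the requested scoring criterion.
        search = GridSearchCV(model, parameters, scoring=criteria,
                              cv=StratifiedKFold(5), n_jobs=-1)
        search.fit(self.X, self.y)
        return search.best_estimator_, search.best_params_, search.best_score_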
ratio_list = [0.042, 0.111, 0.333, 0.538, 1]
percentage_list = [4, 10, 25, 35, 50]

# This loop executes the oversampling strategy (in this case SMOTE) for all the ratios that were tested.
for ratio, percentage in zip(ratio_list, percentage_list):
    # Create a train-test split where the ratio of target class is maintained
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=47,
                                                        stratify=y)
    #Initialize a SMOTE sampler with ratio that will be tested
    over = SMOTE(sampling_strategy=ratio)
    #Initialize a pipeline (One can add extra steps here if required)
    steps = [('o', over)]
    pipeline = Pipeline(steps)
    #Resample data
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')
    #Train an xg_boost model with resampled data
    xgb = xg_boost(x_res, y_res, x_test, y_test, f"SMOTE_{percentage}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

#List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []

# def calculate_running_times():
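
# A minimal sketch of the timing experiment described in the comments above
# (hypothetical helper based only on the commented-out names; the original
# implementation is not included in this excerpt):
import time


def calculate_running_times_sketch(X, y, subset_list, sampler):
    # Resample growing sub-samples and record the wall-clock time of
    # fit_resample for each sub-sample size.
    times_subsetsize_list = []
    for size in subset_list:
        start = time.time()
        sampler.fit_resample(X[:size], y[:size])
        times_subsetsize_list.append((size, time.time() - start))
    return times_subsetsize_list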
def extract_categorical_visualize_graphs(data_input_path="04_Model" + "/" +
                                         "prepared_input.pickle",
                                         top_percentage=0.2):
    '''
    Of the results of a wide search algorithm, find the top x percent (default=20%), calculate the median value for
    each parameter and select the parameter value with the best median result.

    Visualize results of the search algorithm.

    :args:
        data_input_path: Path to the pickle with the complete results of the run
        top_percentage: Top share of results to consider. Default: 0.2.
    :return:
        Nothing
    '''

    # Get necessary data from the data preparation
    r = open(data_input_path, "rb")
    prepared_data = pickle.load(r)

    model_name = prepared_data['paths']['dataset_name']
    model_directory = prepared_data['paths']['model_directory']
    results_file_path = prepared_data['paths']['svm_run1_result_filename']
    refit_scorer_name = prepared_data['refit_scorer_name']
    selected_features = prepared_data['selected_features']
    feature_dict = prepared_data['feature_dict']
    X_train = prepared_data['X_train']
    svm_pipe_first_selection = prepared_data['paths'][
        'svm_pipe_first_selection']

    s = open(results_file_path, "rb")
    results = pickle.load(s)
    results_run1 = results['result']
    params_run1 = results['parameter']

    #Create result table
    #merged_params_run1 = {}
    #for d in params_run1:
    #    merged_params_run1.update(d)

    # Get the top x% values from the results
    # number of results to consider
    #top_percentage = 0.2
    number_results = int(results_run1.shape[0] * top_percentage)
    print("The top {}% of the results are used, i.e. {} samples".format(
        top_percentage * 100, number_results))
    results_subset = results_run1.iloc[0:number_results, :]

    ## %% Plot graphs

    # Prepare the inputs: Replace the lists with strings
    result_subset_copy = results_subset.copy()
    print("Convert feature lists to names")
    sup.list_to_name(selected_features, list(feature_dict.keys()),
                     result_subset_copy['param_feat__cols'])

    # Replace lists in the parameters with strings
    params_run1_copy = copy.deepcopy(params_run1)
    sup.replace_lists_in_grid_search_params_with_strings(
        selected_features, feature_dict, params_run1_copy)

    # Plot the graphs
    _, scaler_medians = vis.visualize_parameter_grid_search(
        'scaler',
        params_run1,
        results_subset,
        refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)
    _, sampler_medians = vis.visualize_parameter_grid_search(
        'sampling',
        params_run1,
        results_subset,
        refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)
    _, kernel_medians = vis.visualize_parameter_grid_search(
        'svm__kernel',
        params_run1,
        results_subset,
        refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)
    _, feat_cols_medians = vis.visualize_parameter_grid_search(
        'feat__cols',
        params_run1_copy,
        result_subset_copy,
        refit_scorer_name,
        save_fig_prefix=model_directory + '/images/' + model_name)

    ## Get the best parameters

    # Get the best scaler from median
    best_scaler = max(scaler_medians, key=scaler_medians.get)
    print("Best scaler: ", best_scaler)
    best_sampler = max(sampler_medians, key=sampler_medians.get)
    print("Best sampler: ", best_sampler)
    best_kernel = max(kernel_medians, key=kernel_medians.get)
    print("Best kernel: ", best_kernel)

    # Get the best feature combination from the medians
    best_feat_cols = max(feat_cols_medians,
                         key=feat_cols_medians.get)  # source.idxmax()
    # print("Best {}: {}".format(name, best_feature_combi))
    best_columns = feature_dict.get(best_feat_cols)

    print("Best feature selection: ", best_feat_cols)
    print(
        "Best column indices: ", best_columns
    )  # feature_dict.get((results_run1[result_columns_run1].loc[indexList]['param_feat__cols'].iloc[best_feature_combi])))
    print("Best column names: ", list(X_train.columns[best_columns]))

    # Define pipeline, which is constant for all tests
    pipe_run_best_first_selection = Pipeline([
        ('scaler', best_scaler), ('sampling', best_sampler),
        ('feat', modelutil.ColumnExtractor(cols=best_columns)),
        ('svm', SVC(kernel=best_kernel))
    ])

    display(pipe_run_best_first_selection)

    # Save best pipe
    dump(pipe_run_best_first_selection, open(svm_pipe_first_selection, 'wb'))
    print("Stored pipe_run_best_first_selection at ", svm_pipe_first_selection)

    print("Method end")
    def fit(self, X_train, y_train):

        predictions_list = []

        for target in self.targets_list:
            if self.use_feature_engineering:
                X = FeatureEngineering(X_train[self.features], target).engineer_features().values
            else:
                X = X_train[self.features].values

            if target == "PCL_Strict3":
                y = y_train[target].apply(lambda x: int(x))
            else:
                y = X_train[target].apply(lambda x: int(x))

            pipeline = pipeline_per_target[target]
            scores = cross_val_score(pipeline, X, y, scoring='f1', cv=StratifiedKFold(5))
            print(f"{target} - {sum(scores)/len(scores)}")

            if self.train_on_partial_prediction:
                combined_y = pd.DataFrame(y, columns=[target])
                if target != "PCL_Strict3":
                    combined_y["PCL_Strict3"] = y_train["PCL_Strict3"].apply(lambda x: int(x))

                _X_train, _X_test, _y_train, _y_test = \
                    train_test_split(X, combined_y, test_size=0.25)
                self.trained_pipelines[target] = pipeline.fit(_X_train, _y_train[target])
                y_pred = self.trained_pipelines[target].predict(_X_test)
                predictions_list.append(y_pred)
                print("test f1", target, f1_score(_y_test[target], y_pred))
                self.trained_pipelines[target] = pipeline.fit(X, y)
                y = _y_test["PCL_Strict3"]
            else:
                self.trained_pipelines[target] = pipeline.fit(X, y)
                predictions_list.append([self.trained_pipelines[target].predict(X)])
                y = y_train["PCL_Strict3"]

            if self.check_on_test_set:
                if target == "PCL_Strict3":
                    y_test = self.y_test[target].apply(lambda x: int(x))
                else:
                    y_test = X_train[target].apply(lambda x: int(x))
                if self.use_feature_engineering:
                    X_test = FeatureEngineering(self.X_test[self.features], target).engineer_features().values
                else:
                    X_test = self.X_test[self.features].values

                model = self.trained_pipelines[target]
                y_pred = model.predict(X_test)
                s_f = f1_score(y_test, y_pred)
                s_p = precision_score(y_test, y_pred)
                s_r = recall_score(y_test, y_pred)
                print(f"test f1 {target}", s_f)
                print(f"test recall {target}", s_r)
                print(f"test precision {target}", s_p)

        #pipe = Pipeline(steps=[
        #    ('scaling', StandardScaler()),
        #    ('sampling', SMOTE()),
        #    ('classifier', LogisticRegression(penalty='l1'))])
        #c = ((len(y) - sum(y)) / sum(y))

        if not self.use_and_func:
            c = 10
            pipe = Pipeline(steps=[('feature_selection',
                                    RFE(XGBClassifier(n_estimators=10, scale_pos_weight=c))),
                                   ('clf', XGBClassifier(scale_pos_weight=c))])
            X = predictions_list
            self.combined_model = pipe.fit(np.array(X).reshape(-1, len(predictions_list)), y)
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = 'Last step of Pipeline should implement fit. .*NoFit.*'
    with raises(TypeError, match=error_regex):
        Pipeline([('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None,
                    svc__b=None,
                    svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC(gamma='scale')
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = 'implement fit and transform or sample'
    with raises(TypeError, match=error_regex):
        Pipeline([('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps['svc'] is not pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([("mock", transf1)])
    assert pipeline.named_steps["mock"] is transf1

    # Directly setting attr
    pipeline.steps = [("mock2", transf2)]
    assert "mock" not in pipeline.named_steps
    assert pipeline.named_steps["mock2"] is transf2
    assert [("mock2", transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[("mock", transf1)])
    assert [("mock", transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [("mock", transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[("junk", ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression(solver='lbfgs', multi_class='auto')
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)