Example No. 1
def test_pipeline_methods_rus_pca_svm():
    # Test the various methods of the pipeline (rus + pca + svm).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    # Test with PCA + SVC
    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 2
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 3
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(gamma='scale', probability=True, random_state=0)
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 4
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression(solver="lbfgs", multi_class="auto")
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 5
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 6
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    pipe = Pipeline([("pca", pca), ("svc", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 7
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 8
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 9
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples, ))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example No. 10
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example No. 11
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (rus + anova).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 12
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver="randomized", whiten=True)
    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples, )

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
Example No. 13
def Predict(data, mode):
    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    train_query = list(
        train.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    train_title = list(
        train.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    test_query = list(
        test.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    test_title = list(
        test.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    # NOTE: the second assignment below discards the HTML/number stop words built first.
    stop_words = text.ENGLISH_STOP_WORDS.union(
        ['http', 'www', 'img', 'border', 'color', 'style', 'padding', 'table', 'font',
         'thi', 'inch', 'ha', 'width', 'height',
         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(min_df=7, max_features=None, strip_accents='unicode',
                               analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3),
                               use_idf=True, smooth_idf=True, sublinear_tf=True,
                               stop_words=stop_words)

    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    if mode == 'eda':
        svd = TruncatedSVD(n_components=200)
        scl = StandardScaler(with_mean=False)
        svm = SVC(C=10,
                  gamma="auto",
                  kernel="rbf",
                  class_weight=None,
                  probability=True)
        clf = Pipeline([('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
                        ('scl', scl),
                        ('svm', svm)])
    elif mode == 'sampling':
        svd = TruncatedSVD(n_components=200)
        scl = StandardScaler(with_mean=False)
        svm = SVC(C=10,
                  gamma="auto",
                  kernel="rbf",
                  class_weight=None,
                  probability=True)
        sampling = SVMSMOTE(svm_estimator=svm, k_neighbors=4)
        clf = Pipeline([('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
                        ('scl', scl),
                        ('sampling', sampling),
                        ('svm', svm)])
    else:
        raise ValueError("mode must be 'eda' or 'sampling'")

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)

    return submission, submission_probas
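`similarlity_stack` above is project-specific and not shown. A minimal sketch of a FeatureUnion-compatible stand-in, assuming it emits a single query-title cosine-similarity feature from the stacked TF-IDF matrix; the class name and behavior here are guesses, not the original implementation:

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class SimilarityStack(BaseEstimator, TransformerMixin):
    # The left half of X is the query TF-IDF block, the right half the title block,
    # matching the hstack([query, title]) layout above.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = sparse.csr_matrix(X)
        n = X.shape[1] // 2
        q, t = X[:, :n], X[:, n:]
        num = np.asarray(q.multiply(t).sum(axis=1)).ravel()
        denom = (np.sqrt(np.asarray(q.multiply(q).sum(axis=1)).ravel()) *
                 np.sqrt(np.asarray(t.multiply(t).sum(axis=1)).ravel()))
        sims = num / np.maximum(denom, 1e-12)  # per-row cosine similarity
        return sims.reshape(-1, 1)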
Example No. 14
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (rus + anova).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 15
def find_expert(tag):
    """
    输出话题标签[TAG]下模型预测的最有可能是潜在专家的20名用户
    """
    fold = StratifiedKFold(n_splits=4)
    params = best_solution(tag)
    data, target, ratio = load_data(tag)
    fold.random_state = int(params['seed'])
    samp = ADASYN(n_neighbors=2,
                  sampling_strategy=float(params['sampling_strategy']) * ratio,
                  random_state=int(params['seed']))
    clf = XGBClassifier(n_estimators=int(params['n_estimators']),
                        gamma=float(params['gamma']),
                        eta=float(params['eta']),
                        reg_lambda=int(params['reg_lambda']),
                        verbosity=0,
                        n_jobs=-1,
                        random_state=int(params['seed']))
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])
    experts = pd.DataFrame(columns=['id', 'probability'])
    for _, (train, test) in tqdm(enumerate(fold.split(data, target)), total=4):
        pipeline.fit(data.iloc[train], target.iloc[train])
        pred_proba = pd.Series(pipeline.predict_proba(data.iloc[test])[:, 1],
                               index=target.iloc[test].index,
                               name='probability')
        experts = experts.append(pred_proba.to_frame().reset_index())
    experts = experts.sort_values(by=['probability'],
                                  ascending=False).iloc[:20]
    experts['probability'] = experts['probability'].astype(float).map(
        "{:.1%}".format)
    print(experts.to_string(index=False))
Example No. 16
def test_pipeline_methods_rus_pca_svm():
    # Test the various methods of the pipeline (rus + pca + svm).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([('rus', rus), ('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example No. 17
def train_test_and_evaluate(seed, X_train, X_test, y_train, y_test, onehots,
                            numericals, cv):

    # define transformer for features that are one hot encoded and pre-select features using chi square
    selector = SelectKBest(chi2, k=15)
    ohc = OneHotEncoder(handle_unknown='ignore')
    onehot_transformer = Pipeline(steps=[('selector', selector), ('ohc', ohc)])

    # define transformer for numerical features, scale features and shrink space using pca
    scaler = StandardScaler()
    pca = PCA()
    numeric_transformer = Pipeline(steps=[('scaler', scaler), ('pca', pca)])

    # define resampling: oversample minority class first and undersample majority class afterwards
    over = RandomOverSampler(random_state=seed, sampling_strategy=0.1)
    under = RandomUnderSampler(random_state=seed, sampling_strategy=0.5)

    # combine steps into preprocessing and machine learning pipeline using logistic regression
    preprocessor = ColumnTransformer(
        transformers=[('numeric', numeric_transformer, numericals),
                      ('onehot', onehot_transformer, onehots)])

    pipe_model = Pipeline(
        steps=[('over', over), ('under', under),
               ('prep', preprocessor),
               ('classifier', LogisticRegression(max_iter=1000))])

    # Cross-validate model on training data to estimate performance
    cv_results = 2 * roc_auc_score(
        y_train,
        cross_val_predict(
            pipe_model, X_train, y_train, cv=cv,
            method='predict_proba')[:, 1]) - 1
    print("Mean training gini after CV: {res}".format(res=cv_results))

    # Fit the model to training data and evaluate on test data and finally on evaluation data
    pipe_model.fit(X_train, y_train)
    y_true, y_pred = y_test, pipe_model.predict(X_test)
    gini = 2 * roc_auc_score(y_true,
                             pipe_model.predict_proba(X_test)[:, 1]) - 1
    print("Gini score on test set: " + str(gini))

    # X_val and y_val are assumed to come from the enclosing scope (final evaluation hold-out)
    y_true, y_pred = y_val, pipe_model.predict(X_val)
    gini = 2 * roc_auc_score(y_true, pipe_model.predict_proba(X_val)[:, 1]) - 1
    print("Gini score on validation set: " + str(gini))
Example No. 18
class ImblearnRecalibrator(BaseEstimator, ClassifierMixin):
    """
    imblearnのリサンプリングの偏りを再較正するやつ
    再較正のコードを毎回書きたくない. scikit-learnの設計思想に則りオブジェクト指向プログラミングをしよう
    estimator, resampler, サンプリング割合を指定したら後は fit & predict/predict_proba するだけ
    * 注意: 不均衡データに対するリサンプリングは分類性能を目的としているので判別性能等に効果があるかは知らない
    
    :param estimatror: scikit-learn API 準拠の estimator オブジェクト
    :param resampler: imblearn で使われる各種 resampler オブジェクト
    :param post_minor_rate: リサンプリング後の**全件に対する少数例の割合**を指定. default is None. alpha とどちらか片方を使う.
    :param alpha: **リサンプリング前に対する**事後の少数例の割合**を指定. default is 'auto'. post_minor_rate とどちらか片方を使う.
    """
    def __init__(self,
                 estimator,
                 resampler,
                 alpha='auto',
                 post_minor_rate=None):
        resampler = clone(resampler)
        if post_minor_rate is None and alpha is None:
            warnings.warn(
                'Neither `post_minor_rate` nor `alpha` is specified; the resampling '
                'strategy set on the `resampler` object is used instead.'
            )
        elif post_minor_rate and alpha:
            warnings.warn(
                'Both `post_minor_rate` and `alpha` are specified; the former is applied.'
            )
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif post_minor_rate:
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif alpha:
            self.alpha = alpha
            self.resampling_strategy = 'alpha'
            resampler.set_params(sampling_strategy=alpha)
        else:
            raise ValueError('initialization error: specify `post_minor_rate` or `alpha`')
        self.estimator_ = Pipeline([('resampler', resampler),
                                    ('estimator', clone(estimator))])

    def fit(self, X, y):
        if self.resampling_strategy == 'posterior_rate':
            alpha = get_oversampling_rate(self.post_minor_rate)
            self.alpha = alpha
            self.estimator_['resampler'].set_params(sampling_strategy=alpha)
        self.estimator_.fit(X, y)
        self.minor_rate_ = np.min([y.mean(), 1 - y.mean()])
        return self

    def predict(self, X):
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        return calibrate_imbalanceness(self.estimator_.predict_proba(X),
                                       pos_rate=get_oversampling_power(
                                           self.alpha, self.minor_rate_))
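A minimal usage sketch, assuming the class and its helper functions are importable from this module and using synthetic data:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
rec = ImblearnRecalibrator(
    estimator=LogisticRegression(),
    resampler=RandomUnderSampler(random_state=0),
    alpha=0.5,  # undersample to a 1:2 minority:majority mix
)
rec.fit(X, y)
proba = rec.predict_proba(X)  # probabilities recalibrated toward the original class prior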
Example No. 19
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example No. 20
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example No. 21
def get_probabilities(text: str, classes: list, model: Pipeline):
    """ Calculates probabilities of text belonging to each model's class.

    Parameters:
    ----------
    text:
        Text for analysis.
    classes:
        Models' classes.
    model:
        The trained model.

    Returns:
    ----------
        Dict mapping each class name to the probability that the text belongs to it.
    """
    probabilities = np.array(np.around(model.predict_proba([text])[0], 3),
                             dtype=float).flatten()
    probabilities = dict(zip(classes, probabilities))
    return probabilities
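A hedged usage sketch with a toy model; note that `classes` should match the order of `model.classes_` for the zip pairing to be correct:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

model = Pipeline([('tfidf', TfidfVectorizer()),
                  ('clf', LogisticRegression())])
model.fit(['a good movie', 'a bad movie'], ['pos', 'neg'])
print(get_probabilities('good movie', classes=list(model.classes_), model=model))
# e.g. {'neg': 0.48, 'pos': 0.52}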
Example No. 22
def model_select():
    for nome_balanceador, balanceador in balanceadores:
        if classificador_ja_executado(nome, nome_balanceador):
            continue
        else:
            print(balanceador)
            pipeline = Pipeline([('dimension', PCA(n_components=250)),
                                 ('balance', balanceador), ('clf', modelo)])
            print("# Rodando o algoritmo %s" % nome)
            print()

            np.set_printoptions(precision=4)
            pipeline.fit(dados_completo_x, dados_completo_y)

            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_pred = pipeline.predict(test_x)
            matriz_confusao = confusion_matrix(test_y, y_pred)
            nome_arquivo = nome + '_' + nome_balanceador + '_best_mucilage'
            plot_confusion_matrix(matriz_confusao,
                                  nome_arquivo, [1, 2, 3, 4],
                                  False,
                                  title='Confusion matrix ' + nome +
                                  ' (best parameters)')
            plot_confusion_matrix(matriz_confusao,
                                  nome_arquivo, [1, 2, 3, 4],
                                  True,
                                  title='Confusion matrix ' + nome +
                                  ', normalized')
            print('Confusion matrix')
            print(matriz_confusao)
            print(classification_report(y_true=test_y, y_pred=y_pred,
                                        digits=4))
            y_pred = pipeline.predict_proba(test_x)
            roc_auc_aux(test_y, y_pred, nome, nome_balanceador)
            print()
            sys.stdout.flush()
Example No. 23
def performance():
    """
    Analyze model performance.
    """
    tqdm.write("*" * 50 + "\n\tStackOverflow Expert Prediction\n" + "*" * 50)
    val = RepeatedStratifiedKFold(n_splits=3, n_repeats=2)
    y_real = [[], [], [], [], [], []]
    y_proba = [[], [], [], [], [], []]
    f_importance = []
    # Test each topic tag separately
    for tag in tqdm(get_tags()):
        params = best_solution(tag)  # fetch the best hyperparameters
        data, target, ratio = load_data(tag)  # load the data
        val.random_state = int(params['seed'])  # set the random seed
        # Build a pipeline of oversampler and classifier
        samp = ADASYN(n_neighbors=2,
                      sampling_strategy=float(params['sampling_strategy']) *
                      ratio,
                      random_state=int(params['seed']))
        clf = XGBClassifier(n_estimators=int(params['n_estimators']),
                            gamma=float(params['gamma']),
                            eta=float(params['eta']),
                            reg_lambda=int(params['reg_lambda']),
                            verbosity=0,
                            n_jobs=-1,
                            random_state=int(params['seed']))
        pipeline = Pipeline([(type(samp).__name__, samp),
                             (type(clf).__name__, clf)])
        # Test each cross-validation split separately
        for ind, (train, test) in tqdm(enumerate(val.split(data, target)),
                                       leave=False,
                                       total=6):
            pipeline.fit(data.iloc[train], target.iloc[train])
            y_real[ind].append(target.iloc[test])  # ground-truth labels
            y_proba[ind].append(pipeline.predict_proba(
                data.iloc[test])[:, 1])  # predicted probabilities
            f_importance.append(
                pipeline[type(clf).__name__].feature_importances_)  # feature importances
    display(y_real, y_proba, f_importance, data.columns)
Example No. 24
def run(fold, model):
    df = pd.read_csv('../input/train_fold.csv')
    df['bmi'] = df['bmi'].fillna(np.mean(df['bmi']))
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    features = [f for f in df.columns if f not in ('id', 'stroke', 'kfold')]
    categorical_features = [f for f in features if df[f].dtype == object]
    numerical_features = [f for f in features if df[f].dtype != object]
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    preprocess = make_column_transformer((OneHotEncoder(), categorical_features),
                                         (StandardScaler(), numerical_features))
    x_train = df_train[features]
    y_train = df_train.stroke
    x_valid = df_valid[features]
    y_valid = df_valid.stroke
    clf = models[model]
    steps = [('preprocess', preprocess), ('over', over), ('under', under), ('clf', clf)]
    pipe = Pipeline(steps=steps)
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict_proba(x_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)
    print("Fold: {} AUC Score: {:.3f}".format(fold, auc))
    joblib.dump(pipe, f'../models/dt_{model}_{fold}.bin')
Example No. 25
def ROC_curve(classifiers):
    table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr', 'auc'])
    for model in classifiers:
        model_name = type(model).__name__
        model.probability = True
        model = Pipeline([('sampling', SMOTE(sampling_strategy='minority')),
                          ('model', model)])
        model = model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        table = table.append(
            {
                'classifiers': model_name,
                'fpr': fpr,
                'tpr': tpr,
                'auc': auc
            },
            ignore_index=True)

    # Set name of the classifiers as index labels
    table.set_index('classifiers', inplace=True)
    for i in table.index:
        plt.plot(table.loc[i]['fpr'],
                 table.loc[i]['tpr'],
                 label="{}, AUC={:.3f}".format(i, table.loc[i]['auc']))

    plt.plot([0, 1], [0, 1], color='black', linestyle='--')
    plt.xticks(np.arange(0.0, 1.1, step=0.1))
    plt.xlabel("False Positive Rate", fontsize=15)
    plt.yticks(np.arange(0.0, 1.1, step=0.1))
    plt.ylabel("True Positive Rate", fontsize=15)
    plt.title('ROC Curves', fontweight='bold', fontsize=10)
    plt.legend(prop={'size': 9}, loc='lower right')
    plt.savefig(folder_plots + 'ROC_Curve.png', dpi=400)
    plt.show()
Example No. 26
#########################  FITTING THE MODEL AND PREDICTING ##############################
# we fit the model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

########################  METRICS  ######################################################
# we evaluate how well our model makes predictions

#              confusion matrix

# we can see how our model classified the items.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

#                 ROC CURVE

# we draw a roc curve and look at the auc score. A straight line indicates that the model
# is not classifying well. The closer to 1 the auc score is, the better the model.
import matplotlib.pyplot as plt

y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.legend(loc=4)
plt.show()

# We can also look at the report. We can see the f1 score for the event happening. We use
# it to compare the models.
print(metrics.classification_report(y_test, y_pred))
Example No. 27
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example No. 28
    def score(self, pipeline: Pipeline, dataset: Dataset, class_names: List[str] = None):
        """
        Computes scores for the metrics provided in the Scorer constructor. If y_true is
        multiclass, the scorer uses macro averaging for precision/recall/f1. If y_true is
        multilabel, the scorer likewise performs macro averaging.

        Parameters
        ----------
        pipeline: Pipeline
            Complete pipeline including features pipeline and classifier.
        dataset: Dataset
            Dataset containing x and y pd.DataFrames
            For Multiclass the shape of y_true should be 1-D (NOT one-hot encoded).
            For Multilabel the shape should be n-dimensional (where n is number of classes).
        class_names: List of strings, optional
            If given, the scores for separate classes will be displayed with appropriate names.

        Returns
        -------
        metrics: Dict
            Dictionary with metrics' names as keys and scores as values
        """

        x, y_true = dataset.x, dataset.y.to_numpy()

        if len(y_true.shape) == 1:
            y_true = y_true.reshape(-1, 1)

        # run inference for probabilities
        probabilities = pipeline.predict_proba(x)

        # check if the output of inference is a list (sklearn models often output probabilities in this form
        # in case of multilabel task). If yes, convert to a single array.
        if isinstance(probabilities, list):
            probabilities = convert_list_probas_to_array(probabilities)

        # check task type based on the true and predicted arrays.
        self.task = check_task(y_true, probabilities)

        # turn probabilities into predictions with chosen threshold
        if self.task in ['binary', 'multiclass']:
            predictions = np.argmax(probabilities, axis=-1)
        else:
            predictions = np.where(probabilities >= self.threshold, 1, 0)

        # assign number of classes based on given array
        self.n_classes = probabilities.shape[-1]

        # assign names of classes
        self.class_names = class_names if class_names else [f'class_{i}' for i in range(self.n_classes)]

        # check if any of ['precision', 'recall', 'f1', 'accuracy'] are in the metrics.
        # if yes generate classification report - it calculates all of these metrics.
        # it does not calculate accuracy for multilabel problem so additional check is done in such case.
        if [metric for metric in ['precision', 'recall', 'f1', 'accuracy'] if metric in self.metrics]:
            self.scores_dict.update(classification_report(y_true, predictions,
                                                          target_names=self.class_names, output_dict=True))
            if self.task == 'multilabel':
                self.scores_dict['accuracy'] = accuracy_score(y_true, predictions)

        if 'auc' in self.metrics:
            self.fpr, self.tpr, self.roc_auc_dict = calculate_roc_auc(y_true, probabilities,
                                                                      self.class_names, self.task)
            for key, value in self.roc_auc_dict.items():
                self.scores_dict[key]['auc'] = value

        if self.report:
            print(pd.DataFrame(self.scores_dict).transpose())

        return self._get_metrics()
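`convert_list_probas_to_array` above is project-internal and not shown. A plausible sketch, assuming the list form comes from estimators such as sklearn's MultiOutputClassifier, which returns one (n_samples, 2) probability array per label:

import numpy as np

def convert_list_probas_to_array(probas_list):
    # Stack the positive-class column of each per-label array into a
    # single (n_samples, n_labels) matrix.
    return np.stack([p[:, 1] for p in probas_list], axis=1)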
Example No. 29
    def mrmr_feature_selection(self, input, output, dict_of_models,
                               list_number_of_features_to_select):
        """
        Evaluates the given models within an N_outer-times repeated N_inner-fold
        cross-validation procedure, for different numbers of features selected by the mRMR
        algorithm, with nested 10-fold cross-validation for tuning model hyperparameters.
        ----------
        :param input : array-like, shape (n_samples, n_features)
            The training input samples.
        :param output : array-like, shape (n_samples, 1)
            The target values.
        :param dict_of_models: dictionary
            Models with details for grid-search.
        :param list_number_of_features_to_select - list
            Number of features to select.

        :return df_aucs : DataFrame object, shape (No_outer x No_inner, number of models x length of list_number_of_features)
            AUC values for every step of No_outer x No_inner-times CV are provided.
        :return df_res : DataFrame object, shape ([number of models x length of list_number_of_features_to_select], 9)
            For every model and every No. of selected features best classifier's parameters and averaged classification
            metrics are provided : Accuracy, Sensitivity, Specificity, Precision, F1-Score, AUC.
        :return df_stds : DataFrame object, shape ([number of models x length of list_number_of_features_to_select], 8)
            For every model and every No. of selected features standard deviations of classification metrics are provided.
        """

        df_res = pd.DataFrame(columns=[
            'Classifier', 'Selected features', 'Best parameters', 'Accuracy',
            'Sensitivity', 'Specificity', 'Precision', 'F1-score', 'ROC_AUC'
        ])

        df_stds = pd.DataFrame(columns=[
            'Classifier', 'Selected features', 'Acc_std', 'Sens_std',
            'Spec_std', 'Prec_std', 'F1_std', 'ROC_AUC_std'
        ])

        df_aucs = pd.DataFrame()

        for m in dict_of_models:
            for k in list_number_of_features_to_select:
                accuracy = []
                aucs = []
                sensitivity = []
                specificity = []
                precision = []
                f1score = []
                tprs = []
                params = []
                X, y = input, output
                skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                              n_repeats=self.N_outer,
                                              random_state=88)

                clf = m['classifier']

                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]

                    best_params = []

                    # MRMR
                    mrmr = MRMR(n_features=k)

                    mrmr_smote_clf = Pipeline([('oversampling',
                                                ADASYN(random_state=88)),
                                               ('feature_selection', mrmr),
                                               ('classifier', clf)])

                    mrmr_smote_clf.fit(X_train, y_train)
                    # best_params.append(gridsearch_cv.best_params_)

                    # predicted class
                    y_predict = mrmr_smote_clf.predict(X_test)

                    # predicted probabilities
                    probas_ = mrmr_smote_clf.predict_proba(X_test)

                    # confusion matrix
                    cm = confusion_matrix(y_test, y_predict)

                    # accuracy
                    acc = accuracy_score(y_predict, y_test)
                    accuracy.append(acc)

                    # sensitivity = recall
                    sens = recall_score(y_test, y_predict)
                    sensitivity.append(sens)

                    # specificity
                    spec = self.get_specificity(y_test, y_predict)
                    specificity.append(spec)

                    # precision
                    prec = precision_score(y_test, y_predict)
                    precision.append(prec)

                    # f1-score
                    f1 = f1_score(y_test, y_predict)
                    f1score.append(f1)

                    # Compute ROC curve and area the curve
                    fpr, tpr, thresholds = roc_curve(y[test_index], probas_[:, 1])
                    tprs.append(interp(self.mean_fprs, fpr, tpr))
                    tprs[-1][0] = 0.0
                    roc_auc = auc(fpr, tpr)
                    aucs.append(roc_auc)

                    # best parameters
                    params.append(best_params)

                df_aucs[m['name'] + str(k)] = aucs

                df_stds = df_stds.append(
                    {
                        'Classifier': m['name'],
                        'Selected features': k,
                        'Acc_std': np.std(accuracy),
                        'Sens_std': np.std(sensitivity),
                        'Spec_std': np.std(specificity),
                        'Prec_std': np.std(precision),
                        'F1_std': np.std(f1score),
                        'ROC_AUC_std': np.std(aucs)
                    },
                    ignore_index=True)

                df_res = df_res.append(
                    {
                        'Classifier': m['name'],
                        'Selected features': k,
                        'Best parameters': params,
                        'Accuracy': np.mean(accuracy),
                        'Sensitivity': np.mean(sensitivity),
                        'Specificity': np.mean(specificity),
                        'Precision': np.mean(precision),
                        'F1-score': np.mean(f1score),
                        'ROC_AUC': np.mean(aucs)
                    },
                    ignore_index=True)

        return df_aucs, df_res, df_stds
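For reference, the shape `dict_of_models` is expected to take, inferred from the `m['name']` and `m['classifier']` accesses above; the concrete models here are hypothetical:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

dict_of_models = [
    {'name': 'LR', 'classifier': LogisticRegression(max_iter=1000)},
    {'name': 'RF', 'classifier': RandomForestClassifier(n_estimators=200, random_state=88)},
]
# e.g. df_aucs, df_res, df_stds = selector.mrmr_feature_selection(X, y, dict_of_models, [5, 10, 20])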
Example No. 30
def compute_pr_auc_score(model: Pipeline, features: np.ndarray, labels: np.ndarray) -> float:
    probabilities = model.predict_proba(features)[:, 1]
    precision, recall, _ = precision_recall_curve(labels, probabilities)
    return auc(recall, precision)
Example No. 31
over = SMOTE(sampling_strategy=0.1, k_neighbors=5)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under), ('model', model)]

pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
scores_over = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
print(f"k={k}\n")
print(f"mean recall: {np.mean(scores_over)}\n")
print(scores_over)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipeline.fit(X_train, y_train)

yhat_test = pipeline.predict(X_test)
yhat_test_proba = pipeline.predict_proba(X_test)[:, 1]

confusion_matrix = CM(y_test, yhat_test, labels=np.unique(y_train))

precision_ls, recall_ls, threshold_ls = precision_recall_curve(y_test, yhat_test_proba)

plt.figure(figsize=(10, 10))
# precision_recall_curve returns one fewer threshold than points; pad with 1 so lengths match
threshold_ls = np.append(threshold_ls, 1)
plt.plot(threshold_ls, precision_ls)
plt.plot(threshold_ls, recall_ls)
plt.legend(["precision", "recall"])

tree1 = DecisionTreeClassifier(max_depth=3, min_samples_leaf=30, class_weight="balanced")
tree1.fit(X_train, y_train)

fig = plt.figure(figsize=(25, 20))
Example No. 32
                     ('learner', learner)]
            pipeline = Pipeline(steps=steps)
            pipeline.fit(exec_training_features, exec_training_target)

            # prediction
            predicted = pipeline.predict(exec_test_features)

            # evaluation
            acc = accuracy_score(exec_test_target, predicted)
            precision, recall, f1, _ = precision_recall_fscore_support(
                exec_test_target, predicted, average='binary', zero_division=0)

            if hasattr(pipeline, "predict_proba"):
                false_positive_rate, true_positive_rate, _ = roc_curve(
                    exec_test_target,
                    pipeline.predict_proba(exec_test_features)[:, 1])
            else:
                false_positive_rate, true_positive_rate, _ = roc_curve(
                    exec_test_target, pipeline['learner']._predict_proba_lr(
                        exec_test_features)[:, 1])

            auroc = auc(false_positive_rate, true_positive_rate)

            # precision-recall AUC
            if precision == 0.0 and recall == 0.0 and f1 == 0.0:
                f1 = 'ND'
                auprc = 'ND'
            else:
                precision_, recall_, _ = precision_recall_curve(
                    exec_test_target, predicted)
                f1 = '{:.3f}'.format(f1)
Example No. 33
pipeline_xgboost = Pipeline([('xgboost_over', SMOTE(random_state=45)),
                             ('xgboost_scalar', StandardScaler()),
                             ('classifier_xgboost', XGBClassifier(
                                 base_score=0.5, booster='gbtree',
                                 colsample_bylevel=1, colsample_bynode=1,
                                 colsample_bytree=0.3, gamma=0.0,
                                 learning_rate=0.1, max_delta_step=0, max_depth=6,
                                 min_child_weight=1, missing=None,
                                 n_estimators=100, n_jobs=1, nthread=None,
                                 objective='binary:logistic', random_state=0,
                                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                 seed=None, silent=None, subsample=1,
                                 verbosity=1))])

pipeline_xgboost.fit(x_train, y_train)
prediction = pipeline_xgboost.predict_proba(x_test)
prediction = pd.DataFrame(prediction)
prediction.columns = ['class_0', 'prediction']
prediction = prediction.drop('class_0', axis=1)
prediction = prediction * 100
prediction['prediction'] = prediction['prediction'].astype(int)

# Threshold at 60%: probabilities of 60 or above become class '1', the rest '0'
# (vectorized; the original element-wise replace loop re-scanned the whole column each pass)
prediction['prediction'] = prediction['prediction'].apply(lambda v: '1' if v >= 60 else '0')

prediction.value_counts()
Example No. 34

                                 module__hidden_dim=hidden_dim,
                                 module__dropout_rate=dropout_rate,
                                 batch_size=batch_size,
                                 max_epochs=max_epochs,
                                 train_split=None,
                                 optimizer=torch.optim.Adam,
                                 iterator_train__shuffle=True,
                                 device='cuda')
# Define the pipeline
pipe = Pipeline([('model', LSTM)])
# Train the LSTM
pipe.fit(X_train, y_train.astype(float))

H_list_lstm.clear()
# Get the predictions
X_test_predictions_1 = pipe.predict_proba(X_test)[:, 1]
model_evaluate(y_test, np.ones(len(y_test)))
print(X_test_predictions_1)
print(y_test)
# Evaluate the LSTM predictions
model_evaluate(y_test, X_test_predictions_1)

fpr_1, tpr_1, thresholds_1 = roc_curve(y_test, X_test_predictions_1)
pyplot.plot([0, 1], [0, 1], linestyle='--')
pyplot.plot(fpr_1, tpr_1)
pyplot.show()


# Visualization
def visualize(X, y, points, n_features):
    # Shuffle the points into random order
Example No. 35
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example No. 36
class PennyModel:
    """
    The model for the penny auction. Takes an sklearn classifier and fits the model after transformation.

    Attributes:
        model (SklearnClassifier): The model for the regression
        use_scaler (bool): Whether or not to scale the data first
        sampling_ratio (float): The ratio of the minority class to the majority class
        numeric_features (list(str)): The numerical features of the model
        categorical_features (list(str)): The categorical features of the model
    """
    def __init__(self, model, use_scaler=False, sampling_ratio=1):
        """

        Parameters:
        Returns:
        """
        self.model = model
        self.sampling_ratio = sampling_ratio
        self.use_scaler = use_scaler
        self.categorical_features = [
            'cardtype', 'limited_allowed', 'is_locked', 'is_bidomatic',
            'is_bidomatic0', 'is_bidomatic1', 'is_bidomatic2', 'is_bidomatic3'
        ]
        self.numeric_features = [
            'bid', 'cashvalue', 'bidvalue', 'prevusers', 'bids_so_far0',
            'perc_to_bin0', 'bom_bids_so_far0', 'bom_streak0',
            'prev_is_new_user0', 'prev_auction_count0', 'prev_overbid0',
            'prev_giveup_one0', 'prev_give_before_six0', 'prev_wins0',
            'prev_bids0', 'prev_bom_bids0', 'distance1', 'bids_so_far1',
            'perc_to_bin1', 'bom_bids_so_far1', 'bom_streak1',
            'prev_is_new_user1', 'prev_auction_count1', 'prev_overbid1',
            'prev_giveup_one1', 'prev_give_before_six1', 'prev_wins1',
            'prev_bids1', 'prev_bom_bids1', 'distance2', 'bids_so_far2',
            'perc_to_bin2', 'bom_bids_so_far2', 'bom_streak2',
            'prev_is_new_user2', 'prev_auction_count2', 'prev_overbid2',
            'prev_giveup_one2', 'prev_give_before_six2', 'prev_wins2',
            'prev_bids2', 'prev_bom_bids2', 'distance3', 'bids_so_far3',
            'perc_to_bin3', 'bom_bids_so_far3', 'bom_streak3',
            'prev_is_new_user3', 'prev_auction_count3', 'prev_overbid3',
            'prev_giveup_one3', 'prev_give_before_six3', 'prev_wins3',
            'prev_bids3', 'prev_bom_bids3', 'is_weekend', 'time_of_day'
        ]

    def get_features_as_string(self):
        """
        Returns all the features of the model.

        Parameters:
        Returns:
        """
        return ",".join(self.categorical_features + self.numeric_features)

    def get_column_names_from_ColumnTransformer(self, column_transformer):
        """

        Parameters:
        Returns:
        """
        col_name = []
        # the last transformer is ColumnTransformer's 'remainder'
        for transformer_in_columns in column_transformer.transformers_[:-1]:
            raw_col_name = transformer_in_columns[2]
            if isinstance(transformer_in_columns[1], Pipeline):
                transformer = transformer_in_columns[1].steps[-1][1]
            else:
                transformer = transformer_in_columns[1]
            try:
                names = transformer.get_feature_names(
                    self.categorical_features)
            except AttributeError:  # if no 'get_feature_names' function, use raw column name
                names = raw_col_name
            if isinstance(names, np.ndarray):  # eg.
                col_name += names.tolist()
            elif isinstance(names, list):
                col_name += names
            elif isinstance(names, str):
                col_name.append(names)
        return col_name

    def transform(self, X):
        """

        Parameters:
        Returns:
        """

        rX = X.copy()
        return self.transform_no_copy(rX)

    def transform_no_copy(self, X):
        """

        Parameters:
        Returns:
        """

        #rX = X.copy()
        #print ("2. Transforming data")
        X.is_bidomatic0 = X.is_bidomatic0.astype(str)
        X.is_bidomatic1 = X.is_bidomatic1.astype(str)
        X.is_bidomatic2 = X.is_bidomatic2.astype(str)
        X.is_bidomatic3 = X.is_bidomatic3.astype(str)

        X["fee"] = [
            0 if x == 0 else (1 if x < 50 else 1.99) for x in X["cardvalue"]
        ]
        X["time_of_day"] = [x.hour for x in X["auctiontime"]]
        X["is_weekend"] = [x.weekday() >= 6 for x in X["auctiontime"]]
        return X

    def internal_fit(self, X, y):
        """
        Fits self.model 

        Parameters:
        Returns:
        """
        self.train_pop = X.shape[0]
        self.target_pop = sum(y)
        self.sampled_train_pop = self.target_pop / self.sampling_ratio + self.target_pop
        self.sampled_target_pop = self.target_pop

        # Optionally scale numeric features after imputing.
        numeric_steps = [('imputer',
                          SimpleImputer(strategy='constant', fill_value=-1))]
        if self.use_scaler:
            numeric_steps.append(('scaler', StandardScaler()))
        numeric_transformer = Pipeline_imb(steps=numeric_steps)
        categorical_transformer = Pipeline_imb(
            steps=[('imputer',
                    SimpleImputer(strategy='constant', fill_value='unknown')),
                   ('onehot',
                    OneHotEncoder(handle_unknown='error', drop='first'))])
        preprocessor = ColumnTransformer(
            transformers=[('num', numeric_transformer, self.numeric_features),
                          ('cat', categorical_transformer,
                           self.categorical_features)])
        steps = [
            ('preprocessor', preprocessor),
            ('sampler',
             RandomUnderSampler(sampling_strategy=self.sampling_ratio)),
            ('classifier', self.model),
        ]

        self.pipeline = Pipeline_imb(steps=steps)

        print("4. Fitting model")
        self.pipeline.fit(X, y)

    def fit_already_transformed(self, X, y):
        """
        fits X if it's already been transformed.

        Parameters:
        Returns:
        """
        self.internal_fit(X, y)

    def fit_transform(self, X, y):
        """
        fits and transforms X.

        Parameters:
        Returns:
        """
        self.transform_no_copy(X)
        self.internal_fit(X, y)

    def pickle(self, filename):
        """
        Writes this class as a pickle file to filename

        Parameters:
        Returns:
        """
        print("5. Pickling model as penny_auction.pickle")
        pickle.dump(self, open(filename, "wb"))

    def predict_proba(self, X):
        """
        Returns the predicted probabilities that the auction will end, in the UNDERSAMPLED data set.

        Parameters:
        Returns:
        """
        return self.pipeline.predict_proba(self.transform(X))

    def predict_proba_calibrated(self, X):
        """
        Returns the probabilities from the model AFTER accounting for the undersampling.

        Parameters:
        Returns:
        """
        return self.calibrate_probabilties(self.predict_proba(X))

    def predict(self, X):
        """
        Calls predict on the model to get binary whether or not the auction will end.

        Parameters:
        Returns:
        """
        return self.pipeline.predict(self.transform(X))

    def get_feature_scores(self):
        """
        Returns the feature importances from the model

        Parameters:
        Returns:
        """
        return pd.Series(self.pipeline.steps[2][1].feature_importances_,
                         index=self.get_column_names_from_ColumnTransformer(
                             self.pipeline.named_steps['preprocessor']))

    def calibrate_probabilities(self, data):
        """
        Recalibrates the probabilities to account for the undersampling, so a
        raw model output of 20% may come out as something like 1.2%.

        Parameters:
            data (np.ndarray): Raw probabilities from the undersampled model.
        Returns:
            np.ndarray: Probabilities rescaled to the true class prior.
        """
        true_prior = self.target_pop / self.train_pop
        sampled_prior = self.sampled_target_pop / self.sampled_train_pop
        pos = data * true_prior / sampled_prior
        neg = (1 - data) * (1 - true_prior) / (1 - sampled_prior)
        return pos / (neg + pos)
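
    # A worked example of the prior correction above (hypothetical numbers):
    # if the true positive rate is 1% (true_prior = 0.01) but training was
    # undersampled to a 1:1 ratio (sampled_prior = 0.5), then a raw output of
    # data = 0.2 gives pos = 0.2 * 0.01 / 0.5 = 0.004 and
    # neg = 0.8 * 0.99 / 0.5 = 1.584, so the calibrated probability is
    # 0.004 / (1.584 + 0.004) ~= 0.0025 -- the same order of shrinkage the
    # docstring describes.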

    def get_actual_and_potential_profits(self, X, y):
        """
        returns the actual and potential profits over X

        Parameters:
        Returns:
        """
        potential_profits = (X.cashvalue - X.fee - X.bid / 100) - .4
        actual_profits = y * (X.cashvalue - X.fee - X.bid / 100) - .4
        return potential_profits, actual_profits

    def get_score(self, X, y):
        """
        Returns the expected profit over the set X

        Parameters:
        Returns:
        """
        cprobs = self.predict_proba_calibrated(X)[:, 1]
        pp, ap = self.get_actual_and_potential_profits(X, y)
        expected_value = np.multiply(cprobs, pp) - (1 - cprobs) * .4
        return sum(ap[expected_value > 0])
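
# A minimal usage sketch for the class above (hypothetical: assumes a raw
# DataFrame `df` containing the feature columns listed in __init__ plus
# `cardvalue`, `auctiontime`, and a binary target column `ended`):
from sklearn.ensemble import RandomForestClassifier

pm = PennyModel(RandomForestClassifier(n_estimators=100), sampling_ratio=0.5)
pm.fit_transform(df, df['ended'])        # transforms df in place, then fits
probs = pm.predict_proba_calibrated(df)  # probabilities on the true prior
pm.pickle("penny_auction.pickle")        # persist the fitted model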
Ejemplo n.º 37
0
def train_model(transactions_details):
    X = transactions_details.drop(columns='fraudster')
    y = transactions_details['fraudster'].copy()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    categorical_cols = [
        'currency', 'transaction_state', 'type', 'source', 'entry_method',
        'is_crypto', 'merchant_country', 'phone_country', 'user_country', 'kyc'
    ]
    numerical_cols = [
        'failed_sign_in_attempts', 'age', 'diff_in_days', 'amount_usd'
    ]

    # Preprocessing pipeline: impute + scale the numerical features and
    # one-hot encode the categorical features.
    preprocess = make_column_transformer(
        (make_pipeline(SimpleImputer(), StandardScaler()), numerical_cols),
        (OneHotEncoder(handle_unknown='ignore'), categorical_cols))

    # Create a pipeline
    model = Pipeline([('preprocess', preprocess),
                      ('sampling', SMOTE(random_state=42)),
                      ('classification', RandomForestClassifier())])

    # fit model
    model.fit(X_train, y_train)

    # Predict target vector and class probabilities
    y_pred = model.predict(X_test)
    probs = model.predict_proba(X_test)

    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification report:\n', classification_report(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))

    # Curve metrics need scores rather than hard predictions, so use the
    # positive-class probabilities for the ROC and precision-recall curves.
    false_positive_rate, true_positive_rate, threshold = roc_curve(
        y_test, probs[:, 1])

    # Calculate Area Under the Receiver Operating Characteristic Curve
    roc_auc = roc_auc_score(y_test, probs[:, 1])
    print('ROC AUC Score:', roc_auc)

    # Obtain precision and recall
    precision, recall, thresholds = precision_recall_curve(
        y_test, probs[:, 1])

    # Calculate average precision
    average_precision = average_precision_score(y_test, probs[:, 1])

    # Plot the roc curve
    plot_roc_curve(false_positive_rate, true_positive_rate, roc_auc)

    # Plot recall precision curve
    plot_pr_curve(recall, precision, average_precision)

    return model
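
# train_model above calls plot_roc_curve(fpr, tpr, auc) and
# plot_pr_curve(recall, precision, ap) helpers that the snippet does not
# define. A minimal matplotlib sketch, with signatures assumed from the call
# sites (hypothetical, not the original author's code):
import matplotlib.pyplot as plt

def plot_roc_curve(false_positive_rate, true_positive_rate, roc_auc):
    # Plot the ROC curve with the chance diagonal for reference.
    plt.figure()
    plt.plot(false_positive_rate, true_positive_rate,
             label='ROC (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()

def plot_pr_curve(recall, precision, average_precision):
    # Plot the precision-recall curve as a step function.
    plt.figure()
    plt.step(recall, precision, where='post',
             label='PR (AP = %0.3f)' % average_precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')
    plt.show()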
from sklearn.metrics import plot_roc_curve

# Set the axes
ax = plt.gca()

# Plot the ROC curves of each GridSearch object on one graph for comparison.
logis_disp.plot(ax=ax, alpha=0.8)
gs_disp = plot_roc_curve(gs, X_test, y_test, ax=ax, alpha=0.8)
gs2_disp = plot_roc_curve(gs2, X_test, y_test, ax=ax, alpha=0.8)
# -

# Plot the confusion matrices of the fitted models.
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(search, X_test, y_test)
plot_confusion_matrix(gs, X_test, y_test)
plot_confusion_matrix(gs2, X_test, y_test)

# As we can see from the above results, the Bagged Classifier displays the best performance, both in terms of the greatest AUC and in terms of the largest number of properly classified observations, hence we shall use it as our final inferential model.

# # Final Model Pipeline

# +
# Build the final model using the tuned parameters from before
bestgbc = GradientBoostingClassifier(n_estimators=770, learning_rate=0.05,
                                     max_features=10, subsample=0.8,
                                     random_state=42, max_depth=3,
                                     min_samples_split=400)

# Put together the final pipeline with scaled inputs for the model, and make predictions.
finalpipe = Pipeline(steps=[('scale', StandardScaler()),
                            ('classifier', bestgbc)])
finalpipe.fit(X_train_res, y_train_res.values.ravel())
y_predfinal = finalpipe.predict(X_test_res)
y_predprobs = finalpipe.predict_proba(X_test_res)
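
# As a quick sanity check on the final pipeline (a sketch; assumes a
# y_test_res split matching X_test_res):
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_test_res, y_predfinal))
print('Final ROC AUC:', roc_auc_score(y_test_res, y_predprobs[:, 1]))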