def train(classifier, df, y, user_id):
    '''The main training function that runs on a separate process.'''
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.33, random_state=0)
    base_estimator = AdaBoostClassifier(n_estimators=10)
    rusboost = RUSBoostClassifier(n_estimators=10,
                                  base_estimator=base_estimator)
    rusboost.fit(X_train, y_train)
    y_pred_rusboost = rusboost.predict(X_test)
    print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
        balanced_accuracy_score(y_test, y_pred_rusboost),
        geometric_mean_score(y_test, y_pred_rusboost)))
    cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
    joblib.dump(rusboost, user_id + '.pkl')
    classifier.classifierStatus = "trained"
    print("Done training")
    return classifier
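# The docstring says train() runs on a separate process. A minimal sketch of
# such a dispatch with the standard library, assuming the arguments are
# already defined and picklable (the surrounding names are illustrative):
from multiprocessing import Process

p = Process(target=train, args=(classifier, df, y, user_id))
p.start()
# ... do other work, then:
p.join()
# Note: attribute mutations on `classifier` happen in the child process and
# do not propagate back; the parent would reload state from the dumped .pkl.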
def get_best_parameters(self, features, labels, base_estimator=None,
                        n_iter=300, cv=3, verbose=1, random_state=1,
                        n_jobs=-1):
    # note: base_estimator, n_iter and random_state are accepted but unused,
    # since GridSearchCV performs an exhaustive search over self.random_grid
    clf_random = \
        GridSearchCV(
            estimator=RUSBoostClassifier(),
            param_grid=self.random_grid,
            cv=cv,
            verbose=verbose,
            n_jobs=n_jobs,
            iid=False,
            error_score=0
        )
    _features = features
    if 1 == len(features.values.shape):
        # imbalanced-learn's RUSBoostClassifier does not accept 1-D feature
        # arrays, so reshape (N,) into (N, 1)
        _features = features.values.reshape(-1, 1)
    clf_random.fit(_features, labels)
    return clf_random.best_params_
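# For context, `self.random_grid` is defined elsewhere in the class; a
# plausible (purely illustrative) grid over common RUSBoostClassifier knobs
# might look like:
random_grid = {
    'n_estimators': [10, 50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'sampling_strategy': ['auto', 'majority'],
}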
def train(X_train, y_train, method_name, base_classifier, T):
    if method_name == 'adaboost':
        clf = AdaBoostClassifier(base_estimator=base_classifier,
                                 n_estimators=T)
    elif method_name == 'RUSBoost':
        clf = RUSBoostClassifier(base_estimator=base_classifier,
                                 n_estimators=T,
                                 sampling_strategy='majority')
    elif method_name == 'SMOTEBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'SMOTETomekBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE-TOMEK',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'SMOTEENNBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE-ENN',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'DERSBoost':
        clf = DERSBoost(base_estimator=base_classifier, n_estimators=T,
                        NGEN=50)
    else:
        # guard against a silent NameError on clf below
        raise ValueError('unknown method_name: {}'.format(method_name))
    start_time = time()
    clf.fit(X_train, y_train)
    elapsed_time = time() - start_time
    return clf, elapsed_time
def test_rusboost_sample_weight(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    sample_weight = np.ones_like(y)
    rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0)

    # Predictions should be the same when sample_weight are all ones
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)
    y_pred_no_sample_weight = rusboost.fit(X, y).predict(X)

    assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight)

    rng = np.random.RandomState(42)
    sample_weight = rng.rand(y.shape[0])
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)

    with pytest.raises(AssertionError):
        assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight)
def fit(self, X, Y, sample_weight=None):
    import sklearn.tree

    self.n_estimators = int(self.n_estimators)
    self.learning_rate = float(self.learning_rate)
    self.max_depth = int(self.max_depth)
    base_estimator = sklearn.tree.DecisionTreeClassifier(
        max_depth=self.max_depth)

    from imblearn.ensemble import RUSBoostClassifier
    estimator = RUSBoostClassifier(base_estimator=base_estimator,
                                   n_estimators=self.n_estimators,
                                   learning_rate=self.learning_rate,
                                   algorithm=self.algorithm,
                                   random_state=self.random_state)
    estimator.fit(X, Y, sample_weight=sample_weight)
    self.estimator = estimator
    return self
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                        random_state=1)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have a different random state
    assert (len({sampler.random_state for sampler in rusboost.samplers_}) ==
            len(rusboost.samplers_))
    # each estimator in the ensemble should have a different random state
    assert (len({est.random_state for est in rusboost.estimators_}) ==
            len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def get_models(self):
    base_lr = LogisticRegression(class_weight='balanced')
    ovr_lr = OneVsRestClassifier(base_lr)
    base_eec = EasyEnsembleClassifier(n_estimators=10)
    ovr_eec = OneVsRestClassifier(base_eec)
    base_rus = RUSBoostClassifier(n_estimators=50)
    ovr_rus = OneVsRestClassifier(base_rus)
    base_bbc = BalancedBaggingClassifier(n_estimators=10)
    ovr_bbc = OneVsRestClassifier(base_bbc)
    base_brf = BalancedRandomForestClassifier(n_estimators=100)
    ovr_brf = OneVsRestClassifier(base_brf)
    estimators = [('lr', ovr_lr), ('eec', ovr_eec), ('rus', ovr_rus),
                  ('bbc', ovr_bbc), ('brf', ovr_brf)]
    return estimators
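# The (name, estimator) pairs returned above match the `estimators` argument
# of scikit-learn's voting/stacking ensembles. A minimal sketch of one such
# use, assuming `model_factory` is an instance of the enclosing class and
# X_train/X_test/y_train/y_test exist (these names are illustrative):
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(estimators=model_factory.get_models(),
                            voting='soft')
ensemble.fit(X_train, y_train)
print(ensemble.score(X_test, y_test))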
def __init__(self):
    from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, \
        BorderlineSMOTE, RandomOverSampler
    from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, \
        InstanceHardnessThreshold, NearMiss, TomekLinks, \
        EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, \
        OneSidedSelection, CondensedNearestNeighbour, NeighbourhoodCleaningRule
    from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, \
        BalancedBaggingClassifier, BalancedRandomForestClassifier, \
        BalanceCascade, RUSBoostClassifier

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE()
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # instantiated for consistency with the other entries (the original
        # stored the bare class here)
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier()
    }
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have a different random state
    assert (len(set(sampler.random_state for sampler in rusboost.samplers_))
            == len(rusboost.samplers_))
    # each estimator in the ensemble should have a different random state
    assert (len(set(est.random_state for est in rusboost.estimators_))
            == len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def get_models():
    models, names = list(), list()
    # Logistic Regression
    models.append(
        LogisticRegression(solver='liblinear', class_weight='balanced',
                           penalty='l2'))
    names.append('Logistic Regression')
    # AdaBoost
    names.append('Ada Boost')
    models.append(AdaBoostClassifier())
    # Gradient Boosting
    names.append('Gradient Boosting')
    models.append(GradientBoostingClassifier())
    # RUSBoostClassifier
    names.append('RUSBoost Classifier')
    models.append(RUSBoostClassifier())
    # RandomForestClassifier (class-weighted)
    names.append('RandomForestClassifier')
    models.append(RandomForestClassifier(class_weight='balanced'))
    # EasyEnsembleClassifier
    names.append('EasyEnsembleClassifier')
    models.append(EasyEnsembleClassifier())
    return models, names
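# A sketch of how the paired lists might be consumed; the cross-validation
# setup and the X/y names are assumptions, not from the original:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

models, names = get_models()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for name, model in zip(names, models):
    # balanced accuracy is a sensible headline metric on imbalanced labels
    scores = cross_val_score(model, X, y, scoring='balanced_accuracy', cv=cv)
    print('%s: %.3f (%.3f)' % (name, scores.mean(), scores.std()))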
def test_balanced_random_forest_error(imbalanced_dataset, boosting_params,
                                      err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    # `message` is not a valid argument to pytest.raises; `match` checks the
    # exception text against a regular expression
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
def _init_classifier(self, opt):
    if "classifier_opt" in opt:
        opt = opt['classifier_opt']
    if "base_estimator" in opt:
        b_est = self._init_classifier(opt["base_estimator"])
    else:
        b_est = None
    n_estimators = opt.get("n_estimators", 200)
    max_iter = opt.get("max_iter", 100000)
    num_parallel_tree = opt.get("num_parallel_tree", 5)
    layer_structure = opt.get("layer_structure", (100,))

    if opt["type"] in ["random_forrest", "rf"]:  # original key spelling kept
        return RandomForestClassifier(n_estimators=n_estimators,
                                      class_weight="balanced", n_jobs=-1)
    elif opt["type"] == "ada_boost":
        return AdaBoostClassifier(base_estimator=b_est,
                                  n_estimators=n_estimators)
    elif opt["type"] in ["logistic_regression", "lr"]:
        return LogisticRegression(class_weight='balanced', max_iter=max_iter)
    elif opt["type"] == "sgd":
        return SGDClassifier(class_weight='balanced', max_iter=max_iter)
    elif opt["type"] in ["gaussian_bayes", "bayes", "gaussian_nb"]:
        return GaussianNB()
    elif opt["type"] in ["support_vector_machine", "svm"]:
        return SVC(kernel='rbf', class_weight='balanced', gamma="scale")
    elif opt["type"] in ["multilayer_perceptron", "mlp"]:
        return MLPClassifier(hidden_layer_sizes=layer_structure,
                             max_iter=max_iter)
    elif opt["type"] in ["decision_tree", "dt", "tree"]:
        return DecisionTreeClassifier()
    elif opt["type"] in ["b_decision_tree", "b_dt", "b_tree"]:
        return DecisionTreeClassifier(class_weight="balanced")
    elif opt["type"] in ["neighbours", "knn"]:
        return KNeighborsClassifier(n_neighbors=opt["n_neighbours"])
    elif opt["type"] == "extra_tree":
        return ExtraTreesClassifier(n_estimators=n_estimators,
                                    class_weight="balanced", n_jobs=-1)
    elif opt["type"] == "xgboost":
        return XGBClassifier(objective='binary:logistic',
                             n_estimators=n_estimators,
                             num_parallel_tree=num_parallel_tree,
                             tree_method="hist", booster="gbtree", n_jobs=-1)
    elif opt["type"] in ["b_random_forrest", "b_rf"]:
        return BalancedRandomForestClassifier(n_estimators=n_estimators,
                                              n_jobs=-1)
    elif opt["type"] == "b_bagging":
        return BalancedBaggingClassifier(base_estimator=b_est,
                                         n_estimators=n_estimators)
    elif opt["type"] == "b_boosting":
        return RUSBoostClassifier(base_estimator=b_est,
                                  n_estimators=n_estimators)
    else:
        raise ValueError("type: {} not recognised".format(opt["type"]))
# achieve worse performance.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=np.unique(satimage.target),
                      ax=ax[1], title='RUSBoost classifier')

plt.show()
def nosampling_pipeline(data=[], verbose=False, clean=False, plot=False):
    results_table = []
    results = []
    rand_state = 42
    if clean:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X_vals = X.values
        y_vals = y.values
        X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)
        X = X_inliers
        y = y_inliers
    else:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X = X.values
        y = y.values

    # shuffle=True is required for random_state to take effect; recent
    # scikit-learn versions raise an error when it is set with shuffle=False
    sss = StratifiedKFold(n_splits=10, random_state=rand_state, shuffle=True)
    print("StratKFold:", sss)

    # List of models to be used
    models = [
        DecisionTreeClassifier(random_state=rand_state),
        RUSBoostClassifier(random_state=rand_state),
        LogisticRegression(random_state=rand_state),
        BalancedBaggingClassifier(random_state=rand_state),
        RandomForestClassifier(random_state=rand_state),
        EasyEnsembleClassifier(
            base_estimator=RandomForestClassifier(random_state=rand_state),
            random_state=rand_state),
        BalancedRandomForestClassifier(random_state=rand_state)
    ]
    # column names must match the keys appended below and the plot section
    results_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr',
                                          'auc_score'])

    # Instantiate lists to store each of the models' results
    classifier = []
    strategy = []
    samp_technique = []
    accuracy = []
    f1 = []
    auc = []
    recall = []
    precision = []
    g_mean = []
    start = time.time()

    # Run through each of the models to get their performance metrics
    sampling_strat = 'no_sampling'
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for model in models:
            print("Using length of X for training: {}; "
                  "Using length of y for training: {}"
                  .format(len(X_train), len(y_train)))
            print("Using length of X for testing: {}; "
                  "Using length of y for testing: {}"
                  .format(len(X_test), len(y_test)))
            print("Currently training model - {} using sampling strategy - {}"
                  .format(model.__class__.__name__, sampling_strat))
            print("--" * 20)
            clf = model
            pipe = make_pipeline(clf)
            pipe.fit(X_train, y_train)
            test_preds = pipe.predict(X_test)

            classifier.append(model.__class__.__name__)
            samp_technique.append(sampling_strat)
            strategy.append(" %s+%s " % (str(model.__class__.__name__),
                                         sampling_strat))
            f1.append(f1_score(y_test, test_preds))
            accuracy.append(accuracy_score(y_test, test_preds))
            auc.append(roc_auc_score(y_test, test_preds))
            recall.append(recall_score(y_test, test_preds))
            precision.append(precision_score(y_test, test_preds))
            g_mean.append(geometric_mean_score(y_test, test_preds,
                                               average='binary'))
            fpr, tpr, _ = roc_curve(y_test, test_preds)
            auc_score = roc_auc_score(y_test, test_preds)
            results_table = results_table.append(
                {
                    'classifiers': model.__class__.__name__,
                    'fpr': fpr,
                    'tpr': tpr,
                    'auc_score': auc_score
                },
                ignore_index=True)

            # Print the model and its report
            if verbose:
                print('Classification Model: ', model.__class__.__name__,
                      '\n')
                print('Sampling Strategy Model: ', sampling_strat, '\n')
                print(confusion_matrix(y_test, test_preds), '\n')
                print(classification_report_imbalanced(y_test, test_preds),
                      '\n')

    # Round the results for convenience
    f1 = [float(round(n, 4)) for n in f1]
    auc = [float(round(n, 4)) for n in auc]
    g_mean = [float(round(n, 4)) for n in g_mean]
    accuracy = [float(round(n, 4)) for n in accuracy]
    precision = [float(round(n, 4)) for n in precision]
    recall = [float(round(n, 4)) for n in recall]

    # Store results in a dataframe. With 10 folds there are
    # n_models * n_folds entries per metric, so the columns are left to be
    # inferred rather than forced to a fixed list of seven model names
    results = pd.DataFrame(
        [
            classifier, strategy, samp_technique, f1, auc, g_mean, accuracy,
            precision, recall
        ],
        index=[
            'classifier', 'strategy', 'samp_technique', 'f1', 'roc_auc',
            'g_mean', 'accuracy', 'precision', 'recall'
        ])

    if plot:
        results_table.set_index('classifiers', inplace=True)
        fig = plt.figure(figsize=(8, 6))
        results_table = results_table.sort_values(by=['auc_score'],
                                                  ascending=False)
        for i in results_table.index:
            plt.plot(results_table.loc[i]['fpr'],
                     results_table.loc[i]['tpr'],
                     label="{}, AUC={:.4f}".format(
                         i, results_table.loc[i]['auc_score']))
        plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
        plt.xticks(np.arange(0.0, 1.1, step=0.1))
        plt.xlabel("False Positive Rate", fontsize=15)
        plt.yticks(np.arange(0.0, 1.1, step=0.1))
        plt.ylabel("True Positive Rate", fontsize=15)
        plt.title('ROC curve for classifiers on the full data split '
                  'using sampling technique: {}'.format(sampling_strat),
                  fontweight='bold', fontsize=15)
        plt.legend(prop={'size': 13}, loc='lower right')
        plt.show()

    end = time.time()
    print("Time elapsed:", end - start)
    # Change orientation of the dataframe
    return results.transpose()
df = pd.read_csv('data/poker-8-9_vs_5.csv')
X, y, z = prepare_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                    test_size=0.7)
kf = StratifiedKFold(n_splits=10)
kf.get_n_splits(X, y)

bbc = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
brfc = BalancedRandomForestClassifier(max_depth=2, random_state=0)
eec = EasyEnsembleClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
rbc = RUSBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0),
                         random_state=0)

bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
models = [
    DecisionTreeClassifier(random_state=r),
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    LogisticRegression(random_state=r),
    SVC(random_state=r, kernel='sigmoid'),
    MLPClassifier(random_state=r),
    BaggingClassifier(random_state=r),
    RandomForestClassifier(random_state=r),
    GradientBoostingClassifier(random_state=r),
    LGBMClassifier(),
    XGBClassifier(random_state=r),
    CatBoostClassifier(random_state=r, verbose=False),
    BalancedBaggingClassifier(random_state=r),
    BalancedRandomForestClassifier(random_state=r),
    RUSBoostClassifier(random_state=r)
]
names = [
    "DecisionTree", "KNeighbors", "GaussianNB", "MultinomialNB",
    "LogisticRegression", "SVC", "MLPClassifier", "Ensemble-Bagging",
    "Ensemble-RandomForest", "Ensemble-GradientBoosting",
    "LightGradientBoosting", "XGBoost", "CatBoost", "BalancedBagging",
    "BalancedRandomForest", "RUSBoost"
]

outputs = {}
for name, model in zip(names, models):
    model.fit(x_train, y_train)
    output = model.predict(test_dataframe)
    outputs[name] = output
# initialize 5-fold CV
skf = StratifiedKFold(n_splits=5)
cv5_ids = list(skf.split(full_data, labels))
# print(cv5_ids)

# initialize model
# lin_clf = svm.SVC(decision_function_shape='ovo', probability=True)
# lin_clf = svm.LinearSVC()
# lin_clf = LogisticRegression()
# lin_clf = svm.SVC(kernel='sigmoid')
# lin_clf = MLPClassifier((256, 256), activation='relu', max_iter=1000)
# lin_clf = RandomForestClassifier(n_estimators=5000, max_depth=2,
#                                  random_state=0)
single_clf = tree.DecisionTreeClassifier(max_depth=1)
# single_clf = LogisticRegression()
lin_clf = RUSBoostClassifier(base_estimator=single_clf, n_estimators=5000)

# initialize booster
sm = SMOTE(random_state=42)

# perform 5-fold CV
precision_avg = []
recall_avg = []
fscore_avg = []
acc_avg = 0.
for sp in cv5_ids:
    train_data, train_labels = full_data[sp[0]], labels[sp[0]]
    # train_data, train_labels = sm.fit_sample(train_data, train_labels)
    test_data, test_labels = full_data[sp[1]], labels[sp[1]]
    lin_clf.fit(train_data, train_labels)
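# The snippet cuts off right after the fit call. A hypothetical per-fold
# evaluation helper that would fill the precision/recall/F-score lists
# declared above (the macro averaging is an assumption):
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_fold(clf, test_data, test_labels):
    """Illustrative helper, not from the original source."""
    pred = clf.predict(test_data)
    prec, rec, fscore, _ = precision_recall_fscore_support(
        test_labels, pred, average='macro', zero_division=0)
    return prec, rec, fscore, accuracy_score(test_labels, pred)

# Inside the fold loop one would append prec/rec/fscore to precision_avg,
# recall_avg and fscore_avg, and accumulate the accuracy into acc_avg.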
def learning_model(year, class_weight):
    iters = 300
    gap = 2
    year_test = year
    data_test = reader.ordinary_data_reader('uscecchini28.csv', year_test,
                                            year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]
    '''
    An if-else was used to judge whether class_weight is None, to prevent an
    exception from string concatenation, and a try-except wraps RUSBoost with
    a DecisionTreeClassifier using a custom class_weight.
    If the model trained last time can be found on disk, it is loaded and
    used to predict directly, without training twice; otherwise the model is
    trained and saved to disk.
    '''
    # if class_weight is not None:
    #     # current_model_name locates/saves the trained model with a custom
    #     # class_weight
    #     current_model_name = class_weight + "_" + str(year_test) + ".m"
    # else:
    #     current_model_name = str(year_test) + ".m"
    current_model_name = class_weight + "_" + str(year_test) + ".m"
    try:
        rusboost_model = joblib.load(current_model_name)
    except Exception:
        print('Running RUSBoost (training period: 1991-' +
              str(year_test - gap) + ', testing period: ' + str(year_test) +
              ', with ' + str(gap) + '-year gap)...')
        data_train = reader.ordinary_data_reader('uscecchini28.csv', 1991,
                                                 year_test - gap)
        x_train = data_train.features
        y_train = data_train.labels
        newpaaer_train = data_train.newpaaers

        # format labels and newpaaers for the step:
        # data_test.newpaaers(data_test.labels ~= 0)
        data_test.newpaaers = np.array(data_test.newpaaers)
        data_test.labels = np.array(data_test.labels)
        # replace the NaNs that should remain in the array with 0
        for i in range(len(data_test.newpaaers)):
            if np.isnan(data_test.newpaaers[i]):
                if data_test.labels[i] != 0:
                    data_test.newpaaers[i] = 0
        # drop all the NaNs remaining in the array
        data_test.newpaaers = np.array(
            [x for x in data_test.newpaaers if str(x) != 'nan'])
        # turn the 0 placeholders back into NaN
        for i in range(len(data_test.newpaaers)):
            if int(data_test.newpaaers[i]) == 0.0:
                data_test.newpaaers[i] = np.NaN
        # deduplicate to get the final newpaaer_test
        newpaaer_test = np.unique(data_test.newpaaers)
        '''
        Caution: y_train is converted to an array so that its indices match
        the formatted newpaaer_train array in the loop below.
        '''
        y_train = np.array(y_train)
        num_frauds = sum(y_train == 1)
        print(num_frauds)
        '''
        np.in1d replaces MATLAB's ismember, and a temporary array handles
        serial frauds, i.e. the step:
        y_train[ismember(newpaaer_train, newpaaer_test)] = 0
        '''
        temp_array = np.array(np.in1d(newpaaer_train,
                                      newpaaer_test)).astype(int)
        for i in range(len(temp_array)):
            if temp_array[i] == 1:
                y_train[i] = 0
        del temp_array
        num_frauds = num_frauds - sum(y_train == 1)
        print('Recode', num_frauds,
              'overlapped frauds (i.e., change fraud label from 1 to 0).')

        start_time = time.perf_counter()
        rusboost_model = RUSBoostClassifier(
            DecisionTreeClassifier(min_samples_leaf=5,
                                   class_weight=class_weight),
            learning_rate=0.1, n_estimators=iters)
        rusboost_model.fit(x_train, y_train)
        end_time = time.perf_counter()
        t_train = end_time - start_time
        joblib.dump(rusboost_model, current_model_name)
        print('Training time: %.3f seconds' % t_train)

    start_time = time.perf_counter()
    pred = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time
    print('Testing time: %.3f seconds' % t_test)

    # test figures
    # note: AUC here is computed on hard predictions; using prob[:, 1]
    # would give the usual probability-based AUC
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, pred))
    # np.set_printoptions(precision=4, threshold=8, edgeitems=4,
    #                     linewidth=75, suppress=True, nanstr='nan',
    #                     infstr='inf')
    print("precision: %.2f%%" %
          np.multiply(metrics.precision_score(y_test, pred,
                                              zero_division=0), 100))
    print("recall: %.2f%%" %
          np.multiply(metrics.recall_score(y_test, pred), 100))

    # dump part of the results (fraud probability)
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[pred, prob]
    data = np.c_[test, data]
    file_data = pd.DataFrame(data)
    csv_file_name = 'data.csv'
    file_data.to_csv(csv_file_name, header=False, index=False)
def Gridsearchcv(X_train, X_test, y_train, y_test):
    # Scale numeric values
    num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', num_transformer, make_column_selector(pattern='EDAD'))
        ])

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', PipelineHelper([
            ('svc', SVC()),
            ('gb', GradientBoostingClassifier()),
            ('xgb', XGBClassifier(use_label_encoder=False)),
            ('eec', EasyEnsembleClassifier()),
            ('rbc', RUSBoostClassifier()),
            ('bbc', BalancedBaggingClassifier()),
            ('brf', BalancedRandomForestClassifier()),
        ])),
    ])

    params = {
        'clf__selected_model': pipe.named_steps['clf'].generate({
            # EasyEnsembleClassifier
            'eec__n_estimators': [10, 25, 50, 100],
            'eec__warm_start': [False, True],
            'eec__replacement': [False, True],
            # RUSBoostClassifier
            'rbc__algorithm': ['SAMME', 'SAMME.R'],
            'rbc__n_estimators': [10, 50, 100, 200, 500],
            'rbc__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            # BalancedBaggingClassifier
            'bbc__base_estimator': [HistGradientBoostingClassifier(), None],
            'bbc__n_estimators': [10, 50, 100, 200, 500, 750, 1000],
            'bbc__max_samples': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'bbc__max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            # BalancedRandomForestClassifier
            'brf__criterion': ['gini', 'entropy'],
            'brf__n_estimators': [int(x) for x in
                                  np.linspace(start=20, stop=200, num=5)],
            'brf__max_depth': [int(x) for x in np.linspace(1, 45, num=3)],
            'brf__min_samples_split': range(2, 10),
            'brf__min_samples_leaf': [1, 3, 5, 10],
            'brf__max_features': ['auto', 'sqrt', 'log2'],
            # SVC
            'svc__C': [0.1, 0.5, 1, 10, 30, 40, 50, 75, 100, 500, 1000],
            'svc__gamma': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.07, 0.1,
                           0.5, 1, 5, 10, 50],
            'svc__kernel': ['rbf'],
            # GradientBoosting
            "gb__learning_rate": [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075,
                                  0.1, 0.15, 0.2],
            "gb__max_depth": [3, 7, 8, 9, 10, 50],
            "gb__max_features": ["log2", "sqrt"],
            "gb__subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
            "gb__n_estimators": [10, 50, 100, 200, 300],
            # XGBoost
            'xgb__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            'xgb__min_child_weight': np.arange(1, 21, 5),
            'xgb__subsample': np.arange(0.05, 1.01, 0.05),
            'xgb__verbosity': [0],
            # alternative XGBoost grids kept from earlier experiments:
            # 'xgb__booster': ['gbtree', 'gblinear', 'dart'],
            # 'xgb__max_depth': [15, 20, 25],
            # 'xgb__max_depth': range(1, 11),
            # 'xgb__min_child_weight': range(1, 21),
            # 'xgb__n_estimators': [100],
            # 'xgb__n_estimators': [400, 700, 1000],
            # 'xgb__colsample_bytree': [0.7, 0.8],
            # 'xgb__reg_alpha': [1.1, 1.2, 1.3],
            # 'xgb__reg_lambda': [1.1, 1.2, 1.3],
            # 'xgb__subsample': [0.7, 0.8, 0.9],
            # 'xgb__eval_metric': ['mlogloss'],
        }),
    }
    scoring = {'ba': 'balanced_accuracy', 'ap': 'average_precision',
               'F1': 'f1', 'ra': 'roc_auc', 'rc': 'recall'}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5)
    # https://towardsdatascience.com/hyper-parameter-tuning-with-randomised-grid-search-54f865d27926
    # n_iter: 30, 60, 100
    grid = RandomizedSearchCV(pipe, params, refit='ba', cv=cv, verbose=3,
                              n_jobs=-1, n_iter=60, scoring=scoring,
                              return_train_score=True)
    grid.fit(X_train, y_train)
    df_grid = pd.DataFrame(grid.cv_results_)
    df_grid = df_grid.sort_values(by=['mean_test_ba'], ascending=False)
    df_grid = df_grid[[
        'param_clf__selected_model', 'params', 'mean_fit_time',
        'std_fit_time', 'mean_test_ba', 'std_test_ba', 'rank_test_ba',
        'mean_test_ap', 'std_test_ap', 'rank_test_ap', 'mean_test_ra',
        'std_test_ra', 'rank_test_ra', 'mean_test_F1', 'std_test_F1',
        'rank_test_F1'
    ]]
    print("Best-Fit Parameters From Training Data:\n", grid.best_params_)
    grid_predictions = grid.best_estimator_.predict(X_test)
    report = classification_report(y_test, grid_predictions, output_dict=True)
    report = pd.DataFrame(report).transpose()
    print(report)
    print(confusion_matrix(y_test, grid_predictions))
    return grid, df_grid, report
y_test = y[test_index]

# classifier = CUSBoostClassifier(**a)
# classifier = AdaboostClassifier(**a)
# classifier = RusBoost(depth=depth, n_estimators=estimators)
# classifier = AdaboostNC_Classifier(**a)
# classifier = CUSBoostNC_Classifier(**a)
# classifier = RusBoost(**a)
classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8),
                                n_estimators=64)

# classifier.fit(X_train, y_train, number_of_clusters, 0.5)  # CUSBoost
# classifier.fit(X_train, y_train)  # Adaboost
# classifier.fit(X_train, y_train, 0.5)  # AdaboostNC
# classifier.fit(X_train, y_train, 6, 0.5)
# classifier.fit(X_train, y_train, 6, fraction / 100, 8)
classifier.fit(X_train, y_train)

predictions = classifier.predict_proba(X_test)
prediction_ = classifier.predict(X_test)

auc = roc_auc_score(y_test, predictions[:, 1])
f1 = f1_score(y_test, prediction_)
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
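# The boosting_params/err_msg pairs come from a parametrize decorator that is
# not part of this snippet; one plausible shape for it (the concrete invalid
# values and messages below are guesses, not the library's actual list):
@pytest.mark.parametrize(
    "boosting_params, err_msg",
    [
        ({"n_estimators": "whatever"}, "n_estimators must be an integer"),
        ({"n_estimators": -100}, "n_estimators must be greater than zero"),
    ],
)
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    ...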
clf_results = pd.DataFrame()

# define models
models = {
    'ExtraTrees': ExtraTreesClassifier(),
    'RandomForest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVC': SVC(),
    'LogitBoost': LogitBoost(),
    'XGBClassifier': XGBClassifier(),
    'ComplementNB': ComplementNB(),
    'BalancedBagging': BalancedBaggingClassifier(),
    'BalancedRandomForest': BalancedRandomForestClassifier(),
    'RUSBoost': RUSBoostClassifier(),
    'EasyEnsemble': EasyEnsembleClassifier()
}

# define model parameters for parameter search
param_extra_trees = {
    'n_estimators': [5, 10, 50, 100, 200],
    'min_samples_split': [2, 4],
    'max_depth': [2, 3, None],
    'max_features': ['sqrt', None],
    'class_weight': ['balanced']
}
param_random_forest = {
    'n_estimators': [5, 10, 50, 100, 200],
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    shuffle=True)
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
imbl_methods = {
    'eec': EasyEnsembleClassifier(random_state=42, sampling_strategy=1.,
                                  n_jobs=-1, base_estimator=model),
    'rub': RUSBoostClassifier(random_state=42, sampling_strategy=1.,
                              base_estimator=model)
}
for method in imbl_methods.keys():
    imbl = imbl_methods[method]
    imbl.fit(X_train, y_train)
    y_hat_test = imbl.predict(X_test)
    y_hat_train = imbl.predict(X_train)
    print(f"Results of {method}")
    print(imbl.score(X_test, y_test))
    print("Train data")
    print(classification_report(y_train, y_hat_train))
    print("Test data")
    print(classification_report(y_test, y_hat_test))
def classiferSet(pre_cost_weight=20):
    # xgt = xgb.XGBClassifier(learning_rate=0.1, scale_pos_weight=10,
    #                         n_estimators=100, random_state=1)  # 80.77%
    xgt = xgb.XGBClassifier(
        learning_rate=0.1,
        # subsample=0.99,
        max_depth=3,
        scale_pos_weight=pre_cost_weight,
        n_estimators=80,
        # cv=5,
        random_state=27,
        nthread=2  # use more threads only for large datasets
    )  # 84.62%

    ada = AdaBoostClassifier(n_estimators=100, learning_rate=.1,
                             random_state=1234)  # (0,130): .815
    # gbt = GradientBoostingClassifier(n_estimators=100, subsample=1.0,
    #                                  learning_rate=1,
    #                                  random_state=1234)  # (0,130): .830
    gbt = GradientBoostingClassifier(n_estimators=100, subsample=0.99,
                                     learning_rate=.1,
                                     random_state=1234)  # (0,130): .861

    rf = RandomForestClassifier(
        n_estimators=100,
        # max_depth=10,
        oob_score=True,
        class_weight={0: 1, 1: pre_cost_weight},
        # class_weight='balanced',
        random_state=1234)  # .846
    brf = BalancedRandomForestClassifier(n_estimators=100, oob_score=True,
                                         class_weight={0: 1,
                                                       1: pre_cost_weight},
                                         random_state=1234)
    rus = RUSBoostClassifier(n_estimators=100, random_state=1234)

    # https://www.kaggle.com/c/home-credit-default-risk/discussion/60921
    # https://sites.google.com/view/lauraepp/parameters
    lgbm = lightgbm.LGBMClassifier(
        boosting_type='dart',  # 'gbdt', 'goss', 'dart'
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.1,
        # using {0: 1, 1: pre_cost_weight} here is inferior to the default
        class_weight=None,
        random_state=1234)

    ourmodels = dict({
        'AdaBoost': ada,
        'GradientBoost': gbt,
        'RandomForest': rf,
        'BalancedRandomForest': brf,
        'RUSBoost': rus,
        'XGBoost': xgt,
        'LightGBM': lgbm
    })
    return ourmodels
def pipe_main(pipe=None):
    '''Pipeline construction using sklearn estimators; the final step
    currently supports only classifiers.

    .. note::
        data flows through a pipeline consisting of the steps below:

        raw data --> clean --> encoding --> scaling -->
        feature construction --> feature selection --> resampling -->
        final estimator

        see scikit-learn preprocess & estimators

    parameter
    ----
    pipe - str
        in the format of 'xx_xx', where each 'xx' names a step in the
        pipeline, default None

    return
    ----
    1) pipeline instance of the chosen steps
    2) if pipe is None, a dict indicating the possible choices of 'steps'
    '''
    clean = {
        'clean': Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA': Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean': Split_cls(dtype_filter='not_datetime',
                               na1='most_frequent', na2='mean'),
    }
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }
    resample = {
        # over-sampling
        'rover': RandomOverSampler(),
        'smote': SMOTE(),
        'bsmote': BorderlineSMOTE(),
        'adasyn': ADASYN(),
        # under-sampling: controlled methods
        'runder': RandomUnderSampler(),
        'nearmiss': NearMiss(version=3),
        'pcart': InstanceHardnessThreshold(),
        # under-sampling: cleaning methods
        'tlinks': TomekLinks(n_jobs=-1),
        'oside': OneSidedSelection(n_jobs=-1),
        'cleanNN': NeighbourhoodCleaningRule(n_jobs=-1),
        'enn': EditedNearestNeighbours(n_jobs=-1),
        'ann': AllKNN(n_jobs=-1),
        'cnn': CondensedNearestNeighbour(n_jobs=-1),
        # outlier cleaning
        'inlierForest': FunctionSampler(
            outlier_rejection, kw_args={'method': 'IsolationForest'}),
        'inlierLocal': FunctionSampler(
            outlier_rejection, kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip': FunctionSampler(
            outlier_rejection, kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm': FunctionSampler(
            outlier_rejection, kw_args={'method': 'OneClassSVM'}),
        # combined over- and under-sampling
        'smoteenn': SMOTEENN(),
        'smotelink': SMOTETomek(),
    }
    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'qauntile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm
        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe': SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog': SelectFromModel(
            LogisticRegressionCV(penalty='l1', solver='saga',
                                 scoring='roc_auc')),
        'fsgd': SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm': SelectFromModel(LinearSVC('l1', dual=False, C=1e-2)),
        'fxgb': SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf': SelectFromModel(ExtraTreesClassifier(n_estimators=100,
                                                    max_depth=5)),
        'fRFExgb': RFE(XGBClassifier(n_jobs=-1), step=0.1,
                       n_features_to_select=20),
        'fRFErf': RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                      step=0.3, n_features_to_select=20),
        'fRFElog': RFE(LogisticRegressionCV(penalty='l1', solver='saga',
                                            scoring='roc_auc'),
                       step=0.3, n_features_to_select=20)
    }
    # univariate feature selection
    feature_u = {
        'fchi2': GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf': GenericUnivariateSelect(mutual_info_classif,
                                              'percentile', 25),
        'fFclf': GenericUnivariateSelect(f_classif, 'percentile', 25),
    }
    # sklearn estimators
    t = all_estimators(type_filter=['classifier'])
    estimator = {}
    for i in t:
        try:
            estimator.update({i[0]: i[1]()})
        except Exception:
            continue
    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        l = pipe.split('_')
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator,
                             **resample)
        steps = []
        for i in l:
            if all_keys_dict.get(i) is not None:
                steps.append((i, all_keys_dict.get(i)))
            else:
                raise KeyError(
                    "'{}' is not a valid key for sklearn estimators".format(i))
        return Pipeline(steps)
    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")
# eec = EasyEnsembleClassifier(n_estimators=10,
#                              base_estimator=base_estimator,
#                              n_jobs=-1)
# eec.fit(X_train_seek, y_train_seek)
# y_pred_eec = eec.predict(X_test_seek)
# print('Easy ensemble classifier performance:')
# print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
#       .format(balanced_accuracy_score(y_test_seek, y_pred_eec),
#               geometric_mean_score(y_test_seek, y_pred_eec)))
# cm_eec = confusion_matrix(y_test_seek, y_pred_eec)
# fig, ax = plt.subplots(ncols=2)
# plot_confusion_matrix(cm_eec, classes=np.unique(dataset.target), ax=ax[0],
#                       title='Easy ensemble classifier')

base_estimator = AdaBoostClassifier(n_estimators=10)
rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rusboost),
    geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rusboost, classes=np.unique(dataset.target),
                      ax=ax[1], title='RUSBoost classifier')

rusboost.fit(X_train_seek, y_train_seek)
        exLpred.append(float(lineE[j]))
        # cellTypesTrue.append(lineE[int(len(lineE)) - 1])
    exMpred.append(exLpred)
    exLpred = []
    cellID.append(lineE[0])

# cellTypesTrue = np.array(cellTypesTrue)
exMpred = np.array(exMpred)
cellID = np.array(cellID)

###################################
##### Everything is ready for cell type prediction #####
rusboost = RUSBoostClassifier(random_state=0)
rusboost.fit(exMtrain, cellTypesTrain)

##### Cell type prediction #####
cellTypesPred = rusboost.predict(exMpred)
# accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred)
# print(accuracy_score)
# classification_report(cellTypesTrue, cellTypesPred)

##### Checking performance #####
# confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred)
cellTypesProbs = rusboost.predict_proba(exMpred)
# print(confusionMatrix)

##### Merging the cell types and probability score #####
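# The snippet ends at the merging step. A minimal sketch of one way to merge
# cell IDs, predicted types and the top class probability (the output format
# is an assumption, not from the original):
import numpy as np

maxProbs = cellTypesProbs.max(axis=1)  # highest class probability per cell
merged = np.column_stack((cellID, cellTypesPred, maxProbs))
for row in merged:
    print('\t'.join(str(v) for v in row))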