def test_little_tree_with_small_max_samples():
    rng = np.random.RandomState(1)

    X = rng.randn(10000, 2)
    y = rng.randn(10000) > 0

    # First fit with no restriction on max samples
    est1 = BalancedRandomForestClassifier(
        n_estimators=1,
        random_state=rng,
        max_samples=None,
    )

    # Second fit with max samples restricted to just 2
    est2 = BalancedRandomForestClassifier(
        n_estimators=1,
        random_state=rng,
        max_samples=2,
    )

    est1.fit(X, y)
    est2.fit(X, y)

    tree1 = est1.estimators_[0].tree_
    tree2 = est2.estimators_[0].tree_

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert tree1.node_count > tree2.node_count, msg
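For intuition, a minimal standalone sketch (the toy data below is an assumption, not part of the test above): max_samples caps the bootstrap sample drawn for each tree, so a tiny value forces a tiny tree.

import numpy as np
from imblearn.ensemble import BalancedRandomForestClassifier

rng = np.random.RandomState(0)
X, y = rng.randn(1000, 2), rng.randn(1000) > 0

# With max_samples=2 each tree sees only two resampled points, so it can
# hold at most one split, i.e. three nodes.
tiny = BalancedRandomForestClassifier(n_estimators=1, max_samples=2,
                                      random_state=0)
tiny.fit(X, y)
print(tiny.estimators_[0].tree_.node_count)  # <= 3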
Example 2
def do_CV_Voting(LS, cv=10):
    nBits = 1250
    with measure_time("Creating fingerprint"):
        X_LS = create_fingerprints(LS["SMILES"].values, nBits=nBits)
        # drop duplicate rows
        data = pd.DataFrame(X_LS)
        data = data.drop_duplicates()
        X_LS = data.values

    # Drop the corresponding duplicate rows from y_LS as well
    y_LS = LS["ACTIVE"].loc[data.index].values
    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)
    pipeline_1 = make_pipeline(ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
                               BalancedRandomForestClassifier(n_estimators=600, random_state=18, n_jobs=-1))
    pipeline_2 = make_pipeline(ADASYN(random_state=64, n_jobs=-1),
                               BalancedRandomForestClassifier(n_estimators=600, random_state=24, n_jobs=-1))
    BRF = BalancedRandomForestClassifier(n_estimators=100, random_state=18, n_jobs=-1)
    BGC = make_pipeline(ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
                        BalancedBaggingClassifier(estimator=DecisionTreeClassifier(max_features="log2"), n_estimators=50))
    votingModel = VotingClassifier(
        estimators=[('pip1', pipeline_1), ('pip2', pipeline_2),
                    ('BRF', BRF), ('BGC', BGC)],
        voting='soft', weights=[3, 1, 1, 1], n_jobs=-1)
    scores = cross_validate(votingModel, X_train, y_train, cv=cv, scoring=(
        'roc_auc', 'average_precision'), return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())
    model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
def test_balanced_random_forest_oob(imbalanced_dataset):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y)
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True,
                                         random_state=0,
                                         n_estimators=1,
                                         bootstrap=True)
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
Example 4
def get_classifier(n_subj, random_state, n_jobs_rf=1, multiclass=False):
    if multiclass:
        # the 0.9 factor is needed both to make the subject count agree with the
        # training set and because one of the classes has so few subjects that
        # we can't reasonably sample more than 100 of them
        subsample_size = round(n_subj * 0.9 * 0.5 / 4)
        estimator = BalancedRandomForestClassifier(n_estimators=1000,
                                                   class_weight='balanced',
                                                   oob_score=False,
                                                   sampling_strategy={
                                                       0: subsample_size,
                                                       1: subsample_size,
                                                       2: subsample_size,
                                                       3: subsample_size
                                                   },
                                                   n_jobs=n_jobs_rf,
                                                   random_state=random_state,
                                                   bootstrap=False,
                                                   replacement=False)
    else:
        subsample_size = round(n_subj * 0.632 / 2)
        estimator = BalancedRandomForestClassifier(n_estimators=1000,
                                                   class_weight='balanced',
                                                   oob_score=False,
                                                   sampling_strategy={
                                                       0: subsample_size,
                                                       1: subsample_size
                                                   },
                                                   n_jobs=n_jobs_rf,
                                                   random_state=random_state,
                                                   bootstrap=False,
                                                   replacement=False)
    return estimator
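A hedged usage sketch for get_classifier (the dataset and argument values below are illustrative assumptions, not from the source):

from sklearn.datasets import make_classification

# Four-class toy problem standing in for the real subject data.
X, y = make_classification(n_samples=400, n_classes=4, n_informative=6,
                           random_state=0)
clf = get_classifier(n_subj=400, random_state=0, multiclass=True)
clf.fit(X, y)
print(clf.predict(X[:5]))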
Example 5
    def make_model(self, config=None):
        """
        :param config : model parameters
        :return: self.model
        """

        if config is not None:
            self.config = config

        print('Creating fresh model...')

        if self.class_ == 'RF':
            if self.type_ == 'reg':
                if self.balanced == 'balanced':
                    print('WARNING: balanced regressor not applicable')
                    self.model = RandomForestRegressor(
                        **config) if config is not None else RandomForestRegressor(
                            random_state=self.seed)
                elif self.balanced is None:
                    self.model = RandomForestRegressor(
                        **config) if config is not None else RandomForestRegressor(
                            random_state=self.seed)
            elif self.type_ == 'cls':
                if self.balanced == 'balanced':
                    self.model = BalancedRandomForestClassifier(
                        **config
                    ) if config is not None else BalancedRandomForestClassifier(
                        random_state=self.seed)
                elif self.balanced is None:
                    self.model = RandomForestClassifier(
                        **config) if config is not None else RandomForestClassifier(
                            random_state=self.seed)
        elif self.class_ == 'lin':
            if self.type_ == 'reg':
                if self.balanced == 'balanced':
                    print('WARNING: balanced regressor not applicable')
                    self.model = LinearRegression(
                        **config) if config is not None else LinearRegression()
                elif self.balanced is None:
                    self.model = LinearRegression(
                        **config) if config is not None else LinearRegression()
            elif self.type_ == 'cls':
                if self.balanced == 'balanced':
                    self.model = LogisticRegression(
                        **config) if config is not None else LogisticRegression()
                    self.model.class_weight = self.balanced
                elif self.balanced is None:
                    self.model = LogisticRegression(
                        **config) if config is not None else LogisticRegression()
                    self.model.class_weight = None
        elif self.class_ == 'svm':
            assert self.type_ == 'cls', (
                'If using SVM, make sure you have a classification problem, '
                'i.e. set type_="cls"')
            self.model = SVC(**config) if config is not None else SVC(kernel='rbf')

        print('Created: ', self.model)
        return self.model
def test_balanced_random_forest_pruning(imbalanced_dataset):
    brf = BalancedRandomForestClassifier()
    brf.fit(*imbalanced_dataset)
    n_nodes_no_pruning = brf.estimators_[0].tree_.node_count

    brf_pruned = BalancedRandomForestClassifier(ccp_alpha=0.015)
    brf_pruned.fit(*imbalanced_dataset)
    n_nodes_pruning = brf_pruned.estimators_[0].tree_.node_count

    assert n_nodes_no_pruning > n_nodes_pruning
def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
    X_train, X_test, Y_train, Y_test = cls.get_X_Y_split(
        preprocessed_df, "has_claim"
    )
    model = BalancedRandomForestClassifier()
    model.fit(X_train, Y_train)
    return model
    def random_forest(df, drop, target, show, model_name):

        # split the table into features and outcomes
        x_cols = [i for i in df.columns if i not in drop]
        X = df[x_cols]
        y = df[target]

        # split features and outcomes into train and test data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=1)
        brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        brf.fit(X_train, y_train)
        y_predictions = brf.predict(X_test)

        feature_importance = sorted(
            zip(brf.feature_importances_, X.columns.tolist()))[::-1]

        # Calculating the accuracy score.
        acc_score = balanced_accuracy_score(y_test, y_predictions)

        # Displaying results
        if show:
            print(f"Feature Importance: {model_name}")
            for i in feature_importance:
                print(i)
            print("\n")

        return acc_score * 100
def main():
    """ Main entrance."""
    print('Spliting challenges')
    split_challenges()
    print('Reading X...')
    X = pd.concat([pd.read_json(XY_PATH['X'].format(i), orient='records') for i in range(1, 163)]).set_index(['l0', 'l1'])
    print('Reading y...')
    y = pd.concat([pd.read_json(XY_PATH['y'].format(i), orient='records') for i in range(1, 163)]).set_index(['l0', 'l1'])

    print('\nTraining Inner sampler RFC')
    for i in range(10):
        print(f'Training 10-Fold CV #{i}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, i)

        balanced_rfc = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        pd.DataFrame(balanced_rfc.predict_proba(X_test.to_numpy()), index=y_test.index).reset_index().to_json(os.path.join(RESULT_PATH, 'brf', f'y_prob_{i}.json'), orient='records')
        pd.Series(balanced_rfc.feature_importances_).to_json(os.path.join(RESULT_PATH, 'brf', f'feature_importance_{i}.json'))

    print('\nTraining RandomUnderSampler')
    for i in range(10):
        print(f'Training 10-Fold CV #{i}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, i)

        rfc = RandomForestClassifier(n_estimators=100, random_state=0)
        rus = RandomUnderSampler(random_state=0)

        X_resample, y_resample = rus.fit_resample(X_train.to_numpy(), y_train.to_numpy().ravel())
        rfc.fit(X_resample, y_resample)

        pd.DataFrame(rfc.predict_proba(X_test.to_numpy()), index=y_test.index).reset_index().to_json(os.path.join(RESULT_PATH, 'rus', f'y_prob_{i}.json'), orient='records')
        pd.Series(rfc.feature_importances_).to_json(os.path.join(RESULT_PATH, 'rus', f'feature_importance_{i}.json'))
def get_balanced_models():
    models = list()
    #LR
    models.append(
        ('LR_Bal', LogisticRegression(solver='lbfgs',
                                      class_weight='balanced')))
    # LDA
    models.append(('LDA', LinearDiscriminantAnalysis()))
    #KNN
    models.append(('KNN', KNeighborsClassifier()))
    #NB
    models.append(('NB', GaussianNB()))
    #MNB
    #models.append(('MNB', MultinomialNB()))
    #GPC
    #models.append(('GPC', GaussianProcessClassifier()))
    # NOTE: relies on module-level X and weights being defined
    if X.shape[0] < 100000:
        #SVM Balanced
        models.append(('SVM_Bal', SVC(gamma='scale', class_weight='balanced')))
        #SVM Weight
        models.append(('SVM_W', SVC(gamma='scale', class_weight=weights)))
    #Balanced RF
    models.append(
        ('Bal_RF', BalancedRandomForestClassifier(n_estimators=1000)))
    #RF
    models.append(('RF_Bal',
                   RandomForestClassifier(n_estimators=1000,
                                          class_weight='balanced')))
    #DT
    models.append(('DT_Bal', DecisionTreeClassifier(class_weight='balanced')))
    #Bag
    models.append(('BAG', BaggingClassifier(n_estimators=1000)))
    #XGB
    models.append(('XGB_W', XGBClassifier(scale_pos_weight=weights)))
    return models
Example 11
	def __init__(self, iterations=1, transform_first=False, untrained_model=BalancedRandomForestClassifier(random_state=42,n_jobs=40), max_train_test_samples=100, mode_interaction_extract='knee', include_self_interactions=False, penalty=3, pelt_model='l2', no_changepoint_strategy='median'):
		"""https://github.com/ModelOriented/SAFE/blob/master/SafeTransformer/SafeTransformer.py"""
		steps=[]
		for i in range(iterations):
			steps.extend([['interaction{}'.format(i),InteractionTransformer(copy.deepcopy(untrained_model), max_train_test_samples, mode_interaction_extract, include_self_interactions)],
						  ['transformer{}'.format(i),SafeTransformer(penalty=penalty, model=copy.deepcopy(untrained_model), pelt_model=pelt_model, no_changepoint_strategy=no_changepoint_strategy)]])
		self.pipeline=Pipeline(steps)
def fourth_test(X_train, y_train, X_test, y_test):
    print("Test with BalancedRandomForestClassifier or BalancedBaggingClassifier\n")

    print("BalancedRandomForestClassifier")
    scores = cross_validate(BalancedRandomForestClassifier(max_depth=None, n_estimators=500, random_state=0, n_jobs=2, max_features='log2', oob_score=False), X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'), return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())
    log_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_log_pred = log_model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_log_pred)
    print("confusion_matrix:\n", conf_mat)
    print()

    print("BalancedBaggingClassifier")
    # note: max_features='auto' (equivalent to 'sqrt' here) and the
    # base_estimator argument were renamed/removed in recent releases
    tree = DecisionTreeClassifier(max_features='sqrt')
    resample_bagging = BalancedBaggingClassifier(
        estimator=tree, n_estimators=100, random_state=0, n_jobs=2, oob_score=True)
    scores = cross_validate(resample_bagging, X_train, y_train, cv=10, scoring=(
        'roc_auc', 'average_precision'), return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())
    rf_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_rf_pred = rf_model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_rf_pred)
    print("confusion_matrix:\n", conf_mat)

    """
Esempio n. 13
0
def test_balanced_random_forest_attributes(imbalanced_dataset):
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0
    )
    brf.fit(X, y)

    for idx in range(n_estimators):
        X_res, y_res = brf.samplers_[idx].fit_resample(X, y)
        X_res_2, y_res_2 = (
            brf.pipelines_[idx]
            .named_steps["randomundersampler"]
            .fit_resample(X, y)
        )
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        y_pred = brf.estimators_[idx].fit(X_res, y_res).predict(X)
        y_pred_2 = brf.pipelines_[idx].fit(X, y).predict(X)
        assert_array_equal(y_pred, y_pred_2)

        y_pred = brf.estimators_[idx].fit(X_res, y_res).predict_proba(X)
        y_pred_2 = brf.pipelines_[idx].fit(X, y).predict_proba(X)
        assert_array_equal(y_pred, y_pred_2)
def apply_ml_model(X_train_input, y_train_input, X_test_input, y_test_input):
    models = ['LREG','RFC','Tree','Balanced RFC']
    scores = []
    # Specify the target classes
    classes = ["No re-admission","Re-admission in < 30 days"]
    for model in models:
        if model == 'LREG':
            model_select = LogisticRegression(solver='lbfgs', max_iter=500, random_state=78)
        elif model == 'RFC':
            model_select = RandomForestClassifier(n_estimators= 128, random_state=78)
        elif model == 'Tree':
            model_select = tree.DecisionTreeClassifier(random_state=78)
        elif model == 'Balanced RFC':
            model_select = BalancedRandomForestClassifier(n_estimators=128, random_state=78)
        model_select.fit(X_train_input, y_train_input)
        y_pred = model_select.predict(X_test_input)
        # Create a DataFrame from the confusion matrix.
        cm = confusion_matrix(y_test_input, y_pred)
        # Calculating the accuracy score.
        acc_score = balanced_accuracy_score(y_test_input, y_pred)
        scores.append(acc_score)
        print(f"Model: {model}")
        # Displaying results
        print("Confusion Matrix")
        cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
        print(cm_df)
        print(f"Accuracy Score : {acc_score}\n")
        print("Classification Report")
        print(classification_report_imbalanced(y_test_input, y_pred))
Example 15
def test_balanced_random_forest_oob(imbalanced_dataset):
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
Example 16
def do_CV_grid(LS, cv=10):
    nBits = 1250
    with measure_time("Creating fingerprint"):
        X_LS = create_fingerprints(LS["SMILES"].values, nBits=nBits)
        # drop duplicate rows
        data = pd.DataFrame(X_LS)
        data = data.drop_duplicates()
        X_LS = data.values
    # Drop the corresponding duplicate rows from y_LS as well
    y_LS = LS["ACTIVE"].loc[data.index].values
    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)
    pipeline = Pipeline([('ada', ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1)),
                         ('BRF', BalancedRandomForestClassifier(n_estimators=500, random_state=18, n_jobs=-1, bootstrap=False))])
    param = {}
    param['BRF__n_estimators'] = [500]
    param['BRF__max_features'] = [None, 'log2']
    #param['BRF__criterion'] = ['gini', 'entropy']

    clf = GridSearchCV(pipeline, param, scoring='roc_auc', n_jobs=2, cv=10)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    print()

    y_pred = clf.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
    print("Classification report")
    print(classification_report(y_true=y_test, y_pred=y_pred))
Example 17
        def objective(trial):

            train_X, val_X, train_y, val_y = train_test_split(self.X,
                                                              self.y,
                                                              test_size=0.2)
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            v_train_X = median_imputer.fit_transform(train_X)
            # transform only: the imputer is fitted on the training split to avoid leakage
            v_val_X = median_imputer.transform(val_X)
            train_X = pd.DataFrame(v_train_X,
                                   columns=train_X.columns,
                                   index=train_X.index)
            val_X = pd.DataFrame(v_val_X,
                                 columns=val_X.columns,
                                 index=val_X.index)

            # reuse the train-fitted imputer here as well: transform only, no refit
            v_test_X = median_imputer.transform(self.X_validation)
            test_X = pd.DataFrame(v_test_X,
                                  columns=self.X_validation.columns,
                                  index=self.X_validation.index)

            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            brf_n_estimators = trial.suggest_categorical(
                'n_estimators', list_trees)
            brf_max_features = trial.suggest_float('max_features', 0.15, 1.0)
            brf_min_samples_split = trial.suggest_int('min_samples_split', 2,
                                                      16)
            brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            brf_min_weight_fraction_leaf = trial.suggest_float(
                'min_weight_fraction_leaf', 0, 0.5)
            brf_max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=brf_n_estimators,
                max_features=brf_max_features,
                min_samples_split=brf_min_samples_split,
                min_samples_leaf=brf_min_samples_leaf,
                max_depth=brf_max_depth,
                min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
                bootstrap=True)

            brfmodel.fit(train_X, train_y)

            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
            aucbrf_test = roc_auc_score(self.y_validation,
                                        brfmodel.predict_proba(test_X)[:, 1])
            print('Accuracy test ' + str(
                accuracy_score(self.y_validation, brfmodel.predict(test_X))))

            plt.figure()
            plot_confusion_matrix(brfmodel,
                                  test_X,
                                  self.y_validation,
                                  cmap=plt.cm.Blues,
                                  normalize=None)
            plt.show()
            print(aucbrf_test)

            return aucbrf
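The nested objective above is an Optuna objective; a sketch of how it would typically be driven, assuming objective is reachable as a plain callable (the trial count is arbitrary):

import optuna

study = optuna.create_study(direction='maximize')  # maximize validation AUC
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)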
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    brf = BalancedRandomForestClassifier()
    grid = GridSearchCV(brf, {
        "n_estimators": (1, 2),
        "max_depth": (1, 2)
    },
                        cv=3)
    grid.fit(*imbalanced_dataset)
Example 19
def test_balanced_random_forest(imbalanced_dataset):
    n_estimators = 10
    brf = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=0)
    brf.fit(*imbalanced_dataset)

    assert len(brf.samplers_) == n_estimators
    assert len(brf.estimators_) == n_estimators
    assert len(brf.pipelines_) == n_estimators
    assert len(brf.feature_importances_) == imbalanced_dataset[0].shape[1]
Example 20
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    brf = BalancedRandomForestClassifier()
    grid = GridSearchCV(brf, {
        'n_estimators': (1, 2),
        'max_depth': (1, 2)
    },
                        cv=3)  # the `iid` parameter was removed in scikit-learn 0.24
    grid.fit(*imbalanced_dataset)
Example 21
def evaluate(X_train, y_train, X_test, y_test):
    global seed
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf = clf.fit(X_train, y_train)
    # argsort each row of predicted probabilities: the last column is the most
    # likely class index and the second-to-last the runner-up (these are indices
    # into clf.classes_, which match the labels only if they are 0..n-1)
    y_pred = clf.predict_proba(X_test).argsort(axis=1)
    y_pred1 = y_pred[:, -1]
    y_pred2 = y_pred[:, -2]
    return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix(
        y_test, y_pred2)
Example 22
    def model_checking(self):

        X = self.df[self.features]
        Y = self.df[self.target]

        pipelines = [
            Pipeline(steps=[('classifier',
                             BalancedRandomForestClassifier(
                                 n_estimators=200))]),
            Pipeline(steps=[
                # ('rfe', RFE(XGBClassifier(), )),
                ('classifier', BalancedBaggingClassifier(n_estimators=200))
            ]),
            Pipeline(steps=[('rfe', SMOTE()),
                            ('classifier',
                             XGBClassifier(n_estimators=1000, reg_alpha=1))]),
            Pipeline(steps=[('rfe', BorderlineSMOTE()),
                            ('classifier',
                             XGBClassifier(n_estimators=1000, reg_alpha=1))]),
            Pipeline(steps=[
                # ('rfe', RFE(XGBClassifier(), )),
                ('classifier',
                 XGBClassifier(
                     n_estimators=1000, scale_pos_weight=3, reg_alpha=1))
            ]),
            Pipeline(
                steps=[('rfe', RFE(XGBClassifier())),
                       ('classifier',
                        XGBClassifier(
                            n_estimators=1000, scale_pos_weight=3, reg_alpha=1)
                        )])
        ]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.25,
                                                            stratify=Y)

        for pipe in pipelines:
            scores = cross_val_score(pipe,
                                     X_train.values,
                                     y_train,
                                     scoring='precision',
                                     cv=StratifiedKFold(5))
            print("cross val scores")
            print(sum(scores) / 5)
            pipe.fit(X_train.values, y_train.values)
            y_pred = pipe.predict(X_test.values)

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            print("test scores")
            print(
                f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}"
            )
Example 23
    def evaluate_model(self):

        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)

            for data in dati:

                del data['value']

                rf_model = BalancedRandomForestClassifier(**data)

                rf_auc = []

                for i in tqdm(range(20)):

                    cv = StratifiedKFold(n_splits=5,
                                         shuffle=True,
                                         random_state=i + 187462)

                    for train_index, test_index in cv.split(self.X, self.y):

                        trainX = self.X.iloc[train_index]
                        testX = self.X.iloc[test_index]

                        trainy = np.take(self.y, train_index)
                        testy = np.take(self.y, test_index)

                        median_imputer = SimpleImputer(missing_values=np.nan,
                                                       strategy='median')
                        imputer = median_imputer.fit(trainX)
                        vtrainX = imputer.transform(trainX)

                        # reuse the train-fitted imputer on the test fold to avoid leakage
                        vtestX = imputer.transform(testX)
                        trainX = pd.DataFrame(vtrainX,
                                              columns=trainX.columns,
                                              index=trainX.index)
                        testX = pd.DataFrame(vtestX,
                                             columns=testX.columns,
                                             index=testX.index)

                        # Compute AUC for the best results from CatBoost

                        rf_model.fit(trainX, trainy)
                        roc_rf = roc_auc_score(
                            testy,
                            rf_model.predict_proba(testX)[:, 1])
                        rf_auc.append(roc_rf)

                        print(roc_rf)

            print(statistics.mean(rf_auc))
        return rf_auc
Example 24
    def find_clf_parameters(self, train_x, train_y, clf_type):
        max_depth = [2, 4, 6]
        min_samples_leaf = np.arange(1, 4)
        min_samples_split = np.arange(2, 5)
        n_estimators = [100, 300]
        criterion = ['gini', 'entropy']
        sampling_strategy = ['auto', 'majority', 'not majority']

        models1 = {
            'BalancedRandomForestClassifier':
                BalancedRandomForestClassifier(random_state=42),
            'EasyEnsembleClassifier':
                EasyEnsembleClassifier(random_state=42),
        }

        params1 = {
            'BalancedRandomForestClassifier': [{
                'criterion': criterion,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'min_samples_split': min_samples_split,
                'sampling_strategy': sampling_strategy,
            }],
            'EasyEnsembleClassifier': [{
                'n_estimators': n_estimators,
                'sampling_strategy': sampling_strategy,
            }],
        }

        helper1 = EstimatorSelectionHelper(models1, params1)
        # binary and multi-class currently use identical CV settings
        if clf_type in ('binary', 'multi-class'):
            helper1.fit(train_x,
                        train_y,
                        cv=5,
                        scoring='balanced_accuracy',
                        n_jobs=-1)
        df = helper1.score_summary()
        best_estimator = df['estimator'].iloc[0]
        best_estimator_params = df['params'].iloc[0]

        return best_estimator, best_estimator_params
Example 25
def _plot_championship_importance(all_res, save_directory, top=6):
    
    save_file = save_directory + 'championship_importance.png'
    
    if os.path.exists(save_file):
        return
    
    xs = []
    ys = []
    teams = []

    for season in all_res:

        team_df = all_res[season][0]
        team_stats = all_res[season][1]
        champion = all_res[season][2]

        for team, g in team_df.groupby('TEAM'):
            x = g.nlargest(top, 'TIME')[['off_norm', 'def_norm']].unstack().values
            y = 1 if team in champion else 0

            xs.append(x)
            ys.append(y)
            teams.append(team + '_' + season)

    xs = np.vstack(xs)
    ys = np.array(ys)

    fts = []
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):

        for i in np.where(ys==1)[0]:

            xs_temp = xs[[x for x in range(len(xs)) if x != i]]
            ys_temp = ys[[y for y in range(len(xs)) if y != i]]

            rfr = BalancedRandomForestClassifier(n_estimators=ntree)
            rfr.fit(xs_temp, ys_temp)
            ft = rfr.feature_importances_
            fts.append(ft)
            
    fts = np.vstack(fts)
    
    feature_names = ['off' + str(i+1) for i in range(top)] + ['def' + str(i+1) for i in range(top)]
    
    fig, ax = plt.subplots(figsize=(8,6))
    for i in range(len(feature_names)):
        ax.boxplot(fts[:, i], positions=[i])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')
    
    plt.savefig(save_file)
    plt.close()
def test_balanced_random_forest_oob_binomial(ratio):
    # Regression test for #655: check that the oob score is close to 0.5 in
    # a binomial experiment (`ratio` comes from pytest parametrization).
    rng = np.random.RandomState(42)
    n_samples = 1000
    X = np.arange(n_samples).reshape(-1, 1)
    y = rng.binomial(1, ratio, size=n_samples)

    erf = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    erf.fit(X, y)
    assert np.abs(erf.oob_score_ - 0.5) < 0.1
Example 27
def do_sampling_research(X_train, y_train, X_test, y_test, nBits=124, info_features=False):

    print("First tests:")
    # Without sampling
    if info_features:
        get_info_features(y_train, pd.DataFrame(
            {'ACTIVE': y_train}), save="Count_before_{}.pdf".format(nBits))
    pipeline = make_pipeline(ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
                             BalancedRandomForestClassifier(n_estimators=100, random_state=18, n_jobs=-1))
    display_confusion_matrix(pipeline, X_train, y_train, X_test, y_test,
                             save="confusion_matrix_before_{}.pdf".format(nBits), title="Confusion Matrix before sampling with {} nBits".format(nBits))

    print("\nSecond test:")
    # With sampling
    if info_features:
        get_info_features(y_train, pd.DataFrame(
            {'ACTIVE': y_train}), save="Count_after_{}.pdf".format(nBits))
    pipeline = make_pipeline(ADASYN(sampling_strategy=0.28, random_state=64, n_jobs=-1),
                             BalancedRandomForestClassifier(n_estimators=50, random_state=18, n_jobs=-1))
    display_confusion_matrix(pipeline, X_train, y_train, X_test, y_test,
                             save="confusion_matrix_after_{}.pdf".format(nBits), title="Confusion Matrix after sampling with {} nBits".format(nBits))
Example 28
    def evaluate_on_validation_or_test(self, test=False):

        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)
            for data in dati:

                del data['value']

                rf_model = BalancedRandomForestClassifier(**data)

                trainX = self.X
                trainy = self.y
                valx = self.X_validation
                valy = self.y_validation
                if test:
                    testx = self.X_test
                    testy = self.y_test

                median_imputer = SimpleImputer(missing_values=np.nan,
                                               strategy='median')
                imputer = median_imputer.fit(trainX)
                vtrainX = imputer.transform(trainX)
                trainX = pd.DataFrame(vtrainX,
                                      columns=trainX.columns,
                                      index=trainX.index)

                vvalX = imputer.transform(valx)
                valx = pd.DataFrame(vvalX,
                                    columns=valx.columns,
                                    index=valx.index)

                if test:
                    vtest = imputer.transform(testx)
                    testx = pd.DataFrame(vtest,
                                         columns=testx.columns,
                                         index=testx.index)
                    trainX = pd.concat([trainX, valx])
                    trainy = np.concatenate((trainy, valy))

                rf_model.fit(trainX, trainy)

                if test:
                    roc_rf = roc_auc_score(testy,
                                           rf_model.predict_proba(testx)[:, 1])
                else:
                    roc_rf = roc_auc_score(valy,
                                           rf_model.predict_proba(valx)[:, 1])

                if not test:
                    print("Validation AUC: {}".format(roc_rf))
                else:
                    print("Test AUC: {}".format(roc_rf))
    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        estimator_scores = {}

        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        predicted_prob = clf.predict_proba(test_x)
        predicted_prob_true = [p[1] for p in predicted_prob]

        estimator_scores['Question'] = question
        estimator_scores['Accuracy'] = round(
            accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Balanced Accuracy'] = round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Precision'] = round(
            precision_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Recall'] = round(
            recall_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Specificity'] = specificity
        estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
        estimator_scores['ROC AUC'] = round(
            roc_auc_score(test_y, predicted_prob_true), 2)

        # print('Perfect Confusion Matrix for Q-%s is: ' % (str(question).zfill(2)))
        # perfect_labels = train_y
        # print(confusion_matrix(train_y, perfect_labels))

        return cross_val_scores, estimator_scores
def get_classifiers():
    classifiers = [
        DummyClassifier(), LogisticRegression(), PassiveAggressiveClassifier(),
        RidgeClassifier(), SGDClassifier(), KNeighborsClassifier(),
        MLPClassifier(), LinearSVC(), NuSVC(), SVC(),
        DecisionTreeClassifier(), ExtraTreeClassifier(), AdaBoostClassifier(),
        BaggingClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(),
        RandomForestClassifier(), GaussianProcessClassifier(),
        EasyEnsembleClassifier(), BalancedBaggingClassifier(),
        BalancedRandomForestClassifier(), XGBClassifier(),
    ]
    return classifiers
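A hedged smoke-test loop over the list above (toy imbalanced data; some classifiers may be slow or score poorly without tuning, and failed fits show up as NaN):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
for clf in get_classifiers():
    # error_score=np.nan (the default) turns failed fits into NaN scores
    score = cross_val_score(clf, X, y, cv=3,
                            scoring='balanced_accuracy').mean()
    print(type(clf).__name__, round(score, 3))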