Example #1
        def objective(trial):

            train_X, val_X, train_y, val_y = train_test_split(self.X,
                                                              self.y,
                                                              test_size=0.2)
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            v_train_X = median_imputer.fit_transform(train_X)
            # Reuse the imputer fitted on the training split for the validation split
            v_val_X = median_imputer.transform(val_X)
            train_X = pd.DataFrame(v_train_X,
                                   columns=train_X.columns,
                                   index=train_X.index)
            val_X = pd.DataFrame(v_val_X,
                                 columns=val_X.columns,
                                 index=val_X.index)

            v_test_X = median_imputer.transform(self.X_validation)
            test_X = pd.DataFrame(v_test_X,
                                  columns=self.X_validation.columns,
                                  index=self.X_validation.index)

            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            brf_n_estimators = trial.suggest_categorical(
                'n_estimators', list_trees)
            brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            brf_min_samples_split = trial.suggest_int('min_samples_split', 2,
                                                      16)
            brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            brf_min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            brf_max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=brf_n_estimators,
                max_features=brf_max_features,
                min_samples_split=brf_min_samples_split,
                min_samples_leaf=brf_min_samples_leaf,
                max_depth=brf_max_depth,
                min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
                bootstrap=True)

            brfmodel.fit(train_X, train_y)

            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
            aucbrf_test = roc_auc_score(self.y_validation,
                                        brfmodel.predict_proba(test_X)[:, 1])
            print('Accuracy test ' + str(
                accuracy_score(self.y_validation, brfmodel.predict(test_X))))

            plt.figure()
            plot_confusion_matrix(brfmodel,
                                  test_X,
                                  self.y_validation,
                                  cmap=plt.cm.Blues,
                                  normalize=None)
            plt.show()
            print(aucbrf_test)

            return aucbrf
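
For context, a minimal sketch of how such an objective is usually driven with Optuna; the study setup and trial count below are assumptions, not part of the original snippet:

import optuna

# Hypothetical driver: maximize the validation AUC returned by objective()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_params)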
Example #2
def test_balanced_random_forest_attributes(imbalanced_dataset):
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0
    )
    brf.fit(X, y)

    for idx in range(n_estimators):
        X_res, y_res = brf.samplers_[idx].fit_resample(X, y)
        X_res_2, y_res_2 = (
            brf.pipelines_[idx]
            .named_steps["randomundersampler"]
            .fit_resample(X, y)
        )
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        y_pred = brf.estimators_[idx].fit(X_res, y_res).predict(X)
        y_pred_2 = brf.pipelines_[idx].fit(X, y).predict(X)
        assert_array_equal(y_pred, y_pred_2)

        y_pred = brf.estimators_[idx].fit(X_res, y_res).predict_proba(X)
        y_pred_2 = brf.pipelines_[idx].fit(X, y).predict_proba(X)
        assert_array_equal(y_pred, y_pred_2)
Example #3
def test_little_tree_with_small_max_samples():
    rng = np.random.RandomState(1)

    X = rng.randn(10000, 2)
    y = rng.randn(10000) > 0

    # First fit with no restriction on max samples
    est1 = BalancedRandomForestClassifier(
        n_estimators=1,
        random_state=rng,
        max_samples=None,
    )

    # Second fit with max samples restricted to just 2
    est2 = BalancedRandomForestClassifier(
        n_estimators=1,
        random_state=rng,
        max_samples=2,
    )

    est1.fit(X, y)
    est2.fit(X, y)

    tree1 = est1.estimators_[0].tree_
    tree2 = est2.estimators_[0].tree_

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert tree1.node_count > tree2.node_count, msg
    def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
        X_train, X_test, Y_train, Y_test = cls.get_X_Y_split(
            preprocessed_df, "has_claim"
        )
        model = BalancedRandomForestClassifier()
        model.fit(X_train, Y_train)
        return model
Example #5
def test_balanced_random_forest_oob(imbalanced_dataset):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y)
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True,
                                         random_state=0,
                                         n_estimators=1,
                                         bootstrap=True)
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
def main():
    """ Main entrance."""
    print('Spliting challenges')
    split_challenges()
    print('Reading X...')
    X = pd.concat([pd.read_json(XY_PATH['X'].format(i), orient='records') for i in range(1, 163)]).set_index(['l0', 'l1'])
    print('Reading y...')
    y = pd.concat([pd.read_json(XY_PATH['y'].format(i), orient='records') for i in range(1, 163)]).set_index(['l0', 'l1'])

    print('\nTraining Inner sampler RFC')
    for i in range(10):
        print(f'Training 10-Fold CV #{i}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, i)

        balanced_rfc = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        pd.DataFrame(balanced_rfc.predict_proba(X_test.to_numpy()), index=y_test.index).reset_index().to_json(os.path.join(RESULT_PATH, 'brf', f'y_prob_{i}.json'), orient='records')
        pd.Series(balanced_rfc.feature_importances_).to_json(os.path.join(RESULT_PATH, 'brf', f'feature_importance_{i}.json'))

    print('\nTraining RandomUnderSampler')
    for i in range(10):
        print(f'Training 10-Fold CV #{i}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, i)

        rfc = RandomForestClassifier(n_estimators=100, random_state=0)
        rus = RandomUnderSampler(random_state=0)

        X_resample, y_resample = rus.fit_resample(X_train.to_numpy(), y_train.to_numpy().ravel())
        rfc.fit(X_resample, y_resample)

        pd.DataFrame(rfc.predict_proba(X_test.to_numpy()), index=y_test.index).reset_index().to_json(os.path.join(RESULT_PATH, 'rus', f'y_prob_{i}.json'), orient='records')
        pd.Series(rfc.feature_importances_).to_json(os.path.join(RESULT_PATH, 'rus', f'feature_importance_{i}.json'))
    def random_forest(df, drop, target, show, model_name):

        # split the table into features and outcomes
        x_cols = [i for i in df.columns if i not in drop]
        X = df[x_cols]
        y = df[target]

        # split features and outcomes into train and test data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=1)
        brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        brf.fit(X_train, y_train)
        y_predictions = brf.predict(X_test)

        feature_importance = sorted(
            zip(brf.feature_importances_, X.columns.tolist()))[::-1]

        # Calculate the balanced accuracy score.
        acc_score = balanced_accuracy_score(y_test, y_predictions)

        # Display results
        if show:
            print(f"Feature Importance: {model_name}")
            for i in feature_importance:
                print(i)
            print("\n")

        return acc_score * 100
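
A hypothetical call for illustration; the DataFrame name and column names below are assumptions:

score = random_forest(df,
                      drop=['id', 'outcome'],
                      target='outcome',
                      show=True,
                      model_name='Balanced RF')
print('Balanced accuracy: {:.1f}%'.format(score))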
Example #8
def predict_model_kfold(name,path,features_type,label_name,data):
    kfold = KFold(n_splits=10, shuffle=True)
    #RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000,max_depth=5)
    index = 0
    size = data.shape[0]
    all_predictions = 0
    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False,y.iloc[test],prediction_train,name,"training",features_type,num_of_bugs,num_of_all_instances,bug_precent,None)

    all_predictions /= index
    start_list = [name,"training",features_type,"sklearn - python"]
    result_list = start_list+ all_predictions.tolist()

    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list

    model.fit(x,y)
    return model
Example #9
def test_balanced_random_forest(imbalanced_dataset):
    n_estimators = 10
    brf = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=0)
    brf.fit(*imbalanced_dataset)

    assert len(brf.samplers_) == n_estimators
    assert len(brf.estimators_) == n_estimators
    assert len(brf.pipelines_) == n_estimators
    assert len(brf.feature_importances_) == imbalanced_dataset[0].shape[1]
Example #10
def test_balanced_random_forest_pruning(imbalanced_dataset):
    brf = BalancedRandomForestClassifier()
    brf.fit(*imbalanced_dataset)
    n_nodes_no_pruning = brf.estimators_[0].tree_.node_count

    brf_pruned = BalancedRandomForestClassifier(ccp_alpha=0.015)
    brf_pruned.fit(*imbalanced_dataset)
    n_nodes_pruning = brf_pruned.estimators_[0].tree_.node_count

    assert n_nodes_no_pruning > n_nodes_pruning
Example #11
def test_balanced_random_forest(imbalanced_dataset):
    n_estimators = 10
    brf = BalancedRandomForestClassifier(n_estimators=n_estimators,
                                         random_state=0)
    brf.fit(*imbalanced_dataset)

    assert len(brf.samplers_) == n_estimators
    assert len(brf.estimators_) == n_estimators
    assert len(brf.pipelines_) == n_estimators
    assert len(brf.feature_importances_) == imbalanced_dataset[0].shape[1]
Example #12
    def evaluate_model(self):

        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)

            for data in dati:

                del data['value']

                rf_model = BalancedRandomForestClassifier(**data)

                rf_auc = []

                for i in tqdm(range(20)):

                    cv = StratifiedKFold(n_splits=5,
                                         shuffle=True,
                                         random_state=i + 187462)

                    for train_index, test_index in cv.split(self.X, self.y):

                        trainX = self.X.iloc[train_index]
                        testX = self.X.iloc[test_index]

                        trainy = np.take(self.y, train_index)
                        testy = np.take(self.y, test_index)

                        median_imputer = SimpleImputer(missing_values=np.nan,
                                                       strategy='median')
                        imputer = median_imputer.fit(trainX)
                        vtrainX = imputer.transform(trainX)

                        # Impute the test fold with the imputer fitted on the training fold
                        vtestX = imputer.transform(testX)
                        trainX = pd.DataFrame(vtrainX,
                                              columns=trainX.columns,
                                              index=trainX.index)
                        testX = pd.DataFrame(vtestX,
                                             columns=testX.columns,
                                             index=testX.index)

                        # Compute AUC for the best results from CatBoost

                        rf_model.fit(trainX, trainy)
                        roc_rf = roc_auc_score(
                            testy,
                            rf_model.predict_proba(testX)[:, 1])
                        rf_auc.append(roc_rf)

                        print(roc_rf)

            print(statistics.mean(rf_auc))
        return rf_auc
Example #13
def _plot_championship_importance(all_res, save_directory, top=6):
    
    save_file = save_directory + 'championship_importance.png'
    
    if os.path.exists(save_file):
        return
    
    xs = []
    ys = []
    teams = []

    for season in all_res:

        team_df = all_res[season][0]
        team_stats = all_res[season][1]
        champion = all_res[season][2]

        for team, g in team_df.groupby('TEAM'):
            x = g.nlargest(top, 'TIME')[['off_norm', 'def_norm']].unstack().values
            y = 1 if team in champion else 0

            xs.append(x)
            ys.append(y)
            teams.append(team + '_' + season)

    xs = np.vstack(xs)
    ys = np.array(ys)

    fts = []
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):

        for i in np.where(ys==1)[0]:

            xs_temp = xs[[x for x in range(len(xs)) if x != i]]
            ys_temp = ys[[y for y in range(len(xs)) if y != i]]

            rfr = BalancedRandomForestClassifier(n_estimators=ntree)
            rfr.fit(xs_temp, ys_temp)
            ft = rfr.feature_importances_
            fts.append(ft)
            
    fts = np.vstack(fts)
    
    feature_names = ['off' + str(i+1) for i in range(top)] + ['def' + str(i+1) for i in range(top)]
    
    fig, ax = plt.subplots(figsize=(8,6))
    for i in range(len(feature_names)):
        ax.boxplot(fts[:, i], positions=[i])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')
    
    plt.savefig(save_file)
    plt.close()
Example #14
def test_balanced_random_forest_oob_binomial(ratio):
    # Regression test for #655: check that the oob score is close to 0.5
    # in a binomial experiment.
    rng = np.random.RandomState(42)
    n_samples = 1000
    X = np.arange(n_samples).reshape(-1, 1)
    y = rng.binomial(1, ratio, size=n_samples)

    erf = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    erf.fit(X, y)
    assert np.abs(erf.oob_score_ - 0.5) < 0.1
Example #15
    def evaluate_on_validation_or_test(self, test=False):

        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)
            for data in dati:

                del data['value']

                rf_model = BalancedRandomForestClassifier(**data)

                trainX = self.X
                trainy = self.y
                valx = self.X_validation
                valy = self.y_validation
                if test == True:
                    testx = self.X_test
                    testy = self.y_test

                median_imputer = SimpleImputer(missing_values=np.nan,
                                               strategy='median')
                imputer = median_imputer.fit(trainX)
                vtrainX = imputer.transform(trainX)
                trainX = pd.DataFrame(vtrainX,
                                      columns=trainX.columns,
                                      index=trainX.index)

                vvalX = imputer.transform(valx)
                valx = pd.DataFrame(vvalX,
                                    columns=valx.columns,
                                    index=valx.index)

                if test == True:
                    vtest = imputer.transform(testx)
                    testx = pd.DataFrame(vtest,
                                         columns=testx.columns,
                                         index=testx.index)
                    trainX = pd.concat([trainX, valx])
                    trainy = np.concatenate((trainy, valy))

                rf_model.fit(trainX, trainy)

                if test == True:
                    roc_rf = roc_auc_score(testy,
                                           rf_model.predict_proba(testx)[:, 1])
                else:
                    roc_rf = roc_auc_score(valy,
                                           rf_model.predict_proba(valx)[:, 1])

                if test == False:
                    print("Validation AUC: {}".format(str(roc_rf)))
                else:
                    print("Test AUC: {}".format(str(roc_rf)))
    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        estimator_scores = {}

        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        predicted_prob = clf.predict_proba(test_x)
        predicted_prob_true = [p[1] for p in predicted_prob]

        estimator_scores['Question'] = question
        estimator_scores['Accuracy'] = round(
            accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Balanced Accuracy'] = round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Precision'] = round(
            precision_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Recall'] = round(
            recall_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Specificity'] = specificity
        estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
        estimator_scores['ROC AUC'] = round(
            roc_auc_score(test_y, predicted_prob_true), 2)

        # print('Perfect Confusion Matrix for Q-%s is: ' % (str(question).zfill(2)))
        # perfect_labels = train_y
        # print(confusion_matrix(train_y, perfect_labels))

        return cross_val_scores, estimator_scores
Example #17
class BaselineRandomForest(BaseClassifier):
    def __init__(self):
        self.random_forest_classifier = RandomForestClassifier(
            n_estimators=500,
            max_features='auto',
            max_depth=None,
            n_jobs=1,
            class_weight=None,
            criterion='entropy',
            min_samples_split=2,
            min_samples_leaf=1)
        self.feature_preprocessor = FeaturePreprocessor()
        self.feature_list = None
        self.model_filename = 'baseline_rf.pkl'

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame):
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)

        # intersect samples and labels
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        self.feature_list = samples.columns
        samples_np_array = samples.values
        labels_np_array = labels['classALeRCE'].loc[samples.index].values
        self.random_forest_classifier.fit(samples_np_array, labels_np_array)

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples_np_array = samples[self.feature_list].values
        predicted_probs = self.random_forest_classifier.predict_proba(
            samples_np_array)
        predicted_probs_df = pd.DataFrame(predicted_probs,
                                          columns=self.get_list_of_classes(),
                                          index=samples.index.values)
        predicted_probs_df.index.name = 'oid'
        return predicted_probs_df

    def get_list_of_classes(self) -> list:
        return self.random_forest_classifier.classes_

    def save_model(self, directory: str) -> None:
        with open(os.path.join(directory, self.model_filename), 'wb') as f:
            pickle.dump(self.random_forest_classifier, f,
                        pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        rf = pd.read_pickle(os.path.join(directory, self.model_filename))
        self.random_forest_classifier = rf
        self.feature_list = pd.read_pickle(
            os.path.join(directory, 'feature_list.pkl'))
Example #18
def test_balanced_random_forest_oob(imbalanced_dataset):
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    y_train_rf = rf.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_rf)
    without=100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1])
    print("Random Forest (niezbalansowany): {}%".format(without))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    y_train_rf = rf_oversampling.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_rf)
    with_oversampling=100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1])
    print("Random Forest (z oversamplingiem): {}%".format(without))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    y_train_brf = brf.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_brf)
    within=100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1])
    print("Random Forest (zbalansowany - undersampling): {}%".format(within))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])
    print(brf.feature_importances_)
    
    objects = ('country','gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_*100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Dependence percentage')
    plt.title('Dependence of individual attributes')
    plt.show()

    objects = ('Unbalanced Random Forest', 'Random Forest with oversampling', 'Balanced Random Forest')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Accuracy percentage')
    plt.title('Random Forest accuracy')
    plt.show()

    return without, within
Example #20
def evaluate(X_train, y_train, X_test, y_test):
    global seed
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf = clf.fit(X_train, y_train)
    # argsort ranks class indices by predicted probability: the last column is
    # the top-1 class and the second to last the top-2 class (this assumes the
    # class labels are the integers 0..n_classes-1, matching clf.classes_)
    y_pred = clf.predict_proba(X_test).argsort(axis=1)
    y_pred1 = y_pred[:, -1]
    y_pred2 = y_pred[:, -2]
    return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix(
        y_test, y_pred2)
Example #21
def balanced_random_forest(train_features,
                           train_labels,
                           test_features,
                           feature_list=None,
                           hfo_type_name=None):
    rf = BalancedRandomForestClassifier(
        random_state=32,
        n_jobs=-1,  # use all available processors
        # class_weight='balanced_subsample'
    )
    rf.fit(train_features, train_labels)
    # Predict over test
    rf_predictions = rf.predict(test_features)
    rf_probs = rf.predict_proba(test_features)[:, 1]
    # IF FEATURE IMPORTANCE FIGS NOT EXISTS
    # print_feature_importances(rf, feature_list)
    # graphics.feature_importances(feature_list, rf.feature_importances_, hfo_type_name)
    return rf_predictions, rf_probs, rf
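
A hypothetical follow-up showing how the returned probabilities might be scored; test_labels (the ground truth for test_features) is an assumed variable:

from sklearn.metrics import roc_auc_score

rf_predictions, rf_probs, rf_model = balanced_random_forest(train_features,
                                                            train_labels,
                                                            test_features)
print('ROC AUC:', roc_auc_score(test_labels, rf_probs))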
Example #22
def apply_balanced_RF_classifier(X_train, y_train, model_path):
    '''
    Args:
        X_train: dataframe with all the features to be used for training
        y_train: series containing labels for each row of X_train
        model_path: path where the trained balanced random forest model is to be saved

    Output:
        trained balanced random forest model
    '''
    BRF_model = BalancedRandomForestClassifier(n_estimators=50,
                                               random_state=0,
                                               n_jobs=-1)
    # Fit the training data
    BRF_model.fit(X_train, y_train)

    pickle_models(BRF_model, model_path)

    return BRF_model
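
A hypothetical usage sketch; it assumes X_train/y_train are already prepared and that pickle_models writes an ordinary pickle file that pickle.load can read back:

import pickle

brf_model = apply_balanced_RF_classifier(X_train, y_train, 'models/balanced_rf.pkl')

# Reload the persisted model later (assumes a plain pickle file was written)
with open('models/balanced_rf.pkl', 'rb') as f:
    reloaded_model = pickle.load(f)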
Example #23
def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*imbalanced_dataset)

    with pytest.raises(ValueError, match="must be larger or equal to"):
        brf.set_params(warm_start=True, n_estimators=2)
        brf.fit(*imbalanced_dataset)

    brf.set_params(n_estimators=10)
    brf.fit(*imbalanced_dataset)

    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        brf.fit(*imbalanced_dataset)
Example #24
def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*imbalanced_dataset)

    with pytest.raises(ValueError, match="must be larger or equal to"):
        brf.set_params(warm_start=True, n_estimators=2)
        brf.fit(*imbalanced_dataset)

    brf.set_params(n_estimators=10)
    brf.fit(*imbalanced_dataset)

    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        brf.fit(*imbalanced_dataset)
    def fit(self, X, Y, sample_weight=None):
        from imblearn.ensemble import BalancedRandomForestClassifier
        estimator = BalancedRandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            bootstrap=self.bootstrap,
            min_impurity_decrease=self.min_impurity_decrease,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            class_weight=self.class_weight,
            sampling_strategy=self.sampling_strategy,
            replacement=self.replacement)

        estimator.fit(X, Y)

        self.estimator = estimator
        return self
Example #26
def test_balanced_random_forest_attributes(imbalanced_dataset):
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(n_estimators=n_estimators,
                                         random_state=0)
    brf.fit(X, y)

    for idx in range(n_estimators):
        X_res, y_res = brf.samplers_[idx].fit_resample(X, y)
        X_res_2, y_res_2 = brf.pipelines_[idx].named_steps[
            'randomundersampler'].fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        y_pred = brf.estimators_[idx].fit(X_res, y_res).predict(X)
        y_pred_2 = brf.pipelines_[idx].fit(X, y).predict(X)
        assert_array_equal(y_pred, y_pred_2)

        y_pred = brf.estimators_[idx].fit(X_res, y_res).predict_proba(X)
        y_pred_2 = brf.pipelines_[idx].fit(X, y).predict_proba(X)
        assert_array_equal(y_pred, y_pred_2)
Example #27
        def objective(trial):
            train_X = self.df_train_media.loc[:, self.df_train_media.columns != '41'].values
            val_X = self.df_validation_media.loc[:, self.df_validation_media.columns != '41'].values
            train_y = self.df_train_media['41'].values
            val_y = self.df_validation_media['41'].values
            test_X = self.df_test_media.loc[:, self.df_test_media.columns != '41'].values
            test_y = self.df_test_media['41'].values
            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            n_estimators = trial.suggest_categorical('n_estimators',
                                                     list_trees)
            max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                max_depth=max_depth,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                bootstrap=True)
            brfmodel.fit(train_X, train_y)
            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
            print(
                "Test AUC: " +
                str(roc_auc_score(test_y,
                                  brfmodel.predict_proba(test_X)[:, 1])))

            return aucbrf
Example #28
def train_model(data):

    dataset = pd.get_dummies(
        data,
        columns=['Employment.Type', 'Driving_flag', 'Bureau_bin'],
        drop_first=True)
    #dataset = pd.get_dummies(data,columns=['Employment.Type','Driving_flag'],drop_first=True)
    X = dataset.drop('loan_default', axis=1)
    y = dataset['loan_default']

    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,train_size=.8, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        stratify=y)

    rfc = RandomForestClassifier(class_weight='balanced', n_estimators=100)
    rfc.fit(X_train, y_train)
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train, y_train)
    xgb = XGBClassifier(scale_pos_weight=3.4)
    xgb.fit(X_train, y_train)

    brfc = BalancedRandomForestClassifier(max_depth=4, random_state=0)
    brfc.fit(X_train, y_train)
    bbc = BalancedBaggingClassifier(n_estimators=100, random_state=42)
    bbc.fit(X_train, y_train)
    models = [rfc, lr, xgb, brfc, bbc]
    model_names = [
        'RandomForestClassifier', 'LogisticRegression', 'XGBClassifier',
        'BalancedRandomForestClassifier', 'BalancedBaggingClassifier'
    ]
    for m, n in zip(models, model_names):
        print('Classifier: ' + n)
        predict_evaluate_classifier(X_test, y_test, m)

    return rfc, lr, xgb, brfc, bbc
Example #29
def test_balanced_random_forest_error(imbalanced_dataset, forest_params,
                                      err_msg):
    brf = BalancedRandomForestClassifier(**forest_params)
    with pytest.raises(ValueError, match=err_msg):
        brf.fit(*imbalanced_dataset)
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target),
                      ax=ax[1], title='Balanced bagging')

###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method that usually outperforms
# bagging. Here, we use a vanilla random forest and its balanced counterpart,
# in which each bootstrap sample is balanced.

rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0,
                                     n_jobs=-1)

rf.fit(X_train, y_train)
brf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

# Similarly to the previous experiment, the balanced classifier outperforms the
# classifier which learns from imbalanced bootstrap samples. In addition, random
# forest outperforms the bagging classifier.

print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0],
Example #31
    else:
        finite_idx = np.where(np.isfinite(column))[0]
    x = vectors[finite_idx, :]
    y = column[finite_idx]
    if y.sum() == 0 or y.sum() == len(y):
        print("%15s: undefined" % (name))
        continue
    train_x, test_x, train_y, test_y = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y)

    if args.brf:
        rf = BalancedRandomForestClassifier(n_estimators=100, n_jobs=4)
    else:
        rf = RandomForestClassifier(n_estimators=100, n_jobs=4)

    rf.fit(train_x, train_y)
    p_te = rf.predict_proba(test_x)
    auc_te = roc_auc_score(test_y, p_te[:, 1])
    bacc = balanced_accuracy_score(test_y, p_te[:, 1].round(0))
    print("%15s: %3.5f %3.5f" % (name, auc_te, bacc))
    bacc_av += bacc
    auc_av += auc_te

    if not (args.save is None):
        gzpickle(args.save + '_%i.pkz' % i, rf)

print('Averages:')
print('AUC: %8.3f   BAcc: %8.3f' % (auc_av / (i + 1), bacc_av / (i + 1)))
Example #32
def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    rng = np.random.RandomState(42)
    X, y = imbalanced_dataset
    sample_weight = rng.rand(y.shape[0])
    brf = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    brf.fit(X, y, sample_weight)
Example #33
# %% [markdown]
''' 
## Train a random forest classifier

*Note: this may take a while*
'''

# %%
clf = BalancedRandomForestClassifier(n_estimators=2000,
                                     replacement=True,
                                     sampling_strategy='not minority',
                                     n_jobs=4,
                                     random_state=42,
                                     verbose=1)
clf.fit(X_train, Y_train)

Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('Out of sample:\n',
      metrics.classification_report(Y_test, Y_test_pred, zero_division=0))

# %% [markdown]
'''
## Robustness to unforeseen scenarios

What if the subjects in the test set wore the device differently from
those in the training set? For example, suppose that all the subjects in the
training set were right-handed, but the test subjects are left-handed.
This would more or less result in the device being rotated.
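One crude way to probe this is sketched below. Everything in the sketch is an assumption not present in the original: it supposes the raw tri-axial windows are available as an array of shape (n_windows, n_samples, 3), and it approximates a hand swap by negating two axes; the features fed to the classifier would then need to be re-extracted from the rotated signal.
'''

# %%
import numpy as np


def simulate_hand_swap(X_raw):
    """Roughly mimic wearing the device on the other wrist (hypothetical helper)."""
    X_rot = X_raw.copy()
    X_rot[..., 0] *= -1  # flip the x axis
    X_rot[..., 2] *= -1  # flip the z axis
    return X_rot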
Example #34
def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    rng = np.random.RandomState(42)
    X, y = imbalanced_dataset
    sample_weight = rng.rand(y.shape[0])
    brf = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    brf.fit(X, y, sample_weight)
Example #35
def test_balanced_random_forest_error(imbalanced_dataset, forest_params,
                                      err_msg):
    brf = BalancedRandomForestClassifier(**forest_params)
    with pytest.raises(ValueError, match=err_msg):
        brf.fit(*imbalanced_dataset)
Example #36
def Clasificar(database, new, path):
    pd.options.mode.chained_assignment = None
    if 'Response by Category' in list(database.columns):
        database = database.drop(['Response by Category','Response by Description'], axis = 1)
    database = database.sample(frac= 0.4, replace = False)
    
    # Check the companies that were already classified
    #d = new.merge(database, how ='left', left_on='Organization Name', right_on = 'Investee')[['Investee','Category.1','Area of Focus']]
    #new = new.merge(d, how = "left", left_on = "Organization Name", right_on = "Investee")
    #new = new.drop(columns=["Investee"])
    
    database["Category.1"] = database["Category.1"].replace("rejected", "Rejected")
    database["Category.1"] = database["Category.1"].replace("B2C ", "B2C")
    database["Category.1"] = database["Category.1"].replace("FIntech", "Fintech")

    database['Prediction'] = np.nan
    new['Prediction'] = np.nan
    new = new.drop(['Prediction'], axis=1)

    # CLASSIFIER

    warnings.filterwarnings('ignore')

    print('Importing databases')
    
    new = new.rename(columns = {'Categories':'Category','Organization Name':'Investee'})
    train = database[['Operation','Investee', 'Category', 'Description', 'Category.1', 'Area of Focus']].dropna()
    newdata = new[['Transaction Name','Investee', 'Category', 'Description']]
    
    
    print('Text preprocessing')
    
    stop_words = stopwords.words('english')
    
    for column in ['Category','Description']:
        
        train[column] = train[column].apply(lambda x: " ".join(str(w).lower() for w in str(x).split()))  # lower case
        train[column] = train[column].str.replace(r'[^\w\s]', ' ', regex=True)  # remove punctuation
        train[column] = train[column].apply(lambda x: " ".join(str(w) for w in str(x).split() if w not in stop_words))  # remove stop words
        newdata[column] = newdata[column].apply(lambda x: " ".join(str(w).lower() for w in str(x).split()))  # lower case
        newdata[column] = newdata[column].str.replace(r'[^\w\s]', ' ', regex=True)  # remove punctuation
        newdata[column] = newdata[column].apply(lambda x: " ".join(str(w) for w in str(x).split() if w not in stop_words))  # remove stop words
    
    
    train_src1 = train[['Category','Description','Category.1']]
    train_src1['Rejected?'] = 0
    train_src1.loc[train_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1
    
    new_src1 = newdata[['Category','Description']]
    #new_src1['Rejected?'] = 0
    #new_src1.loc[new_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1
    
    
    # Binarization
    vectorizer = CountVectorizer()
    
    vectorI = pd.DataFrame(vectorizer.fit_transform(train_src1['Category']).toarray())
    vectorI_new = pd.DataFrame(vectorizer.transform(new_src1['Category']).toarray())
    vectorIdes = pd.DataFrame(vectorizer.fit_transform(train_src1['Description']).toarray())
    vectorIdes_new = pd.DataFrame(vectorizer.transform(new_src1['Description']).toarray())
    
    vectorI = pd.concat([vectorI, vectorIdes], axis = 1)
    vectorI_new = pd.concat([vectorI_new, vectorIdes_new], axis = 1)
    
    print('Training')

    # Binary classification: rejected vs. not rejected
    # Resampling + Random Forest
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(vectorI, train_src1['Rejected?'])
    y_train_pred = brf.predict(vectorI)
    print('Confusion matrix: \n' , confusion_matrix(train_src1['Rejected?'], y_train_pred))
    print('Accuracy: \n' , accuracy_score(train_src1['Rejected?'], y_train_pred))
    print('Recall: \n' , recall_score(train_src1['Rejected?'], y_train_pred))
    
    
    print('Classification and export')
    # Fit the model to the new data
    y_new_predict = brf.predict(vectorI_new)
    y_new_predict_proba = brf.predict_proba(vectorI_new)
    
    newdata['Prediction'] = y_new_predict
    newdata['Prob. of being rejected'] = y_new_predict_proba[:,0]
    newdata['Prob. of being of interest'] = y_new_predict_proba[:,1]
    

    
    # Create the Companies file and export it
    new = pd.concat([new, newdata[['Prediction','Prob. of being rejected','Prob. of being of interest']]], axis=1, sort=False) 

    return new
# %% [markdown]
'''
## Train a random forest classifier

*Note: this may take a while*
'''

# %%
clf = BalancedRandomForestClassifier(
    n_estimators=2000,
    replacement=True,
    sampling_strategy='not minority',
    oob_score=True,
    n_jobs=4,
    random_state=42,
    verbose=1
)
clf.fit(X_train, Y_train)

Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred, zero_division=0)) 

# This will be the training set
Y_in_train = clf.oob_decision_function_.astype('float32')
# This will be the test set
Y_in_test = clf.predict_proba(X_test).astype('float32')

# %% [markdown]
'''

## Architecture design
As a baseline, let's use a single-layer bidirectional LSTM.
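A minimal sketch of such a baseline follows. The setting is an assumption not present in the excerpt: PyTorch, inputs of shape (batch, seq_len, n_features), and an arbitrary hidden size.
'''

# %%
import torch
from torch import nn


class BiLSTMBaseline(nn.Module):
    """Single-layer bidirectional LSTM with a linear classification head (hypothetical sketch)."""

    def __init__(self, n_features, n_classes, hidden_size=128):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden_size,
                            num_layers=1, batch_first=True,
                            bidirectional=True)
        self.head = nn.Linear(2 * hidden_size, n_classes)

    def forward(self, x):
        out, _ = self.lstm(x)   # (batch, seq_len, 2 * hidden_size)
        return self.head(out)   # per-step class scores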