Beispiel #1
0
def do_CV_Voting(LS, cv=10):
    """Cross-validate a soft-voting ensemble of imbalance-aware classifiers.

    Fingerprints are built from ``LS["SMILES"]``, duplicated fingerprint rows
    are dropped and the remaining samples are split 75/25. The ensemble is
    cross-validated on the training part; the fold estimator with the highest
    ROC AUC is then used to print a confusion matrix for the held-out part.

    Args:
        LS: DataFrame providing the ``"SMILES"`` and ``"ACTIVE"`` columns.
        cv: Cross-validation setting forwarded to ``cross_validate``.

    Returns:
        None. Mean scores and the confusion matrix are printed.
    """
    nBits = 1250
    with measure_time("Creating fingerprint"):
        fingerprints = create_fingerprints(LS["SMILES"].values, nBits=nBits)
        # Drop duplicated fingerprint rows.
        data = pd.DataFrame(fingerprints).drop_duplicates()
        X_LS = data.values

    # `data` was built from a bare array, so its index holds row *positions*.
    # Select the matching labels by position: `.loc` would only be right when
    # LS happens to carry a default 0..n-1 index.
    # NOTE(review): assumes create_fingerprints returns one row per input
    # SMILES, in input order - confirm.
    y_LS = LS["ACTIVE"].iloc[data.index].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)

    pipeline_1 = make_pipeline(
        ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
        BalancedRandomForestClassifier(n_estimators=600, random_state=18, n_jobs=-1))
    pipeline_2 = make_pipeline(
        ADASYN(random_state=64, n_jobs=-1),
        BalancedRandomForestClassifier(n_estimators=600, random_state=24, n_jobs=-1))
    BRF = BalancedRandomForestClassifier(n_estimators=100, random_state=18, n_jobs=-1)
    BGC = make_pipeline(
        ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
        BalancedBaggingClassifier(
            estimator=DecisionTreeClassifier(max_features="log2"), n_estimators=50))

    # Soft voting; the first pipeline counts three times as much as the others.
    votingModel = VotingClassifier(
        estimators=[('pip1', pipeline_1), ('pip2', pipeline_2), ('BRF', BRF), ('BGC', BGC)],
        voting='soft', weights=[3, 1, 1, 1], n_jobs=-1)

    scores = cross_validate(votingModel, X_train, y_train, cv=cv,
                            scoring=('roc_auc', 'average_precision'),
                            return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())

    # Evaluate the fold estimator that achieved the best ROC AUC.
    model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
Beispiel #2
0
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Check that samplers_, estimators_ and pipelines_ agree for every tree."""
    X, y = imbalanced_dataset
    n_estimators = 10
    forest = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=0)
    forest.fit(X, y)

    for tree_idx in range(n_estimators):
        sampler = forest.samplers_[tree_idx]
        pipeline = forest.pipelines_[tree_idx]
        tree = forest.estimators_[tree_idx]

        # The standalone sampler and the one inside the pipeline must resample alike.
        X_sampled, y_sampled = sampler.fit_resample(X, y)
        pipeline_sampler = pipeline.named_steps["randomundersampler"]
        X_piped, y_piped = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_sampled, X_piped)
        assert_array_equal(y_sampled, y_piped)

        # Refitting the bare tree on the resampled data must match refitting
        # the whole pipeline on the raw data, for labels and for probabilities.
        for method_name in ("predict", "predict_proba"):
            from_tree = getattr(tree.fit(X_sampled, y_sampled), method_name)(X)
            from_pipeline = getattr(pipeline.fit(X, y), method_name)(X)
            assert_array_equal(from_tree, from_pipeline)
    def random_forest(df, drop, target, show, model_name):
        """Fit a balanced random forest on ``df`` and return balanced accuracy in percent.

        Args:
            df: DataFrame holding both the features and the target column.
            drop: Column names to exclude from the feature set.
            target: Column name (or names) used as the outcome.
            show: When equal to ``True``, print the ranked feature importances.
            model_name: Label used in the printed header.

        Returns:
            Balanced accuracy on the held-out split, multiplied by 100.
        """
        # Separate the feature columns from the outcome.
        feature_names = [column for column in df.columns if column not in drop]
        features = df[feature_names]
        outcome = df[target]

        # Hold out a test split (default proportions, fixed seed).
        X_train, X_test, y_train, y_test = train_test_split(
            features, outcome, random_state=1)

        forest = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        forest.fit(X_train, y_train)
        predicted = forest.predict(X_test)

        # (importance, column) pairs, most important first.
        ranked_importances = sorted(
            zip(forest.feature_importances_, features.columns.tolist()))
        ranked_importances.reverse()

        balanced_accuracy = balanced_accuracy_score(y_test, predicted)

        if show == True:
            print(f"Feature Importance: {model_name}")
            for entry in ranked_importances:
                print(entry)
            print("\n")

        return balanced_accuracy * 100
 def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
     """Fit a default balanced random forest on the ``"has_claim"`` training split.

     Only the training part returned by ``cls.get_X_Y_split`` is used; the
     test part is discarded here.
     """
     X_train, _, Y_train, _ = cls.get_X_Y_split(preprocessed_df, "has_claim")
     classifier = BalancedRandomForestClassifier()
     classifier.fit(X_train, Y_train)
     return classifier
def main():
    """Train both undersampling variants on 10 folds and dump results as JSON.

    For each fold the class probabilities on the test part and the feature
    importances are written below ``RESULT_PATH`` ('brf' for the balanced
    forest, 'rus' for random undersampling followed by a plain forest).
    """
    print('Spliting challenges')
    split_challenges()

    def load_indexed(key):
        # Concatenate the numbered JSON shards 1..162 and index by (l0, l1).
        shards = [
            pd.read_json(XY_PATH[key].format(shard_no), orient='records')
            for shard_no in range(1, 163)
        ]
        return pd.concat(shards).set_index(['l0', 'l1'])

    def dump_fold(subdir, fold, classifier, X_test, y_test):
        # Persist test-set probabilities and feature importances of one fold.
        probabilities = pd.DataFrame(
            classifier.predict_proba(X_test.to_numpy()), index=y_test.index)
        probabilities.reset_index().to_json(
            os.path.join(RESULT_PATH, subdir, f'y_prob_{fold}.json'), orient='records')
        pd.Series(classifier.feature_importances_).to_json(
            os.path.join(RESULT_PATH, subdir, f'feature_importance_{fold}.json'))

    print('Reading X...')
    X = load_indexed('X')
    print('Reading y...')
    y = load_indexed('y')

    print('\nTraining Inner sampler RFC')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        # Undersampling happens inside the forest, once per tree.
        balanced_rfc = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        dump_fold('brf', fold, balanced_rfc, X_test, y_test)

    print('\nTraining RandomUnderSampler')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        # Undersample once up front, then train an ordinary forest.
        rfc = RandomForestClassifier(n_estimators=100, random_state=0)
        rus = RandomUnderSampler(random_state=0)
        X_resample, y_resample = rus.fit_resample(
            X_train.to_numpy(), y_train.to_numpy().ravel())
        rfc.fit(X_resample, y_resample)

        dump_fold('rus', fold, rfc, X_test, y_test)
Beispiel #6
0
def get_classifier(n_subj, random_state, n_jobs_rf=1, multiclass=False):
    """Build an unfitted balanced random forest with fixed per-class sample sizes.

    Args:
        n_subj: Number of subjects used to derive the per-class subsample size.
        random_state: Seed forwarded to the forest.
        n_jobs_rf: Number of parallel jobs for the forest.
        multiclass: If truthy, configure four classes (labels 0-3); otherwise
            two classes (labels 0-1).

    Returns:
        A ``BalancedRandomForestClassifier`` with 1000 trees drawing
        ``subsample_size`` samples per class without replacement.
    """
    if multiclass:
        n_classes = 4
        # multiplication with 0.9 required to make the subject number agree
        # with the training set AND because one of the classes has only very
        # few subjects, such that we can't reasonably sample more than 100
        subsample_size = round(n_subj * 0.9 * 0.5 / 4)
    else:
        n_classes = 2
        subsample_size = round(n_subj * 0.632 / 2)

    # Every class label gets the same number of samples per tree.
    per_class_samples = {label: subsample_size for label in range(n_classes)}

    return BalancedRandomForestClassifier(
        n_estimators=1000,
        class_weight='balanced',
        oob_score=False,
        sampling_strategy=per_class_samples,
        n_jobs=n_jobs_rf,
        random_state=random_state,
        bootstrap=False,
        replacement=False,
    )
def predict_model_kfold(name,path,features_type,label_name,data):
    """Evaluate a balanced random forest with shuffled 10-fold CV, then refit on all data.

    The per-fold results of ``create_all_eval_results`` are averaged and
    appended as one row to the module-level ``results_all_projects`` table.

    Args:
        name: Project name recorded in the results row.
        path: Unused here; kept for interface compatibility.
        features_type: Feature-set label recorded in the results row.
        label_name: Unused here; the target column is always ``'hasBug'``.
        data: DataFrame containing the features and a ``'hasBug'`` column.

    Returns:
        The classifier refitted on the complete data set.
    """
    # Pass the options by keyword: current scikit-learn releases only accept
    # keyword arguments here, so `KFold(10, True)` raises a TypeError.
    kfold = KFold(n_splits=10, shuffle=True)
    #RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000,max_depth=5)

    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)

    fold_count = 0
    all_predictions = 0
    for train, test in kfold.split(data):
        fold_count += 1
        fold_predictions = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False,y.iloc[test],fold_predictions,name,"training",features_type,num_of_bugs,num_of_all_instances,bug_precent,None)

    # Average the accumulated per-fold metrics.
    all_predictions /= fold_count
    start_list = [name,"training",features_type,"sklearn - python"]
    result_list = start_list + all_predictions.tolist()

    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list

    # The returned model is trained on every available sample.
    model.fit(x,y)
    return model
Beispiel #8
0
    def __init__(self, taxonomy_dictionary, non_used_features=None):
        """Set up the four random forests and the bookkeeping of the hierarchical model.

        Args:
            taxonomy_dictionary: Taxonomy mapping; it is stored and also
                inverted via ``invert_dictionary``.
            non_used_features: Forwarded to ``FeaturePreprocessor``.
        """
        n_trees = 500
        # 'sqrt' is what 'auto' meant for classifiers; the 'auto' alias was
        # removed from scikit-learn, so it is spelled out explicitly.
        self.top_classifier = RandomForestClassifier(n_estimators=n_trees,
                                                     max_depth=None,
                                                     max_features='sqrt')

        self.stochastic_classifier = RandomForestClassifier(
            n_estimators=n_trees, max_depth=None, max_features=0.2)

        self.periodic_classifier = RandomForestClassifier(n_estimators=n_trees,
                                                          max_depth=None,
                                                          max_features='sqrt')

        self.transient_classifier = RandomForestClassifier(
            n_estimators=n_trees, max_depth=None, max_features='sqrt')

        self.feature_preprocessor = FeaturePreprocessor(
            non_used_features=non_used_features)

        self.taxonomy_dictionary = taxonomy_dictionary
        self.feature_list = None
        self.inverted_dictionary = invert_dictionary(self.taxonomy_dictionary)
        # File names of the pickled artefacts.
        self.pickles = {
            "features_list": "features_RF_model.pkl",
            "top_rf": "hierarchical_level_RF_model.pkl",
            "periodic_rf": "periodic_level_RF_model.pkl",
            "stochastic_rf": "stochastic_level_RF_model.pkl",
            "transient_rf": "transient_level_RF_model.pkl"
        }
        self.url_model = f"https://assets.alerce.online/pipeline/hierarchical_rf_{self.MODEL_VERSION}/"
    def make_model(self, config=None):
        """Create a fresh estimator according to class_, type_ and balanced.

        ``self.class_`` selects the family ('RF', 'lin' or 'svm'),
        ``self.type_`` selects regression ('reg') or classification ('cls'),
        and ``self.balanced`` is either 'balanced' or None. Combinations not
        covered leave ``self.model`` untouched.

        :param config: model parameters; when given they are stored on
            ``self.config`` and passed to the estimator instead of the defaults
        :return: self.model
        :raises AssertionError: if an SVM is requested for a non-classification task
        """
        has_config = config is not None
        if has_config:
            self.config = config

        print('Creating fresh model...')

        def build(estimator_cls, seeded=False):
            # An explicit config replaces the defaults entirely; the seed is
            # only applied (and only read) when no config is supplied.
            if has_config:
                return estimator_cls(**config)
            if seeded:
                return estimator_cls(random_state=self.seed)
            return estimator_cls()

        if self.class_ == 'RF':
            want_balanced = self.balanced == 'balanced'
            if want_balanced or self.balanced is None:
                if self.type_ == 'reg':
                    if want_balanced:
                        print('WARNING: balanced regressor not applicable')
                    self.model = build(RandomForestRegressor, seeded=True)
                elif self.type_ == 'cls':
                    forest_cls = (BalancedRandomForestClassifier
                                  if want_balanced else RandomForestClassifier)
                    self.model = build(forest_cls, seeded=True)
        elif self.class_ == 'lin':
            want_balanced = self.balanced == 'balanced'
            if want_balanced or self.balanced is None:
                if self.type_ == 'reg':
                    if want_balanced:
                        print('WARNING: balanced regressor not applicable')
                    self.model = build(LinearRegression)
                elif self.type_ == 'cls':
                    self.model = build(LogisticRegression)
                    # 'balanced' or None, applied after construction so it
                    # also overrides whatever config contained.
                    self.model.class_weight = self.balanced
        elif self.class_ == 'svm':
            # The message is attached to the AssertionError itself instead of
            # being printed as a side effect of evaluating the assert.
            assert self.type_ == 'cls', (
                'If using SVM, make sure you have a classification problem. i.e. set type_="cls"'
            )
            self.model = SVC(**config) if has_config else SVC(kernel='rbf')

        print('Created: ', self.model)
        return self.model
Beispiel #10
0
def test_balanced_random_forest(imbalanced_dataset):
    """A fitted forest exposes one sampler, tree and pipeline per estimator."""
    expected_count = 10
    forest = BalancedRandomForestClassifier(random_state=0, n_estimators=expected_count)
    forest.fit(*imbalanced_dataset)

    for fitted_collection in (forest.samplers_, forest.estimators_, forest.pipelines_):
        assert len(fitted_collection) == expected_count

    # One importance value per input feature.
    n_features = imbalanced_dataset[0].shape[1]
    assert len(forest.feature_importances_) == n_features
Beispiel #11
0
def evaluate(X_train, y_train, X_test, y_test):
    """Fit a balanced random forest and score its first and second guesses.

    Uses the module-level ``seed`` as the forest's random state.

    Returns:
        A pair of confusion matrices: one for the most probable class and one
        for the second most probable class of every test sample.
    """
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf.fit(X_train, y_train)

    # argsort yields column positions of predict_proba, ordered by ascending
    # probability. Translate them through classes_ so the predictions are
    # real labels even when the labels are not exactly 0..k-1.
    ranked_columns = clf.predict_proba(X_test).argsort(axis=1)
    top_choice = clf.classes_[ranked_columns[:, -1]]
    second_choice = clf.classes_[ranked_columns[:, -2]]

    return (metrics.confusion_matrix(y_test, top_choice),
            metrics.confusion_matrix(y_test, second_choice))
Beispiel #12
0
def test_balanced_random_forest(imbalanced_dataset):
    """Fitted attributes must have one entry per estimator / per feature."""
    n_trees = 10
    classifier = BalancedRandomForestClassifier(random_state=0, n_estimators=n_trees)
    classifier.fit(*imbalanced_dataset)

    per_tree_sizes = [
        len(classifier.samplers_),
        len(classifier.estimators_),
        len(classifier.pipelines_),
    ]
    for size in per_tree_sizes:
        assert size == n_trees

    feature_count = imbalanced_dataset[0].shape[1]
    assert len(classifier.feature_importances_) == feature_count
def test_balanced_random_forest_oob(imbalanced_dataset):
    """The OOB score must be close to the test score, and one tree must warn."""
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y)
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True,
                                         random_state=0,
                                         n_estimators=1,
                                         bootstrap=True)
    # Both context managers must be entered. `a and b` evaluates to `b` only,
    # which silently skipped the pytest.warns check.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
Beispiel #14
0
def _plot_championship_importance(all_res, save_directory, top = 6):
    
    save_file = save_directory + 'championship_importance.png'
    
    if os.path.exists(save_file):
        return
    
    xs = []
    ys = []
    teams = []

    for season in all_res:

        team_df = all_res[season][0]
        team_stats = all_res[season][1]
        champion = all_res[season][2]

        for team, g in team_df.groupby('TEAM'):
            x = g.nlargest(top, 'TIME')[['off_norm', 'def_norm']].unstack().values
            y = 1 if team in champion else 0

            xs.append(x)
            ys.append(y)
            teams.append(team + '_' + season)

    xs = np.vstack(xs)
    ys = np.array(ys)

    fts = []
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):

        for i in np.where(ys==1)[0]:

            xs_temp = xs[[x for x in range(len(xs)) if x != i]]
            ys_temp = ys[[y for y in range(len(xs)) if y != i]]

            rfr = BalancedRandomForestClassifier(n_estimators=ntree)
            rfr.fit(xs_temp, ys_temp)
            ft = rfr.feature_importances_
            fts.append(ft)
            
    fts = np.vstack(fts)
    
    feature_names = ['off' + str(i+1) for i in range(top)] + ['def' + str(i+1) for i in range(top)]
    
    fig, ax = plt.subplots(figsize=(8,6))
    for i in range(len(feature_names)):
        ax.boxplot(fts[:, i], positions=[i])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')
    
    plt.savefig(save_file)
    plt.close()
Beispiel #15
0
    def evaluate_model(self):
        """Score every stored parameter set with repeated stratified 5-fold CV.

        Parameter sets are read from ``param_RF_<epoch>.json`` in the result
        folder. For each set, 20 differently seeded 5-fold splits are run and
        the ROC AUC of every fold is printed. The score list is restarted for
        every parameter set, so the printed mean and the returned list refer
        to the last set in the file.

        Returns:
            List of per-fold ROC AUC values for the last parameter set.
        """
        param_path = self.result_folder + '/param_RF_{}.json'.format(self.epoch)
        # Only the load needs the file handle; close it before the long CV run.
        with open(param_path) as f:
            dati = json.load(f)

        for data in dati:

            # 'value' is not a constructor argument of the classifier.
            del data['value']

            rf_model = BalancedRandomForestClassifier(**data)

            rf_auc = []

            for i in tqdm(range(20)):

                cv = StratifiedKFold(n_splits=5,
                                     shuffle=True,
                                     random_state=i + 187462)

                for train_index, test_index in cv.split(self.X, self.y):

                    trainX = self.X.iloc[train_index]
                    testX = self.X.iloc[test_index]

                    trainy = np.take(self.y, train_index)
                    testy = np.take(self.y, test_index)

                    # Learn the medians on the training fold only and reuse
                    # them for the test fold; refitting on the test fold would
                    # impute with statistics the model never saw in training.
                    # (np.nan replaces the np.NaN alias removed in NumPy 2.0.)
                    imputer = SimpleImputer(missing_values=np.nan,
                                            strategy='median').fit(trainX)
                    trainX = pd.DataFrame(imputer.transform(trainX),
                                          columns=trainX.columns,
                                          index=trainX.index)
                    testX = pd.DataFrame(imputer.transform(testX),
                                         columns=testX.columns,
                                         index=testX.index)

                    # Compute the AUC for the best results from CatBoost
                    rf_model.fit(trainX, trainy)
                    roc_rf = roc_auc_score(
                        testy,
                        rf_model.predict_proba(testX)[:, 1])
                    rf_auc.append(roc_rf)

                    print(roc_rf)

        print(statistics.mean(rf_auc))
        return rf_auc
def test_balanced_random_forest_oob_binomial(ratio):
    """Regression test for #655: OOB score stays near 0.5 on binomial labels."""
    # The labels are drawn independently of the single feature, so the
    # out-of-bag score is expected to stay close to 0.5.
    sample_count = 1000
    rng = np.random.RandomState(42)
    X = np.arange(sample_count).reshape(-1, 1)
    y = rng.binomial(1, ratio, size=sample_count)

    forest = BalancedRandomForestClassifier(random_state=42, oob_score=True)
    forest.fit(X, y)

    deviation = np.abs(forest.oob_score_ - 0.5)
    assert deviation < 0.1
Beispiel #17
0
class BaselineRandomForest(BaseClassifier):
    """Single random forest over preprocessed features, with pickle persistence."""

    def __init__(self):
        # 'sqrt' is what 'auto' meant for classifiers; the 'auto' alias was
        # removed from scikit-learn, so it is spelled out explicitly.
        self.random_forest_classifier = RandomForestClassifier(
            n_estimators=500,
            max_features='sqrt',
            max_depth=None,
            n_jobs=1,
            class_weight=None,
            criterion='entropy',
            min_samples_split=2,
            min_samples_leaf=1)
        self.feature_preprocessor = FeaturePreprocessor()
        # Column order seen during fit; set by fit() or load_model().
        self.feature_list = None
        self.model_filename = 'baseline_rf.pkl'

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame):
        """Preprocess ``samples``, align them with ``labels`` and train the forest.

        The target is the ``'classALeRCE'`` column of ``labels``, selected by
        the index of the preprocessed samples.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)

        # intersect samples and labels
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        self.feature_list = samples.columns
        samples_np_array = samples.values
        labels_np_array = labels['classALeRCE'].loc[samples.index].values
        self.random_forest_classifier.fit(samples_np_array, labels_np_array)

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        """Return class probabilities, one row per sample and one column per class.

        The result is indexed like the preprocessed samples, with the index
        named ``'oid'``.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        # Reorder/select the columns exactly as they were used during fit.
        samples_np_array = samples[self.feature_list].values
        predicted_probs = self.random_forest_classifier.predict_proba(
            samples_np_array)
        predicted_probs_df = pd.DataFrame(predicted_probs,
                                          columns=self.get_list_of_classes(),
                                          index=samples.index.values)
        predicted_probs_df.index.name = 'oid'
        return predicted_probs_df

    def get_list_of_classes(self) -> list:
        """Return the class labels known to the fitted forest."""
        return self.random_forest_classifier.classes_

    def save_model(self, directory: str) -> None:
        """Pickle the forest and the feature list into ``directory``."""
        with open(os.path.join(directory, self.model_filename), 'wb') as f:
            pickle.dump(self.random_forest_classifier, f,
                        pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        """Restore the forest and the feature list written by ``save_model``."""
        # NOTE: unpickling can execute arbitrary code - only load model files
        # from a trusted directory.
        rf = pd.read_pickle(os.path.join(directory, self.model_filename))
        self.random_forest_classifier = rf
        self.feature_list = pd.read_pickle(
            os.path.join(directory, 'feature_list.pkl'))
def test_little_tree_with_small_max_samples():
    """A tree grown from only two samples must be smaller than an unrestricted one."""
    rng = np.random.RandomState(1)
    n_samples = 10000

    X = rng.randn(n_samples, 2)
    y = rng.randn(n_samples) > 0

    def grow_single_tree(max_samples):
        # Both forests share `rng`, so the call order below matters.
        forest = BalancedRandomForestClassifier(
            n_estimators=1,
            random_state=rng,
            max_samples=max_samples,
        )
        forest.fit(X, y)
        return forest.estimators_[0].tree_

    # First without any restriction, then restricted to just 2 samples.
    unrestricted_tree = grow_single_tree(None)
    restricted_tree = grow_single_tree(2)

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert unrestricted_tree.node_count > restricted_tree.node_count, msg
 def __init__(self,
              max_depth=None,
              n_features=10,
              selector=ranksum,
              trend="both",
              space_mask=None):
     """Store the selection settings and create the underlying forest.

     The wrapped model is a 100-tree balanced random forest with a fixed
     random state whose depth is limited by ``max_depth``. All other
     arguments are stored unchanged on the instance.
     """
     # Plain configuration, kept verbatim.
     self.trend = trend
     self.space_mask = space_mask
     self.selector = selector
     self.n_features = n_features
     self.max_depth = max_depth

     # Underlying classifier; only the depth is configurable.
     self.model_ = BalancedRandomForestClassifier(
         n_estimators=100,
         random_state=777,
         max_depth=max_depth,
     )
Beispiel #20
0
	def __init__(self, iterations=1, transform_first=False, untrained_model=BalancedRandomForestClassifier(random_state=42,n_jobs=40), max_train_test_samples=100, mode_interaction_extract='knee', include_self_interactions=False, penalty=3, pelt_model='l2', no_changepoint_strategy='median'):
		"""Chain ``iterations`` rounds of interaction extraction and SAFE transformation.

		Every step receives its own deep copy of ``untrained_model``, so the
		shared default instance is never fitted in place.

		Reference: https://github.com/ModelOriented/SAFE/blob/master/SafeTransformer/SafeTransformer.py
		"""
		steps = []
		for round_no in range(iterations):
			interaction_step = InteractionTransformer(copy.deepcopy(untrained_model), max_train_test_samples, mode_interaction_extract, include_self_interactions)
			safe_step = SafeTransformer(penalty=penalty, model=copy.deepcopy(untrained_model), pelt_model=pelt_model, no_changepoint_strategy=no_changepoint_strategy)
			steps.append([f'interaction{round_no}', interaction_step])
			steps.append([f'transformer{round_no}', safe_step])
		self.pipeline = Pipeline(steps)
def get_balanced_models():
    """Return the (name, estimator) candidates used for imbalanced classification.

    Reads the module-level ``X`` (to decide whether the SVMs are included)
    and ``weights`` (passed to the weighted SVM and to XGBoost).
    """
    # Linear / distance-based / probabilistic baselines.
    # (MultinomialNB and GaussianProcessClassifier candidates are currently disabled.)
    models = [
        ('LR_Bal', LogisticRegression(solver='lbfgs', class_weight='balanced')),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('NB', GaussianNB()),
    ]

    # The SVMs are only included below 100000 samples.
    if X.shape[0] < 100000:
        models.extend([
            ('SVM_Bal', SVC(gamma='scale', class_weight='balanced')),
            ('SVM_W', SVC(gamma='scale', class_weight=weights)),
        ])

    # Tree-based models and ensembles.
    models.extend([
        ('Bal_RF', BalancedRandomForestClassifier(n_estimators=1000)),
        ('RF_Bal', RandomForestClassifier(n_estimators=1000,
                                          class_weight='balanced')),
        ('DT_Bal', DecisionTreeClassifier(class_weight='balanced')),
        ('BAG', BaggingClassifier(n_estimators=1000)),
        ('XGB_W', XGBClassifier(scale_pos_weight=weights)),
    ])
    return models
def apply_ml_model(X_train_input, y_train_input, X_test_input, y_test_input):
    """Fit four classifiers and print their evaluation on the given test set.

    For each model the confusion matrix, the balanced accuracy and the
    imbalanced classification report are printed. Nothing is returned.

    Args:
        X_train_input: Training features.
        y_train_input: Training labels.
        X_test_input: Test features.
        y_test_input: Test labels; every metric is computed against these.
    """
    candidates = {
        'LREG': LogisticRegression(solver='lbfgs', max_iter=500, random_state=78),
        'RFC': RandomForestClassifier(n_estimators=128, random_state=78),
        'Tree': tree.DecisionTreeClassifier(random_state=78),
        'Balanced RFC': BalancedRandomForestClassifier(n_estimators=128, random_state=78),
    }

    for model_name, estimator in candidates.items():
        estimator.fit(X_train_input, y_train_input)
        y_pred = estimator.predict(X_test_input)

        cm = confusion_matrix(y_test_input, y_pred)
        # Score against the labels passed in, not a module-level `y_test`
        # that may belong to a different split.
        acc_score = balanced_accuracy_score(y_test_input, y_pred)

        print(f"Model: {model_name}")
        print("Confusion Matrix")
        # The row/column names assume a binary problem.
        cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
        print(cm_df)
        print(f"Accuracy Score : {acc_score}\n")
        print("Classification Report")
        print(classification_report_imbalanced(y_test_input, y_pred))
def fourth_test(X_train, y_train, X_test, y_test):
    """Compare a balanced random forest with a balanced bagging classifier.

    Each model is cross-validated (10 folds) on the training data; the fold
    estimator with the best ROC AUC is then used to print a confusion matrix
    for the test data.
    """
    print("Test with BalancedRandomForestClassifier or BalancedBaggingClassifier\n")
    scoring = ('roc_auc', 'average_precision')

    print("BalancedRandomForestClassifier")
    balanced_forest = BalancedRandomForestClassifier(
        max_depth=None, n_estimators=500, random_state=0, n_jobs=2,
        max_features='log2', oob_score=False)
    scores = cross_validate(balanced_forest, X_train, y_train, cv=10,
                            scoring=scoring, return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())
    best_forest = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_forest_pred = best_forest.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_forest_pred)
    print("confusion_matrix:\n", conf_mat)
    print()

    print("BalancedBaggingClassifier")
    # 'sqrt' is what 'auto' meant for decision tree classifiers; the 'auto'
    # alias was removed from scikit-learn.
    tree = DecisionTreeClassifier(max_features='sqrt')
    # NOTE(review): newer imbalanced-learn releases name this argument
    # `estimator`; kept as `base_estimator` to match the installed version -
    # confirm before upgrading.
    resample_bagging = BalancedBaggingClassifier(
        base_estimator=tree, n_estimators=100, random_state=0, n_jobs=2, oob_score=True)
    scores = cross_validate(resample_bagging, X_train, y_train, cv=10,
                            scoring=scoring, return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())
    best_bagging = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_bagging_pred = best_bagging.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_bagging_pred)
    print("confusion_matrix:\n", conf_mat)

    """
Beispiel #24
0
def do_CV_grid(LS, cv=10):
    """Grid-search an ADASYN + balanced random forest pipeline and report on a hold-out.

    Fingerprints are built from ``LS["SMILES"]``, duplicated fingerprint rows
    are dropped and the remaining samples are split 75/25. The grid search
    runs on the training part with ROC AUC scoring; the refitted best model
    is evaluated on the held-out part.

    Args:
        LS: DataFrame providing the ``"SMILES"`` and ``"ACTIVE"`` columns.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        None. Search results and hold-out metrics are printed.
    """
    nBits = 1250
    with measure_time("Creating fingerprint"):
        fingerprints = create_fingerprints(LS["SMILES"].values, nBits=nBits)
        # Drop duplicated fingerprint rows.
        data = pd.DataFrame(fingerprints).drop_duplicates()
        X_LS = data.values

    # `data` was built from a bare array, so its index holds row *positions*.
    # Select the matching labels by position: `.loc` would only be right when
    # LS happens to carry a default 0..n-1 index.
    # NOTE(review): assumes create_fingerprints returns one row per input
    # SMILES, in input order - confirm.
    y_LS = LS["ACTIVE"].iloc[data.index].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)

    pipeline = Pipeline([
        ('ada', ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1)),
        ('BRF', BalancedRandomForestClassifier(
            n_estimators=500, random_state=18, n_jobs=-1, bootstrap=False)),
    ])
    # 'BRF__criterion': ['gini', 'entropy'] is a further candidate, currently disabled.
    param = {
        'BRF__n_estimators': [500],
        'BRF__max_features': [None, 'log2'],
    }

    # Honour the caller's `cv` instead of a hard-coded fold count.
    clf = GridSearchCV(pipeline, param, scoring='roc_auc', n_jobs=2, cv=cv)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    print()

    y_pred = clf.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
    print("Classification report")
    print(classification_report(y_true=y_test, y_pred=y_pred))
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    """Smoke test: the forest can be tuned through GridSearchCV without error."""
    search_space = {"n_estimators": (1, 2), "max_depth": (1, 2)}
    search = GridSearchCV(BalancedRandomForestClassifier(), search_space, cv=3)
    search.fit(*imbalanced_dataset)
def test_balanced_random_forest_pruning(imbalanced_dataset):
    """Cost-complexity pruning must shrink the first tree of the forest."""

    def first_tree_node_count(**forest_kwargs):
        # Fit a fresh forest and report the size of its first tree.
        forest = BalancedRandomForestClassifier(**forest_kwargs)
        forest.fit(*imbalanced_dataset)
        return forest.estimators_[0].tree_.node_count

    unpruned_nodes = first_tree_node_count()
    pruned_nodes = first_tree_node_count(ccp_alpha=0.015)

    assert unpruned_nodes > pruned_nodes
def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    """Compare a plain, an oversampled and a balanced random forest.

    Three forests (50 trees, ``random_state=0``) are trained and evaluated on
    the same test set:

    * ``RandomForestClassifier`` on the original training data,
    * ``RandomForestClassifier`` on the resampled training data,
    * ``BalancedRandomForestClassifier`` on the original training data.

    For each model the function prints the percentage computed from the
    second row of the confusion matrix, ``cm[1, 1] / (cm[1, 0] + cm[1, 1])``,
    followed by the two diagonal counts.  It then shows two bar charts: the
    feature importances of the balanced forest and the three percentages.

    Args:
        X_train: training features.
        y_train: training labels; must expose ``.values`` (e.g. a pandas
            object), which is flattened with ``ravel()``.
        X_test: test features.
        y_test: test labels.
        X_train_res: resampled training features.
        y_train_res: resampled training labels; must expose ``.ravel()``.

    Returns:
        Tuple ``(without, within)``: the percentage of the plain forest and
        of the balanced forest.  The oversampled score is only printed and
        plotted.
    """

    def _report(model, label):
        # Evaluate `model` on the test set, print the summary lines and
        # return the percentage derived from the confusion matrix.
        predictions = model.predict(X_test)
        matrix = confusion_matrix(y_test, predictions)
        score = 100 * matrix[1, 1] / (matrix[1, 0] + matrix[1, 1])
        print("{}: {}%".format(label, score))
        print(matrix[0, 0], matrix[1, 1])
        return score

    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    without = _report(rf, "Random Forest (niezbalansowany)")

    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    # Fixed: the original printed `without` under the oversampling label.
    with_oversampling = _report(rf_oversampling, "Random Forest (z oversamplingiem)")

    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    within = _report(brf, "Random Forest (zbalansowany - undersampling)")
    print(brf.feature_importances_)

    # Feature-importance chart; the tick labels are hard-coded, so the model
    # is expected to have exactly these five features.
    objects = ('country', 'gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_ * 100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent zależności')
    plt.title('Zależność poszczególnych atrybutów')
    plt.show()

    # Side-by-side chart of the three percentages.
    objects = ('Random Forest niezbalansowany', 'Random Forest z oversamplingiem', 'Random Forest zbalansowany')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Random Forest')
    plt.show()

    return without, within
Beispiel #28
0
        def objective(trial):
            """Objective for an Optuna-style hyper-parameter search.

            Splits ``self.X`` / ``self.y`` 80/20, median-imputes missing
            values, fits a ``BalancedRandomForestClassifier`` with the
            hyper-parameters suggested by ``trial`` and scores it.

            As side effects it prints the accuracy and ROC AUC obtained on
            ``self.X_validation`` / ``self.y_validation`` and shows the
            confusion matrix for that set.

            Args:
                trial: object providing ``suggest_categorical``,
                    ``suggest_uniform`` and ``suggest_int``.

            Returns:
                ROC AUC of the fitted model on the 20% split held out from
                ``self.X``.
            """
            train_X, val_X, train_y, val_y = train_test_split(self.X,
                                                              self.y,
                                                              test_size=0.2)

            # `np.nan` is the canonical spelling; the `np.NaN` alias no
            # longer exists in NumPy 2.0.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')

            # Learn the medians on the training split only and reuse them for
            # the other sets.  Re-fitting on each set (as the code did before)
            # fills them with their own medians, which leaks information and
            # makes the imputed values inconsistent with what the model saw.
            v_train_X = median_imputer.fit_transform(train_X)
            v_val_X = median_imputer.transform(val_X)
            v_test_X = median_imputer.transform(self.X_validation)

            # Restore the DataFrame wrappers (column names and index).
            train_X = pd.DataFrame(v_train_X,
                                   columns=train_X.columns,
                                   index=train_X.index)
            val_X = pd.DataFrame(v_val_X,
                                 columns=val_X.columns,
                                 index=val_X.index)
            test_X = pd.DataFrame(v_test_X,
                                  columns=self.X_validation.columns,
                                  index=self.X_validation.index)

            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            brf_n_estimators = trial.suggest_categorical(
                'n_estimators', list_trees)
            brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            brf_min_samples_split = trial.suggest_int('min_samples_split', 2,
                                                      16)
            brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            brf_min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            brf_max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=brf_n_estimators,
                max_features=brf_max_features,
                min_samples_split=brf_min_samples_split,
                min_samples_leaf=brf_min_samples_leaf,
                max_depth=brf_max_depth,
                min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
                bootstrap=True)

            brfmodel.fit(train_X, train_y)

            # Score returned to the optimiser: AUC on the internal 20% split.
            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])

            # Diagnostics on the separate validation set; reported only.
            aucbrf_test = roc_auc_score(self.y_validation,
                                        brfmodel.predict_proba(test_X)[:, 1])
            print('Accuracy test ' + str(
                accuracy_score(self.y_validation, brfmodel.predict(test_X))))

            plt.figure()
            plot_confusion_matrix(brfmodel,
                                  test_X,
                                  self.y_validation,
                                  cmap=plt.cm.Blues,
                                  normalize=None)
            plt.show()
            print(aucbrf_test)

            return aucbrf
Beispiel #29
0
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    """Smoke test: a balanced random forest can be tuned by GridSearchCV.

    The test only checks that the 3-fold search over a tiny grid runs to
    completion on the ``imbalanced_dataset`` fixture.
    """
    brf = BalancedRandomForestClassifier()
    # `iid` is no longer passed: the argument was deprecated and then removed
    # from GridSearchCV, where supplying it raises a TypeError.
    grid = GridSearchCV(brf, {
        'n_estimators': (1, 2),
        'max_depth': (1, 2)
    },
                        cv=3)
    grid.fit(*imbalanced_dataset)
Beispiel #30
0
    def model_checking(self):
        """Benchmark six classifier pipelines on ``self.df``.

        Features are ``self.df[self.features]`` and the target is
        ``self.df[self.target]``.  The data is split 75/25 (stratified on
        the target).  For every pipeline the method prints the mean 5-fold
        cross-validated precision on the training part, refits on the whole
        training part, and prints accuracy, F1, recall and precision on the
        test part.  Nothing is returned.
        """
        X = self.df[self.features]
        Y = self.df[self.target]

        def plain_xgb():
            # XGBoost without class re-weighting; used after a resampler.
            return XGBClassifier(n_estimators=1000, reg_alpha=1)

        def weighted_xgb():
            # XGBoost with the positive class up-weighted.
            return XGBClassifier(n_estimators=1000,
                                 scale_pos_weight=3,
                                 reg_alpha=1)

        # NOTE: the resampling steps are registered under the name 'rfe',
        # exactly as in the original step lists.
        candidates = [
            Pipeline(steps=[
                ('classifier',
                 BalancedRandomForestClassifier(n_estimators=200)),
            ]),
            Pipeline(steps=[
                ('classifier', BalancedBaggingClassifier(n_estimators=200)),
            ]),
            Pipeline(steps=[
                ('rfe', SMOTE()),
                ('classifier', plain_xgb()),
            ]),
            Pipeline(steps=[
                ('rfe', BorderlineSMOTE()),
                ('classifier', plain_xgb()),
            ]),
            Pipeline(steps=[
                ('classifier', weighted_xgb()),
            ]),
            Pipeline(steps=[
                ('rfe', RFE(XGBClassifier())),
                ('classifier', weighted_xgb()),
            ]),
        ]

        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.25, stratify=Y)

        for candidate in candidates:
            fold_precisions = cross_val_score(candidate,
                                              X_train.values,
                                              y_train,
                                              scoring='precision',
                                              cv=StratifiedKFold(5))
            print("cross val scores")
            # Mean over the five folds.
            print(sum(fold_precisions) / 5)

            candidate.fit(X_train.values, y_train.values)
            y_pred = candidate.predict(X_test.values)

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            print("test scores")
            print(
                f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}"
            )
def apply_balanced_RF_classifier(X_train, y_train, model_path):
    """Train a balanced random forest and hand it to ``pickle_models``.

    Args:
        X_train: features used for training.
        y_train: labels, one per row of ``X_train``.
        model_path: forwarded to ``pickle_models`` together with the fitted
            model (presumably the location where the model is saved).

    Returns:
        The fitted ``BalancedRandomForestClassifier`` (50 trees,
        ``random_state=0``, all cores).
    """
    forest_settings = {'n_estimators': 50, 'random_state': 0, 'n_jobs': -1}
    forest = BalancedRandomForestClassifier(**forest_settings)

    forest.fit(X_train, y_train)
    pickle_models(forest, model_path)

    return forest
    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        """Fit the named ensemble with ``params`` and score it on the test set.

        Args:
            train_x, train_y: training features and labels.
            test_x, test_y: test features and labels (binary labels are
                required, since the confusion matrix is unpacked into four
                cells).
            estimator: one of ``'BalancedRandomForestClassifier'``,
                ``'BalancedBaggingClassifier'`` or
                ``'EasyEnsembleClassifier'``.
            params: dict of hyper-parameters; the keys read depend on the
                estimator (``n_estimators`` and ``sampling_strategy`` for
                all, plus ``bootstrap`` and ``max_samples`` for bagging).
            clf_type: forwarded to ``self.calc_cross_val_scores``.
            question: stored under ``'Question'`` in the result and forwarded
                to ``self.calc_cross_val_scores``.

        Returns:
            Tuple ``(cross_val_scores, estimator_scores)`` where
            ``estimator_scores`` is a dict of test-set metrics.  Accuracy,
            balanced accuracy, precision, recall and specificity are
            percentages rounded to two decimals; F1 and ROC AUC are rounded
            fractions.

        Raises:
            ValueError: if ``estimator`` is not one of the supported names.
        """
        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        else:
            # Previously an unknown name fell through and crashed below with
            # an UnboundLocalError on `clf`; fail early and say why instead.
            raise ValueError(
                'Unsupported estimator: {!r}'.format(estimator))

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        # Probability of the second class, used for the ROC AUC.
        predicted_prob = clf.predict_proba(test_x)
        predicted_prob_true = [p[1] for p in predicted_prob]

        estimator_scores = {}
        estimator_scores['Question'] = question
        estimator_scores['Accuracy'] = round(
            accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Balanced Accuracy'] = round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Precision'] = round(
            precision_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Recall'] = round(
            recall_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Specificity'] = specificity
        estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
        estimator_scores['ROC AUC'] = round(
            roc_auc_score(test_y, predicted_prob_true), 2)

        return cross_val_scores, estimator_scores
Beispiel #33
0
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """``samplers_``, ``estimators_`` and ``pipelines_`` must agree.

    For every member of the fitted forest, the stored sampler and the
    sampler step of the stored pipeline must resample identically, and the
    stored tree refitted on the resampled data must predict (labels and
    probabilities) exactly like the refitted pipeline.
    """
    X, y = imbalanced_dataset
    n_estimators = 10
    forest = BalancedRandomForestClassifier(random_state=0,
                                            n_estimators=n_estimators)
    forest.fit(X, y)

    for member in range(n_estimators):
        sampler = forest.samplers_[member]
        pipeline = forest.pipelines_[member]
        pipeline_sampler = pipeline.named_steps['randomundersampler']

        X_res, y_res = sampler.fit_resample(X, y)
        X_res_pipe, y_res_pipe = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_pipe)
        assert_array_equal(y_res, y_res_pipe)

        tree = forest.estimators_[member]

        # Hard predictions.
        tree_labels = tree.fit(X_res, y_res).predict(X)
        pipe_labels = pipeline.fit(X, y).predict(X)
        assert_array_equal(tree_labels, pipe_labels)

        # Class probabilities (both objects are refitted, as before).
        tree_proba = tree.fit(X_res, y_res).predict_proba(X)
        pipe_proba = pipeline.fit(X, y).predict_proba(X)
        assert_array_equal(tree_proba, pipe_proba)
Beispiel #34
0
def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the out-of-bag score of the balanced random forest.

    The forest is fitted on the first half of the data; its OOB score must
    be within 0.1 of the accuracy on the second half.  A forest with a
    single estimator must emit a ``UserWarning`` when fitted with
    ``oob_score=True``.
    """
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    # The two context managers must be separated by a comma.  Joining them
    # with `and` evaluates to a single operand, so only one of them was ever
    # entered.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
Beispiel #35
0
def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    """Check the error and warning paths of warm-start fitting.

    * Shrinking ``n_estimators`` on a warm-started forest must raise a
      ``ValueError``.
    * Refitting a warm-started forest without adding estimators must emit
      a ``UserWarning``.
    """
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*imbalanced_dataset)

    brf.set_params(warm_start=True, n_estimators=2)
    # `match=` checks the exception text.  The former `message=` keyword only
    # set a custom failure message and has been removed from pytest.
    with pytest.raises(ValueError, match="must be larger or equal to"):
        brf.fit(*imbalanced_dataset)

    brf.set_params(n_estimators=10)
    brf.fit(*imbalanced_dataset)

    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        brf.fit(*imbalanced_dataset)
# Report the scores of the balanced bagging predictions and draw their
# confusion matrix on the second axis.
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bbc),
              geometric_mean_score(y_test, y_pred_bbc)))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target),
                      ax=ax[1], title='Balanced bagging')

###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method, and it usually
# outperforms bagging. Here, we use a vanilla random forest and its balanced
# counterpart, in which each bootstrap sample is balanced.

rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0,
                                     n_jobs=-1)

rf.fit(X_train, y_train)
brf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

# Similarly to the previous experiment, the balanced classifier outperforms
# the classifier which learns from imbalanced bootstrap samples. In addition,
# random forest outperforms the bagging classifier.

print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
Beispiel #37
0
def test_balanced_random_forest_error(imbalanced_dataset, forest_params,
                                      err_msg):
    """Fitting with invalid ``forest_params`` must raise a ``ValueError``.

    Args:
        imbalanced_dataset: ``(X, y)`` pair unpacked into ``fit``.
        forest_params: keyword arguments for the classifier constructor.
        err_msg: pattern the exception text must match.
    """
    brf = BalancedRandomForestClassifier(**forest_params)
    # `match=` checks the exception text.  The former `message=` keyword only
    # set a custom failure message and has been removed from pytest.
    with pytest.raises(ValueError, match=err_msg):
        brf.fit(*imbalanced_dataset)
Beispiel #38
0
def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    """Fit a balanced random forest with random per-sample weights."""
    # Fixed seed so the generated weights are reproducible.
    rng = np.random.RandomState(42)
    X, y = imbalanced_dataset
    # One weight per sample, drawn from rng.rand.
    sample_weight = rng.rand(y.shape[0])
    brf = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    # The weights are passed positionally as the third argument of `fit`.
    brf.fit(X, y, sample_weight)