def do_CV_Voting(LS, cv=10):
    """Cross-validate a soft-voting ensemble on fingerprint features.

    Fingerprints are built from ``LS["SMILES"]``, duplicated fingerprint rows
    are dropped and the data is split 75/25. The voting ensemble is
    cross-validated on the 75% part; the fold estimator with the highest
    ROC AUC is then used to print a confusion matrix on the 25% part.

    Parameters
    ----------
    LS : pandas.DataFrame
        Learning set; the ``"SMILES"`` and ``"ACTIVE"`` columns are read.
    cv : int, default=10
        Forwarded as ``cv`` to ``cross_validate``.

    Returns
    -------
    None
        All results are printed.
    """
    nBits = 1250
    with measure_time("Creating fingerprint"):
        X_LS = create_fingerprints(LS["SMILES"].values, nBits=nBits)

    # Drop duplicated fingerprints.
    data = pd.DataFrame(X_LS).drop_duplicates()
    X_LS = data.values
    # ``data`` was built from a plain array, so its index holds row
    # *positions*. Select the matching labels by position: a label-based
    # ``.loc`` only works when ``LS`` happens to carry a default 0..n-1 index.
    # NOTE(review): assumes create_fingerprints returns one row per input
    # SMILES, in input order - confirm.
    y_LS = LS["ACTIVE"].iloc[data.index.values].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)

    pipeline_1 = make_pipeline(
        ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
        BalancedRandomForestClassifier(n_estimators=600, random_state=18,
                                       n_jobs=-1))
    pipeline_2 = make_pipeline(
        ADASYN(random_state=64, n_jobs=-1),
        BalancedRandomForestClassifier(n_estimators=600, random_state=24,
                                       n_jobs=-1))
    BRF = BalancedRandomForestClassifier(n_estimators=100, random_state=18,
                                         n_jobs=-1)
    BGC = make_pipeline(
        ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
        BalancedBaggingClassifier(
            estimator=DecisionTreeClassifier(max_features="log2"),
            n_estimators=50))

    # Soft voting; the first pipeline counts three times as much as the others.
    votingModel = VotingClassifier(
        estimators=[('pip1', pipeline_1), ('pip2', pipeline_2),
                    ('BRF', BRF), ('BGC', BGC)],
        voting='soft', weights=[3, 1, 1, 1], n_jobs=-1)

    scores = cross_validate(votingModel, X_train, y_train, cv=cv,
                            scoring=('roc_auc', 'average_precision'),
                            return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())

    # Evaluate the best fold estimator (by ROC AUC) on the hold-out split.
    model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Check that samplers_, estimators_ and pipelines_ describe the same trees."""
    X, y = imbalanced_dataset
    n_estimators = 10
    forest = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0
    )
    forest.fit(X, y)

    for idx in range(n_estimators):
        pipeline = forest.pipelines_[idx]

        # The stand-alone sampler and the pipeline's sampler must resample
        # the data identically.
        X_res, y_res = forest.samplers_[idx].fit_resample(X, y)
        pipeline_sampler = pipeline.named_steps["randomundersampler"]
        X_res_2, y_res_2 = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        # Refitting the tree on the resampled data must match refitting the
        # whole pipeline on the raw data, for labels and for probabilities.
        for method in ("predict", "predict_proba"):
            tree = forest.estimators_[idx].fit(X_res, y_res)
            from_tree = getattr(tree, method)(X)
            from_pipeline = getattr(forest.pipelines_[idx].fit(X, y), method)(X)
            assert_array_equal(from_tree, from_pipeline)
def random_forest(df, drop, target, show, model_name):
    """Fit a balanced random forest and return its balanced accuracy in percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both features and the outcome column.
    drop : container
        Column names excluded from the feature set.
    target : str
        Name of the outcome column.
    show : bool
        When equal to ``True`` the sorted feature importances are printed.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    float
        ``balanced_accuracy_score`` on the test split, multiplied by 100.
    """
    # Separate the feature columns from the outcome column.
    feature_names = [column for column in df.columns if column not in drop]
    features = df[feature_names]
    outcome = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        features, outcome, random_state=1)

    forest = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # (importance, column) pairs, most important first.
    ranked = sorted(zip(forest.feature_importances_, features.columns.tolist()))
    ranked.reverse()

    score = balanced_accuracy_score(y_test, predictions)

    if show == True:  # noqa: E712 - equality test kept as in the original
        print(f"Feature Importance: {model_name}")
        for entry in ranked:
            print(entry)
        print("\n")

    return score * 100
def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
    """Fit a default balanced random forest on the ``"has_claim"`` split.

    Only the training part returned by ``cls.get_X_Y_split`` is used; the
    test part is discarded here.
    """
    split = cls.get_X_Y_split(preprocessed_df, "has_claim")
    X_train, _X_test, Y_train, _Y_test = split

    classifier = BalancedRandomForestClassifier()
    classifier.fit(X_train, Y_train)
    return classifier
def main():
    """Main entrance.

    Splits the challenges, loads the X / y JSON parts 1..162, then runs two
    10-fold experiments and writes per-fold class probabilities and feature
    importances as JSON: a balanced random forest (results under ``brf``)
    and a plain random forest trained on randomly under-sampled data
    (results under ``rus``).
    """
    print('Spliting challenges')
    split_challenges()

    part_ids = range(1, 163)

    print('Reading X...')
    x_parts = [pd.read_json(XY_PATH['X'].format(part), orient='records')
               for part in part_ids]
    X = pd.concat(x_parts).set_index(['l0', 'l1'])

    print('Reading y...')
    y_parts = [pd.read_json(XY_PATH['y'].format(part), orient='records')
               for part in part_ids]
    y = pd.concat(y_parts).set_index(['l0', 'l1'])

    print('\nTraining Inner sampler RFC')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        forest = BalancedRandomForestClassifier(n_estimators=100,
                                                random_state=0)
        forest.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        # Probabilities are stored with the index of the test labels.
        probabilities = pd.DataFrame(forest.predict_proba(X_test.to_numpy()),
                                     index=y_test.index)
        probabilities.reset_index().to_json(
            os.path.join(RESULT_PATH, 'brf', f'y_prob_{fold}.json'),
            orient='records')

        importances = pd.Series(forest.feature_importances_)
        importances.to_json(
            os.path.join(RESULT_PATH, 'brf', f'feature_importance_{fold}.json'))

    print('\nTraining RandomUnderSampler')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        forest = RandomForestClassifier(n_estimators=100, random_state=0)
        under_sampler = RandomUnderSampler(random_state=0)
        X_resample, y_resample = under_sampler.fit_resample(
            X_train.to_numpy(), y_train.to_numpy().ravel())
        forest.fit(X_resample, y_resample)

        probabilities = pd.DataFrame(forest.predict_proba(X_test.to_numpy()),
                                     index=y_test.index)
        probabilities.reset_index().to_json(
            os.path.join(RESULT_PATH, 'rus', f'y_prob_{fold}.json'),
            orient='records')

        importances = pd.Series(forest.feature_importances_)
        importances.to_json(
            os.path.join(RESULT_PATH, 'rus', f'feature_importance_{fold}.json'))
def get_classifier(n_subj, random_state, n_jobs_rf=1, multiclass=False):
    """Build an unfitted balanced random forest with fixed per-class sample sizes.

    Parameters
    ----------
    n_subj : int
        Number of subjects the per-class subsample size is derived from.
    random_state : int or RandomState
        Forwarded to the classifier.
    n_jobs_rf : int, default=1
        Forwarded as ``n_jobs``.
    multiclass : bool, default=False
        If true, four classes (labels 0-3) are sampled; otherwise two
        classes (labels 0 and 1).

    Returns
    -------
    BalancedRandomForestClassifier
        Configured with 1000 trees, no bootstrap and no replacement.
    """
    if multiclass:
        n_classes = 4
        # Multiplication with 0.9 is required to make the subject number agree
        # with the training set AND because one of the classes has only very
        # few subjects, such that we can't reasonably sample more than 100.
        subsample_size = round(n_subj * 0.9 * 0.5 / 4)
    else:
        n_classes = 2
        subsample_size = round(n_subj * 0.632 / 2)

    # Every class label gets the same number of samples per tree.
    samples_per_class = {label: subsample_size for label in range(n_classes)}

    return BalancedRandomForestClassifier(
        n_estimators=1000,
        class_weight='balanced',
        oob_score=False,
        sampling_strategy=samples_per_class,
        n_jobs=n_jobs_rf,
        random_state=random_state,
        bootstrap=False,
        replacement=False,
    )
def predict_model_kfold(name, path, features_type, label_name, data):
    """Evaluate a balanced random forest with shuffled 10-fold CV, then refit.

    The per-fold evaluation vectors returned by ``create_all_eval_results``
    are averaged and appended as a new row to the module-level
    ``results_all_projects`` table. The model is finally fitted on all data.

    Parameters
    ----------
    name : str
        Project name written to the results row.
    path
        Unused in this function.
    features_type : str
        Feature-set label written to the results row.
    label_name
        Unused in this function; the label column is hard-coded to
        ``'hasBug'``.
    data : pandas.DataFrame
        Features plus the ``'hasBug'`` column.

    Returns
    -------
    BalancedRandomForestClassifier
        The model fitted on the complete data set.
    """
    global results_all_projects

    # ``shuffle`` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases, so ``KFold(10, True)`` raises a TypeError there.
    kfold = KFold(n_splits=10, shuffle=True)
    # RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000, max_depth=5)

    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)

    index = 0
    all_predictions = 0
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train],
                                     y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(
            False, y.iloc[test], prediction_train, name, "training",
            features_type, num_of_bugs, num_of_all_instances, bug_precent,
            None)
    # Average the accumulated evaluation values over the folds.
    all_predictions /= index

    start_list = [name, "training", features_type, "sklearn - python"]
    result_list = start_list + all_predictions.tolist()
    results_all_projects.loc[len(results_all_projects)] = result_list

    # Final model trained on every sample.
    model.fit(x, y)
    return model
def __init__(self, taxonomy_dictionary, non_used_features=None):
    """Create the four random forests of the hierarchical classifier.

    Parameters
    ----------
    taxonomy_dictionary : dict
        Taxonomy mapping; stored and also inverted with
        ``invert_dictionary``.
    non_used_features : optional
        Forwarded to ``FeaturePreprocessor``.
    """
    n_trees = 500

    # 'sqrt' is what 'auto' meant for forest classifiers. The 'auto' alias
    # was deprecated and later removed from scikit-learn, so spelling it out
    # keeps the same behaviour on old releases and works on new ones.
    self.top_classifier = RandomForestClassifier(
        n_estimators=n_trees, max_depth=None, max_features='sqrt')
    self.stochastic_classifier = RandomForestClassifier(
        n_estimators=n_trees, max_depth=None, max_features=0.2)
    self.periodic_classifier = RandomForestClassifier(
        n_estimators=n_trees, max_depth=None, max_features='sqrt')
    self.transient_classifier = RandomForestClassifier(
        n_estimators=n_trees, max_depth=None, max_features='sqrt')

    self.feature_preprocessor = FeaturePreprocessor(
        non_used_features=non_used_features)

    self.taxonomy_dictionary = taxonomy_dictionary
    self.feature_list = None
    self.inverted_dictionary = invert_dictionary(self.taxonomy_dictionary)

    # File names of the pickled artefacts.
    self.pickles = {
        "features_list": "features_RF_model.pkl",
        "top_rf": "hierarchical_level_RF_model.pkl",
        "periodic_rf": "periodic_level_RF_model.pkl",
        "stochastic_rf": "stochastic_level_RF_model.pkl",
        "transient_rf": "transient_level_RF_model.pkl"
    }
    self.url_model = f"https://assets.alerce.online/pipeline/hierarchical_rf_{self.MODEL_VERSION}/"
def make_model(self, config=None):
    """Create a fresh, unfitted model according to the instance settings.

    The estimator is chosen from ``self.class_`` (``'RF'``, ``'lin'`` or
    ``'svm'``), ``self.type_`` (``'reg'`` or ``'cls'``) and ``self.balanced``
    (``'balanced'`` or ``None``). Other combinations leave ``self.model``
    untouched.

    :param config: model parameters; when given they are stored on
        ``self.config`` and passed to the estimator instead of the defaults
    :return: self.model
    :raises AssertionError: if ``class_ == 'svm'`` and ``type_ != 'cls'``
    """
    has_config = config is not None
    if has_config:
        self.config = config

    print('Creating fresh model...')

    balanced = self.balanced
    balanced_or_plain = balanced == 'balanced' or balanced is None

    if self.class_ == 'RF':
        estimator_cls = None
        if self.type_ == 'reg':
            if balanced == 'balanced':
                print('WARNING: balanced regressor not applicable')
            if balanced_or_plain:
                estimator_cls = RandomForestRegressor
        elif self.type_ == 'cls':
            if balanced == 'balanced':
                estimator_cls = BalancedRandomForestClassifier
            elif balanced is None:
                estimator_cls = RandomForestClassifier
        if estimator_cls is not None:
            # Without an explicit config only the seed is fixed.
            self.model = (estimator_cls(**config) if has_config
                          else estimator_cls(random_state=self.seed))

    elif self.class_ == 'lin':
        if self.type_ == 'reg':
            if balanced == 'balanced':
                print('WARNING: balanced regressor not applicable')
            if balanced_or_plain:
                self.model = (LinearRegression(**config) if has_config
                              else LinearRegression())
        elif self.type_ == 'cls':
            if balanced_or_plain:
                self.model = (LogisticRegression(**config) if has_config
                              else LogisticRegression())
                # 'balanced' or None; this overrides any class_weight that
                # came in through config.
                self.model.class_weight = balanced

    elif self.class_ == 'svm':
        # The message is passed as a string: the former ``print(...)``
        # expression evaluated to None, so the AssertionError carried no text.
        assert self.type_ == 'cls', (
            'If using SVM, make sure you have a classification problem. '
            'i.e. set type_="cls"')
        self.model = SVC(**config) if has_config else SVC(kernel='rbf')

    print('Created: ', self.model)
    return self.model
def test_balanced_random_forest(imbalanced_dataset):
    """A fitted forest exposes one sampler, tree and pipeline per estimator."""
    n_trees = 10
    forest = BalancedRandomForestClassifier(n_estimators=n_trees,
                                            random_state=0)
    forest.fit(*imbalanced_dataset)

    for fitted_attribute in ("samplers_", "estimators_", "pipelines_"):
        assert len(getattr(forest, fitted_attribute)) == n_trees

    # One importance value per input feature.
    n_features = imbalanced_dataset[0].shape[1]
    assert len(forest.feature_importances_) == n_features
def evaluate(X_train, y_train, X_test, y_test):
    """Return confusion matrices for the most and second most likely class.

    A 500-tree balanced random forest, seeded with the module-level ``seed``,
    is fitted on the training data. For every test sample the classes are
    ranked by predicted probability.

    Returns
    -------
    tuple of ndarray
        ``(cm_top1, cm_top2)``: confusion matrix of ``y_test`` against the
        most likely class and against the second most likely class.
    """
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf.fit(X_train, y_train)

    # argsort yields *column indices* of predict_proba, ascending by
    # probability. Columns follow ``clf.classes_``, so translate the indices
    # into class labels before comparing them with ``y_test``; the bare
    # indices are only correct when the labels happen to be 0..k-1.
    ranking = clf.predict_proba(X_test).argsort(axis=1)
    y_pred1 = clf.classes_[ranking[:, -1]]
    y_pred2 = clf.classes_[ranking[:, -2]]

    return (metrics.confusion_matrix(y_test, y_pred1),
            metrics.confusion_matrix(y_test, y_pred2))
def test_balanced_random_forest_oob(imbalanced_dataset):
    """The OOB score is close to the test score; too few trees warn."""
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                        stratify=y)
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    # Both context managers must be entered. ``a and b`` evaluates to ``b``
    # only, so the former ``pytest.warns(...) and np.errstate(...)`` never
    # activated the warning check.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
def _plot_championship_importance(all_res, save_directory, top = 6):
    """Plot box plots of feature importances for predicting champions.

    For every team and season the ``off_norm`` / ``def_norm`` values of the
    ``top`` rows with the largest ``TIME`` form one sample, labelled 1 if the
    team is in that season's champion entry. Balanced random forests of
    several sizes are fitted, each time leaving out one champion sample, and
    the collected feature importances are drawn as one box per feature.

    Nothing is done if the output file already exists.

    Parameters
    ----------
    all_res : mapping
        ``all_res[season]`` is indexed at 0 (team data frame) and 2
        (champion container).
    save_directory : str
        Prefix the file name ``championship_importance.png`` is appended to.
    top : int, default=6
        Number of rows taken per team.
    """
    save_file = save_directory + 'championship_importance.png'
    if os.path.exists(save_file):
        return

    xs = []
    ys = []
    for season in all_res:
        team_df = all_res[season][0]
        champion = all_res[season][2]
        for team, g in team_df.groupby('TEAM'):
            x = g.nlargest(top, 'TIME')[['off_norm', 'def_norm']].unstack().values
            xs.append(x)
            ys.append(1 if team in champion else 0)

    xs = np.vstack(xs)
    ys = np.array(ys)

    fts = []
    row_ids = np.arange(len(xs))
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):
        for i in np.where(ys == 1)[0]:
            # Leave champion sample ``i`` out; a boolean mask avoids
            # rebuilding two Python index lists on every iteration.
            keep = row_ids != i
            rfr = BalancedRandomForestClassifier(n_estimators=ntree)
            rfr.fit(xs[keep], ys[keep])
            fts.append(rfr.feature_importances_)
    fts = np.vstack(fts)

    feature_names = (['off' + str(i + 1) for i in range(top)]
                     + ['def' + str(i + 1) for i in range(top)])

    fig, ax = plt.subplots(figsize=(8, 6))
    for i in range(len(feature_names)):
        ax.boxplot(fts[:, i], positions=[i])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')
    plt.savefig(save_file)
    plt.close()
def evaluate_model(self):
    """Score the stored random-forest parameters with repeated stratified CV.

    Parameters are read from ``param_RF_<epoch>.json`` in
    ``self.result_folder``. The model is evaluated with 20 repetitions of
    shuffled, stratified 5-fold cross-validation on ``self.X`` / ``self.y``;
    missing values are replaced by the per-column median.

    Returns
    -------
    list of float
        ROC AUC of every fold (20 x 5 values).
    """
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)

    # NOTE(review): the nesting of the original was not recoverable from the
    # source. This keeps the reading in which the last entry of the JSON
    # list defines the model - confirm against the intended behaviour.
    for data in dati:
        del data['value']
        rf_model = BalancedRandomForestClassifier(**data)

    rf_auc = []
    for i in tqdm(range(20)):
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i + 187462)
        for train_index, test_index in cv.split(self.X, self.y):
            trainX = self.X.iloc[train_index]
            testX = self.X.iloc[test_index]
            trainy = np.take(self.y, train_index)
            testy = np.take(self.y, test_index)

            # Learn the medians on the training fold only and reuse them for
            # the test fold; refitting on the test fold leaks test statistics
            # into the evaluation. ``np.nan`` replaces the ``np.NaN`` alias,
            # which no longer exists in NumPy 2.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            vtrainX = median_imputer.fit_transform(trainX)
            vtestX = median_imputer.transform(testX)
            trainX = pd.DataFrame(vtrainX, columns=trainX.columns,
                                  index=trainX.index)
            testX = pd.DataFrame(vtestX, columns=testX.columns,
                                 index=testX.index)

            # Compute the AUC for the stored best parameters (the original
            # comment mentioned CatBoost; the model here is a balanced
            # random forest).
            rf_model.fit(trainX, trainy)
            roc_rf = roc_auc_score(testy, rf_model.predict_proba(testX)[:, 1])
            rf_auc.append(roc_rf)
            print(roc_rf)

    print(statistics.mean(rf_auc))
    return rf_auc
def test_balanced_random_forest_oob_binomial(ratio):
    # Regression test for #655: check that the OOB score is close to 0.5 for
    # a binomial experiment, i.e. labels drawn independently of the feature.
    n_samples = 1000
    rng = np.random.RandomState(42)
    features = np.arange(n_samples).reshape(-1, 1)
    labels = rng.binomial(1, ratio, size=n_samples)

    forest = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    forest.fit(features, labels)

    deviation_from_chance = np.abs(forest.oob_score_ - 0.5)
    assert deviation_from_chance < 0.1
class BaselineRandomForest(BaseClassifier):
    """Single random forest trained on preprocessed features."""

    def __init__(self):
        # 'sqrt' is what 'auto' meant for forest classifiers. The 'auto'
        # alias was deprecated and later removed from scikit-learn.
        self.random_forest_classifier = RandomForestClassifier(
            n_estimators=500,
            max_features='sqrt',
            max_depth=None,
            n_jobs=1,
            class_weight=None,
            criterion='entropy',
            min_samples_split=2,
            min_samples_leaf=1)
        self.feature_preprocessor = FeaturePreprocessor()
        # Feature columns seen during fit; set by fit() or load_model().
        self.feature_list = None
        self.model_filename = 'baseline_rf.pkl'

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame):
        """Preprocess ``samples``, align them with ``labels`` and fit the forest.

        The ``'classALeRCE'`` column of ``labels`` is used as target.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)

        # Intersect samples and labels.
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        self.feature_list = samples.columns
        samples_np_array = samples.values
        labels_np_array = labels['classALeRCE'].loc[samples.index].values
        self.random_forest_classifier.fit(samples_np_array, labels_np_array)

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        """Return class probabilities, one row per sample and column per class.

        The returned frame is indexed like ``samples`` and its index is
        named ``'oid'``.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        # Use the training columns, in training order.
        samples_np_array = samples[self.feature_list].values
        predicted_probs = self.random_forest_classifier.predict_proba(
            samples_np_array)
        predicted_probs_df = pd.DataFrame(
            predicted_probs,
            columns=self.get_list_of_classes(),
            index=samples.index.values)
        predicted_probs_df.index.name = 'oid'
        return predicted_probs_df

    def get_list_of_classes(self) -> list:
        """Return the ``classes_`` attribute of the fitted forest."""
        return self.random_forest_classifier.classes_

    def save_model(self, directory: str) -> None:
        """Pickle the forest and the feature list into ``directory``."""
        with open(os.path.join(directory, self.model_filename), 'wb') as f:
            pickle.dump(self.random_forest_classifier, f,
                        pickle.HIGHEST_PROTOCOL)

        with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        """Load the forest and the feature list written by ``save_model``.

        NOTE(review): unpickling executes arbitrary code; only load model
        directories from trusted sources.
        """
        rf = pd.read_pickle(os.path.join(directory, self.model_filename))
        self.random_forest_classifier = rf
        self.feature_list = pd.read_pickle(
            os.path.join(directory, 'feature_list.pkl'))
def test_little_tree_with_small_max_samples():
    """A tree restricted to two samples must be smaller than an unrestricted one."""
    rng = np.random.RandomState(1)

    X = rng.randn(10000, 2)
    y = rng.randn(10000) > 0

    # Both single-tree forests share the same random state object.
    shared_params = dict(n_estimators=1, random_state=rng)
    # First forest: no restriction on max samples.
    unrestricted = BalancedRandomForestClassifier(max_samples=None,
                                                  **shared_params)
    # Second forest: max samples restricted to just 2.
    restricted = BalancedRandomForestClassifier(max_samples=2, **shared_params)

    unrestricted.fit(X, y)
    restricted.fit(X, y)

    nodes_unrestricted = unrestricted.estimators_[0].tree_.node_count
    nodes_restricted = restricted.estimators_[0].tree_.node_count

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert nodes_unrestricted > nodes_restricted, msg
def __init__(self, max_depth=None, n_features=10, selector=ranksum, trend="both", space_mask=None):
    """Store the settings and create the underlying balanced random forest.

    The forest uses 100 trees, the given ``max_depth`` and a fixed
    ``random_state`` of 777. All other arguments are stored unchanged.
    """
    # Plain settings, kept as given.
    self.max_depth = max_depth
    self.n_features = n_features
    self.selector = selector
    self.trend = trend
    self.space_mask = space_mask

    # Underlying estimator.
    self.model_ = BalancedRandomForestClassifier(
        max_depth=max_depth,
        n_estimators=100,
        random_state=777,
    )
def __init__(self, iterations=1, transform_first=False, untrained_model=BalancedRandomForestClassifier(random_state=42,n_jobs=40), max_train_test_samples=100, mode_interaction_extract='knee', include_self_interactions=False, penalty=3, pelt_model='l2', no_changepoint_strategy='median'):
    """Build a pipeline alternating interaction and SAFE transformers.

    For each of the ``iterations`` rounds an ``InteractionTransformer`` step
    (named ``interaction<i>``) is followed by a ``SafeTransformer`` step
    (named ``transformer<i>``). Every step gets its own deep copy of
    ``untrained_model``. ``transform_first`` is not used here.

    Reference:
    https://github.com/ModelOriented/SAFE/blob/master/SafeTransformer/SafeTransformer.py
    """
    steps = []
    for round_no in range(iterations):
        interaction_step = InteractionTransformer(
            copy.deepcopy(untrained_model),
            max_train_test_samples,
            mode_interaction_extract,
            include_self_interactions,
        )
        safe_step = SafeTransformer(
            penalty=penalty,
            model=copy.deepcopy(untrained_model),
            pelt_model=pelt_model,
            no_changepoint_strategy=no_changepoint_strategy,
        )
        steps.append(['interaction{}'.format(round_no), interaction_step])
        steps.append(['transformer{}'.format(round_no), safe_step])

    self.pipeline = Pipeline(steps)
def get_balanced_models():
    """Return the candidate models as a list of ``(name, estimator)`` pairs.

    Reads the module-level names ``X`` (for the sample count) and
    ``weights``. MultinomialNB and GaussianProcessClassifier were disabled
    in the original list and stay excluded.
    """
    models = [
        # Logistic regression with balanced class weights
        ('LR_Bal', LogisticRegression(solver='lbfgs', class_weight='balanced')),
        # Linear discriminant analysis
        ('LDA', LinearDiscriminantAnalysis()),
        # k-nearest neighbours
        ('KNN', KNeighborsClassifier()),
        # Gaussian naive Bayes
        ('NB', GaussianNB()),
    ]

    # NOTE(review): the source layout did not show how far this condition
    # reaches; it is read here as guarding both SVM variants - confirm.
    if X.shape[0] < 100000:
        models += [
            # SVM with balanced class weights
            ('SVM_Bal', SVC(gamma='scale', class_weight='balanced')),
            # SVM with explicit class weights
            ('SVM_W', SVC(gamma='scale', class_weight=weights)),
        ]

    models += [
        # Balanced random forest
        ('Bal_RF', BalancedRandomForestClassifier(n_estimators=1000)),
        # Random forest with balanced class weights
        ('RF_Bal', RandomForestClassifier(n_estimators=1000,
                                          class_weight='balanced')),
        # Decision tree with balanced class weights
        ('DT_Bal', DecisionTreeClassifier(class_weight='balanced')),
        # Bagging
        ('BAG', BaggingClassifier(n_estimators=1000)),
        # XGBoost with positive-class weighting
        ('XGB_W', XGBClassifier(scale_pos_weight=weights)),
    ]
    return models
def apply_ml_model(X_train_input, y_train_input, X_test_input, y_test_input):
    """Fit four classifiers and print their evaluation on the test data.

    For logistic regression, random forest, decision tree and balanced
    random forest the function prints the confusion matrix, the balanced
    accuracy and the imbalanced classification report.

    The printed confusion-matrix frame is labelled for two classes.

    Returns
    -------
    None
    """
    # Factories keep estimator construction lazy and replace the if/elif chain.
    model_factories = {
        'LREG': lambda: LogisticRegression(solver='lbfgs', max_iter=500,
                                           random_state=78),
        'RFC': lambda: RandomForestClassifier(n_estimators=128,
                                              random_state=78),
        'Tree': lambda: tree.DecisionTreeClassifier(random_state=78),
        'Balanced RFC': lambda: BalancedRandomForestClassifier(
            n_estimators=128, random_state=78),
    }
    # NOTE(review): collected but neither returned nor read, as in the
    # original.
    scores = []

    for model, build in model_factories.items():
        model_select = build()
        model_select.fit(X_train_input, y_train_input)
        y_pred = model_select.predict(X_test_input)

        cm = confusion_matrix(y_test_input, y_pred)

        # Score against the labels passed in. The original referenced an
        # unrelated name ``y_test`` here, which is either undefined or a
        # different data set.
        acc_score = balanced_accuracy_score(y_test_input, y_pred)
        scores.append(acc_score)

        print(f"Model: {model}")

        # Displaying results
        print("Confusion Matrix")
        cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"],
            columns=["Predicted 0", "Predicted 1"])
        print(cm_df)
        print(f"Accuracy Score : {acc_score}\n")
        print("Classification Report")
        print(classification_report_imbalanced(y_test_input, y_pred))
def fourth_test(X_train, y_train, X_test, y_test): print("Test with BalancedRandomForestClassifier or BalancedBaggingClassifier\n") print("BalancedRandomForestClassifier") scores = cross_validate(BalancedRandomForestClassifier(max_depth=None, n_estimators=500, random_state=0, n_jobs=2, max_features='log2', oob_score=False), X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'), return_estimator=True) print(scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()) log_model = scores['estimator'][np.argmax(scores['test_roc_auc'])] y_log_pred = log_model.predict(X_test) conf_mat = confusion_matrix(y_true=y_test, y_pred=y_log_pred) print("confusion_matrix:\n", conf_mat) print() print("BalancedBaggingClassifier") tree = DecisionTreeClassifier(max_features='auto') resample_bagging = BalancedBaggingClassifier( base_estimator=tree, n_estimators=100, random_state=0, n_jobs=2, oob_score=True) scores = cross_validate(resample_bagging, X_train, y_train, cv=10, scoring=( 'roc_auc', 'average_precision'), return_estimator=True) print(scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()) rf_model = scores['estimator'][np.argmax(scores['test_roc_auc'])] y_rf_pred = rf_model.predict(X_test) conf_mat = confusion_matrix(y_true=y_test, y_pred=y_rf_pred) print("confusion_matrix:\n", conf_mat) """
def do_CV_grid(LS, cv=10):
    """Grid-search an ADASYN + balanced random forest pipeline.

    Fingerprints are built from ``LS["SMILES"]``, duplicated fingerprint rows
    are dropped and the data is split 75/25. The grid search (scored by
    ROC AUC) runs on the 75% part; the refitted best pipeline is evaluated
    on the 25% part. All results are printed.

    Parameters
    ----------
    LS : pandas.DataFrame
        Learning set; the ``"SMILES"`` and ``"ACTIVE"`` columns are read.
    cv : int, default=10
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    None
    """
    nBits = 1250
    with measure_time("Creating fingerprint"):
        X_LS = create_fingerprints(LS["SMILES"].values, nBits=nBits)

    # Drop duplicated fingerprints.
    data = pd.DataFrame(X_LS).drop_duplicates()
    X_LS = data.values
    # ``data`` was built from a plain array, so its index holds row
    # *positions*. Select the matching labels by position: a label-based
    # ``.loc`` only works when ``LS`` happens to carry a default 0..n-1 index.
    # NOTE(review): assumes create_fingerprints returns one row per input
    # SMILES, in input order - confirm.
    y_LS = LS["ACTIVE"].iloc[data.index.values].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)

    pipeline = Pipeline([
        ('ada', ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1)),
        ('BRF', BalancedRandomForestClassifier(n_estimators=500,
                                               random_state=18, n_jobs=-1,
                                               bootstrap=False)),
    ])

    param = {
        'BRF__n_estimators': [500],
        'BRF__max_features': [None, 'log2'],
    }

    # Honour the ``cv`` argument; it used to be ignored in favour of a
    # hard-coded 10 (still the default).
    clf = GridSearchCV(pipeline, param, scoring='roc_auc', n_jobs=2, cv=cv)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    print()

    y_pred = clf.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
    print("Classification report")
    print(classification_report(y_true=y_test, y_pred=y_pred))
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    """Smoke test: the forest can be tuned with GridSearchCV."""
    param_grid = {"n_estimators": (1, 2), "max_depth": (1, 2)}
    search = GridSearchCV(BalancedRandomForestClassifier(), param_grid, cv=3)
    search.fit(*imbalanced_dataset)
def test_balanced_random_forest_pruning(imbalanced_dataset):
    """Cost-complexity pruning must shrink the first tree of the forest."""

    def first_tree_node_count(**forest_params):
        # Fit a forest with the given parameters and size up its first tree.
        forest = BalancedRandomForestClassifier(**forest_params)
        forest.fit(*imbalanced_dataset)
        return forest.estimators_[0].tree_.node_count

    n_nodes_no_pruning = first_tree_node_count()
    n_nodes_pruning = first_tree_node_count(ccp_alpha=0.015)

    assert n_nodes_no_pruning > n_nodes_pruning
def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    """Compare a plain, an oversampled and a balanced random forest.

    Three 50-tree forests are trained: a plain one on the raw training data,
    a plain one on the resampled data (``X_train_res`` / ``y_train_res``) and
    a balanced random forest on the raw training data. For each, the
    percentage of correctly predicted samples in row 1 of the confusion
    matrix is printed, followed by two bar charts (feature importances of
    the balanced forest and the three percentages).

    Returns
    -------
    tuple of float
        ``(without, within)``: the percentage for the plain forest and for
        the balanced forest.
    """

    def class_one_hit_rate(cm):
        # Share of row 1 (true class 1) that was predicted as class 1, in %.
        return 100*cm[1,1]/(cm[1,0]+cm[1,1])

    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    cnf_matrix_tra = confusion_matrix(y_test, rf.predict(X_test))
    without = class_one_hit_rate(cnf_matrix_tra)
    print("Random Forest (niezbalansowany): {}%".format(without))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    cnf_matrix_tra = confusion_matrix(y_test, rf_oversampling.predict(X_test))
    with_oversampling = class_one_hit_rate(cnf_matrix_tra)
    # Report the oversampling result itself; the original printed the value
    # of the plain forest (``without``) here by mistake.
    print("Random Forest (z oversamplingiem): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0,
                                         n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    cnf_matrix_tra = confusion_matrix(y_test, brf.predict(X_test))
    within = class_one_hit_rate(cnf_matrix_tra)
    print("Random Forest (zbalansowany - undersampling): {}%".format(within))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])
    print(brf.feature_importances_)

    # Bar chart: feature importances of the balanced forest, in percent.
    objects = ('country','gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_*100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent zależności')
    plt.title('Zależność poszczególnych atrybutów')
    plt.show()

    # Bar chart: the three percentages side by side.
    objects = ('Random Forest niezbalansowany','Random Forest z oversamplingiem', 'Random Forest zbalansowany')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Random Forest')
    plt.show()
    return without, within
def objective(trial):
    """Optuna objective: validation ROC AUC of a balanced random forest.

    ``self.X`` / ``self.y`` (taken from the enclosing scope) are split 80/20
    into training and validation parts. Missing values are replaced by
    per-column medians. Hyper-parameters are sampled from ``trial``. The
    accuracy, confusion matrix and ROC AUC on ``self.X_validation`` /
    ``self.y_validation`` are printed or plotted for information.

    Returns
    -------
    float
        ROC AUC on the 20% validation split.
    """
    train_X, val_X, train_y, val_y = train_test_split(self.X, self.y,
                                                      test_size=0.2)

    # Learn the medians on the training split only and reuse them for the
    # validation and hold-out data; refitting on those sets leaks their
    # statistics into the score. ``np.nan`` replaces the ``np.NaN`` alias,
    # which no longer exists in NumPy 2.
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    v_train_X = median_imputer.fit_transform(train_X)
    v_val_X = median_imputer.transform(val_X)
    v_test_X = median_imputer.transform(self.X_validation)

    train_X = pd.DataFrame(v_train_X, columns=train_X.columns,
                           index=train_X.index)
    val_X = pd.DataFrame(v_val_X, columns=val_X.columns, index=val_X.index)
    test_X = pd.DataFrame(v_test_X, columns=self.X_validation.columns,
                          index=self.X_validation.index)

    # NOTE(review): `suggest_uniform` is deprecated in Optuna 3 in favour of
    # `suggest_float`; kept to stay compatible with the installed version.
    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    brf_n_estimators = trial.suggest_categorical('n_estimators', list_trees)
    brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
    brf_min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
    brf_min_weight_fraction_leaf = trial.suggest_uniform(
        'min_weight_fraction_leaf', 0, 0.5)
    brf_max_depth = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(
        n_estimators=brf_n_estimators,
        max_features=brf_max_features,
        min_samples_split=brf_min_samples_split,
        min_samples_leaf=brf_min_samples_leaf,
        max_depth=brf_max_depth,
        min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
        bootstrap=True)
    brfmodel.fit(train_X, train_y)

    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    aucbrf_test = roc_auc_score(self.y_validation,
                                brfmodel.predict_proba(test_X)[:, 1])
    print('Accuracy test ' + str(
        accuracy_score(self.y_validation, brfmodel.predict(test_X))))

    # NOTE(review): `plot_confusion_matrix` was removed in scikit-learn 1.2
    # (replaced by ConfusionMatrixDisplay); left as is because the
    # replacement would need a new import.
    plt.figure()
    plot_confusion_matrix(brfmodel, test_X, self.y_validation,
                          cmap=plt.cm.Blues, normalize=None)
    plt.show()
    print(aucbrf_test)

    return aucbrf
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    """Smoke test: the forest can be tuned with GridSearchCV."""
    brf = BalancedRandomForestClassifier()
    # The ``iid`` argument was removed from GridSearchCV; passing it raises a
    # TypeError on current scikit-learn. This test asserts nothing about the
    # scores, so dropping it does not change what is checked.
    grid = GridSearchCV(brf, {
        'n_estimators': (1, 2),
        'max_depth': (1, 2)
    }, cv=3)
    grid.fit(*imbalanced_dataset)
def model_checking(self):
    """Compare several imbalance-aware pipelines on ``self.df``.

    Features are ``self.df[self.features]`` and the target is
    ``self.df[self.target]``. After a stratified 75/25 split every pipeline
    is cross-validated (5 stratified folds, precision) on the training part,
    refitted, and scored on the test part. Results are printed.
    """
    X = self.df[self.features]
    Y = self.df[self.target]

    def plain_xgb():
        return XGBClassifier(n_estimators=1000, reg_alpha=1)

    def weighted_xgb():
        return XGBClassifier(n_estimators=1000, scale_pos_weight=3,
                             reg_alpha=1)

    # Two of these pipelines originally carried a commented-out RFE step,
    # which stays disabled.
    pipelines = [
        Pipeline(steps=[('classifier',
                         BalancedRandomForestClassifier(n_estimators=200))]),
        Pipeline(steps=[('classifier',
                         BalancedBaggingClassifier(n_estimators=200))]),
        Pipeline(steps=[('rfe', SMOTE()), ('classifier', plain_xgb())]),
        Pipeline(steps=[('rfe', BorderlineSMOTE()),
                        ('classifier', plain_xgb())]),
        Pipeline(steps=[('classifier', weighted_xgb())]),
        Pipeline(steps=[('rfe', RFE(XGBClassifier())),
                        ('classifier', weighted_xgb())]),
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.25, stratify=Y)

    for candidate in pipelines:
        fold_scores = cross_val_score(candidate, X_train.values, y_train,
                                      scoring='precision',
                                      cv=StratifiedKFold(5))
        print("cross val scores")
        print(sum(fold_scores) / 5)

        candidate.fit(X_train.values, y_train.values)
        y_pred = candidate.predict(X_test.values)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)

        print("test scores")
        print(
            f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}"
        )
def apply_balanced_RF_classifier(X_train, y_train, model_path):
    '''
    Train a 50-tree balanced random forest and persist it.

    Args:
        X_train     dataframe with all the features to be used for training
        y_train     series containing labels for each row of X_train
        model_path  path handed to pickle_models together with the model

    Output:
        trained balanced random forest model
    '''
    forest_params = dict(n_estimators=50, random_state=0, n_jobs=-1)
    trained_model = BalancedRandomForestClassifier(**forest_params)

    # Fit on the training data, then store the fitted model.
    trained_model.fit(X_train, y_train)
    pickle_models(trained_model, model_path)

    return trained_model
def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                       params, clf_type, question):
    """Fit the selected ensemble and collect its scores.

    Parameters
    ----------
    train_x, train_y, test_x, test_y
        Training and test data.
    estimator : str
        One of ``'BalancedRandomForestClassifier'``,
        ``'BalancedBaggingClassifier'`` or ``'EasyEnsembleClassifier'``.
    params : dict
        Hyper-parameters; the keys read depend on ``estimator``.
    clf_type, question
        Passed through to ``self.calc_cross_val_scores``; ``question`` is
        also stored in the result.

    Returns
    -------
    tuple
        ``(cross_val_scores, estimator_scores)``, where ``estimator_scores``
        is a dict of test-set metrics.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError`` at fit time).
    """
    if estimator == 'BalancedRandomForestClassifier':
        clf = BalancedRandomForestClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'BalancedBaggingClassifier':
        clf = BalancedBaggingClassifier(
            n_estimators=params['n_estimators'],
            bootstrap=params['bootstrap'],
            max_samples=params['max_samples'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'EasyEnsembleClassifier':
        clf = EasyEnsembleClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    else:
        raise ValueError(f"Unsupported estimator: {estimator!r}")

    clf.fit(train_x, train_y)
    cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                  clf_type, question)

    predicted_labels = clf.predict(test_x)
    tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
    specificity = round((tn / (tn + fp)) * 100, 2)

    # Probability of the second class column, used for ROC AUC.
    predicted_prob = clf.predict_proba(test_x)
    predicted_prob_true = [p[1] for p in predicted_prob]

    # Most scores are percentages; F1 and ROC AUC stay on the 0..1 scale,
    # as before.
    estimator_scores = {
        'Question': question,
        'Accuracy': round(accuracy_score(test_y, predicted_labels) * 100, 2),
        'Balanced Accuracy': round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2),
        'Precision': round(precision_score(test_y, predicted_labels) * 100, 2),
        'Recall': round(recall_score(test_y, predicted_labels) * 100, 2),
        'Specificity': specificity,
        'F1': round(f1_score(test_y, predicted_labels), 2),
        'ROC AUC': round(roc_auc_score(test_y, predicted_prob_true), 2),
    }

    return cross_val_scores, estimator_scores
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Check that samplers_, estimators_ and pipelines_ describe the same trees."""
    X, y = imbalanced_dataset
    n_estimators = 10
    forest = BalancedRandomForestClassifier(n_estimators=n_estimators,
                                            random_state=0)
    forest.fit(X, y)

    for idx in range(n_estimators):
        # The stand-alone sampler and the pipeline's sampler must resample
        # the data identically.
        X_res, y_res = forest.samplers_[idx].fit_resample(X, y)
        pipeline_sampler = forest.pipelines_[idx].named_steps[
            'randomundersampler']
        X_res_2, y_res_2 = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        # Refitting the tree on the resampled data must match refitting the
        # whole pipeline on the raw data, for labels and for probabilities.
        for method in ('predict', 'predict_proba'):
            tree = forest.estimators_[idx].fit(X_res, y_res)
            from_tree = getattr(tree, method)(X)
            from_pipeline = getattr(forest.pipelines_[idx].fit(X, y), method)(X)
            assert_array_equal(from_tree, from_pipeline)
def test_balanced_random_forest_oob(imbalanced_dataset):
    """The OOB score is close to the held-out score; too few trees warn."""
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    # Train on the first half of the data, score on the second half.
    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    # Both context managers must be entered. ``a and b`` evaluates to ``b``
    # only, so the former ``pytest.warns(...) and np.errstate(...)`` never
    # activated the warning check.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    """Warm start rejects fewer trees and warns when no tree is added."""
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*imbalanced_dataset)

    # ``match`` checks the exception text. The former ``message=`` keyword
    # never did that (it was only a custom failure message) and is no longer
    # accepted by current pytest.
    with pytest.raises(ValueError, match="must be larger or equal to"):
        brf.set_params(warm_start=True, n_estimators=2)
        brf.fit(*imbalanced_dataset)

    brf.set_params(n_estimators=10)
    brf.fit(*imbalanced_dataset)

    # Fitting again without increasing n_estimators must warn.
    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        brf.fit(*imbalanced_dataset)
print(
    f'Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_bbc):.2f}'
    f' - Geometric mean {geometric_mean_score(y_test, y_pred_bbc):.2f}'
)
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(
    cm_balanced_bagging,
    classes=np.unique(satimage.target),
    ax=ax[1],
    title='Balanced bagging',
)

###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method and it usually outperforms
# bagging. Here, we use a vanilla random forest and its balanced counterpart,
# in which each bootstrap sample is balanced.

rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0,
                                     n_jobs=-1)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)

# Similarly to the previous experiment, the balanced classifier outperforms
# the classifier which learns from imbalanced bootstrap samples. In addition,
# random forest outperforms the bagging classifier.

print('Random Forest classifier performance:')
print(
    f'Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_rf):.2f}'
    f' - Geometric mean {geometric_mean_score(y_test, y_pred_rf):.2f}'
)
def test_balanced_random_forest_error(imbalanced_dataset, forest_params,
                                      err_msg):
    """Invalid forest parameters must raise a ValueError matching err_msg."""
    brf = BalancedRandomForestClassifier(**forest_params)
    # ``match`` checks the exception text against ``err_msg`` (as a regular
    # expression). The former ``message=`` keyword never did that and is no
    # longer accepted by current pytest.
    with pytest.raises(ValueError, match=err_msg):
        brf.fit(*imbalanced_dataset)
def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    """Smoke test: fitting accepts per-sample weights."""
    X, y = imbalanced_dataset

    # One random weight per sample.
    rng = np.random.RandomState(42)
    weights = rng.rand(y.shape[0])

    forest = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    forest.fit(X, y, weights)