def test_little_tree_with_small_max_samples():
    """Check that capping ``max_samples`` produces a smaller tree.

    Two single-tree balanced forests are fitted on the same random binary
    problem; the only difference between them is ``max_samples``.  The
    forest limited to two samples must end up with fewer tree nodes.
    """
    rng = np.random.RandomState(1)

    features = rng.randn(10000, 2)
    labels = rng.randn(10000) > 0

    # Both forests share the same RandomState instance, exactly one tree each.
    shared_params = {"n_estimators": 1, "random_state": rng}

    # Baseline: no restriction on the number of samples drawn.
    unrestricted = BalancedRandomForestClassifier(max_samples=None, **shared_params)
    # Constrained: only two samples may be drawn.
    restricted = BalancedRandomForestClassifier(max_samples=2, **shared_params)

    unrestricted.fit(features, labels)
    restricted.fit(features, labels)

    unrestricted_tree = unrestricted.estimators_[0].tree_
    restricted_tree = restricted.estimators_[0].tree_

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert unrestricted_tree.node_count > restricted_tree.node_count, msg
def do_CV_Voting(LS, cv=10):
    """Cross-validate a soft-voting ensemble and report a hold-out confusion matrix.

    Fingerprints are computed from ``LS["SMILES"]``, duplicated fingerprint
    rows are dropped, and the remaining data is split 75/25.  A
    ``VotingClassifier`` made of four balanced sub-models is cross-validated
    on the training part; the fold estimator with the best ROC AUC is then
    used to predict the hold-out part.

    Args:
        LS: Table with a ``"SMILES"`` column and an ``"ACTIVE"`` label column.
        cv: Cross-validation setting forwarded to ``cross_validate``.

    Returns:
        None.  Mean CV scores and the confusion matrix are printed.
    """
    nBits = 1250
    with measure_time("Creating fingerprint"):
        X_LS = create_fingerprints(LS["SMILES"].values, nBits=nBits)

    # Drop duplicated fingerprints.
    data = pd.DataFrame(X_LS).drop_duplicates()
    X_LS = data.values

    # ``data`` carries a fresh 0..n-1 index, so its surviving index values are
    # row *positions* in LS.  Select labels by position (``iloc``) so the
    # alignment is also right when LS has a non-default index.
    # assumes create_fingerprints returns one row per SMILES, in order
    y_LS = LS["ACTIVE"].iloc[data.index].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)

    pipeline_1 = make_pipeline(
        ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
        BalancedRandomForestClassifier(n_estimators=600, random_state=18,
                                       n_jobs=-1))
    pipeline_2 = make_pipeline(
        ADASYN(random_state=64, n_jobs=-1),
        BalancedRandomForestClassifier(n_estimators=600, random_state=24,
                                       n_jobs=-1))
    BRF = BalancedRandomForestClassifier(n_estimators=100, random_state=18,
                                         n_jobs=-1)
    BGC = make_pipeline(
        ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1),
        BalancedBaggingClassifier(
            estimator=DecisionTreeClassifier(max_features="log2"),
            n_estimators=50))

    # The first pipeline is weighted three times as much as the others.
    votingModel = VotingClassifier(
        estimators=[('pip1', pipeline_1), ('pip2', pipeline_2),
                    ('BRF', BRF), ('BGC', BGC)],
        voting='soft', weights=[3, 1, 1, 1], n_jobs=-1)

    scores = cross_validate(votingModel, X_train, y_train, cv=cv,
                            scoring=('roc_auc', 'average_precision'),
                            return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())

    # Evaluate the fold estimator that achieved the highest ROC AUC.
    model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the out-of-bag score and the warning for too few estimators.

    First, a large forest is fitted on a stratified training split and its
    OOB score must be within 0.1 of the accuracy on the held-out split.
    Second, a single-tree forest with ``oob_score=True`` must emit a
    ``UserWarning`` while fitting.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, stratify=y
    )
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(
        oob_score=True, random_state=0, n_estimators=1, bootstrap=True
    )
    # Both context managers must be entered.  ``with a and b:`` evaluates the
    # boolean expression first and enters only one of the two objects.
    with pytest.warns(UserWarning), np.errstate(divide="ignore", invalid="ignore"):
        est.fit(X, y)
def get_classifier(n_subj, random_state, n_jobs_rf=1, multiclass=False):
    """Build an unfitted balanced random forest with a fixed per-class sample size.

    Args:
        n_subj: Number of subjects; used to derive the per-class subsample size.
        random_state: Seed / random state forwarded to the forest.
        n_jobs_rf: Number of parallel jobs used by the forest.
        multiclass: If True configure four classes (0-3), otherwise two (0-1).

    Returns:
        A ``BalancedRandomForestClassifier`` with 1000 trees that samples
        ``subsample_size`` subjects per class without bootstrap or replacement.
    """
    if multiclass:
        # multiplication with 0.9 required to make the subject number agree
        # with training set AND because one of the classes has only very few
        # subject such that we can't reasonably sample more than 100 subjects
        n_classes = 4
        subsample_size = round(n_subj * 0.9 * 0.5 / 4)
    else:
        n_classes = 2
        subsample_size = round(n_subj * 0.632 / 2)

    # Every class label gets the same number of sampled subjects.
    per_class_counts = {label: subsample_size for label in range(n_classes)}

    return BalancedRandomForestClassifier(
        n_estimators=1000,
        class_weight='balanced',
        oob_score=False,
        sampling_strategy=per_class_counts,
        n_jobs=n_jobs_rf,
        random_state=random_state,
        bootstrap=False,
        replacement=False,
    )
def make_model(self, config=None):
    """Create a fresh, unfitted estimator and store it in ``self.model``.

    The estimator is selected from ``self.class_`` ('RF', 'lin' or 'svm'),
    ``self.type_`` ('reg' or 'cls') and ``self.balanced`` ('balanced' or
    None).  When no branch matches, ``self.model`` is left untouched.

    :param config: model parameters.  When given, it is stored in
        ``self.config`` and passed as the only keyword arguments to the
        estimator; otherwise built-in defaults are used (``self.seed`` for
        the random forests, an RBF kernel for the SVM).
    :return: self.model
    :raises AssertionError: if ``self.class_ == 'svm'`` and ``self.type_`` is
        not 'cls'.
    """
    if config is not None:
        self.config = config
    print('Creating fresh model...')

    def build(estimator_cls, **defaults):
        # An explicit config fully replaces the defaults.
        if config is not None:
            return estimator_cls(**config)
        return estimator_cls(**defaults)

    is_balanced = self.balanced == 'balanced'
    # Only 'balanced' and None are recognised; anything else builds nothing.
    balance_known = is_balanced or self.balanced is None

    if self.class_ == 'RF':
        if self.type_ == 'reg' and balance_known:
            if is_balanced:
                print('WARNING: balanced regressor not applicable')
            self.model = build(RandomForestRegressor, random_state=self.seed)
        elif self.type_ == 'cls' and balance_known:
            forest_cls = (BalancedRandomForestClassifier
                          if is_balanced else RandomForestClassifier)
            self.model = build(forest_cls, random_state=self.seed)
    elif self.class_ == 'lin':
        if self.type_ == 'reg' and balance_known:
            if is_balanced:
                print('WARNING: balanced regressor not applicable')
            self.model = build(LinearRegression)
        elif self.type_ == 'cls' and balance_known:
            self.model = build(LogisticRegression)
            # 'balanced' or None, applied after construction so that it
            # overrides any class_weight coming from config.
            self.model.class_weight = self.balanced
    elif self.class_ == 'svm':
        # Pass the text as the assertion message; wrapping it in print()
        # would make the AssertionError carry None instead.
        assert self.type_ == 'cls', (
            'If using SVM, make sure you have a classification problem. i.e. set type_="cls"'
        )
        self.model = build(SVC, kernel='rbf')

    print('Created: ', self.model)
    return self.model
def test_balanced_random_forest_pruning(imbalanced_dataset):
    """Check that cost-complexity pruning reduces the size of the first tree."""

    def first_tree_node_count(**forest_params):
        # Fit a forest on the fixture and report the node count of tree 0.
        forest = BalancedRandomForestClassifier(**forest_params)
        forest.fit(*imbalanced_dataset)
        return forest.estimators_[0].tree_.node_count

    n_nodes_no_pruning = first_tree_node_count()
    n_nodes_pruning = first_tree_node_count(ccp_alpha=0.015)

    assert n_nodes_no_pruning > n_nodes_pruning
def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
    """Fit a balanced random forest on the ``has_claim`` target.

    The frame is split through ``cls.get_X_Y_split``; only the training part
    of that split is used, the test part is discarded.

    Args:
        preprocessed_df: Preprocessed data passed on to ``get_X_Y_split``.

    Returns:
        The fitted ``BalancedRandomForestClassifier``.
    """
    X_train, _X_test, Y_train, _Y_test = cls.get_X_Y_split(
        preprocessed_df, "has_claim"
    )
    classifier = BalancedRandomForestClassifier()
    classifier.fit(X_train, Y_train)
    return classifier
def random_forest(df, drop, target, show, model_name):
    """Train a balanced random forest and return its balanced accuracy in percent.

    Args:
        df: Table holding both feature columns and the outcome column.
        drop: Column names to exclude from the features.
        target: Name of the outcome column.
        show: If truthy, print the feature importances (highest first).
        model_name: Label printed in the feature-importance header.

    Returns:
        Balanced accuracy on the test split, multiplied by 100.
    """
    # split the table into features and outcomes; the outcome column is never
    # used as a feature, even when the caller forgot to list it in ``drop``
    x_cols = [col for col in df.columns if col not in drop and col != target]
    X = df[x_cols]
    y = df[target]

    # split features and outcomes into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(X_train, y_train)
    y_predictions = brf.predict(X_test)

    # (importance, column) pairs sorted ascending, then reversed.
    feature_importance = sorted(
        zip(brf.feature_importances_, X.columns.tolist()))[::-1]

    # Calculating the accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_predictions)

    # Displaying results
    if show:
        print(f"Feature Importance: {model_name}")
        for pair in feature_importance:
            print(pair)
        print("\n")

    return acc_score * 100
def main():
    """Run both 10-fold random-forest experiments and write their results.

    After splitting the challenges and loading the X / y shards 1..162, two
    variants are trained per fold: a ``BalancedRandomForestClassifier``
    (results under ``brf``) and a ``RandomForestClassifier`` fitted on data
    resampled by ``RandomUnderSampler`` (results under ``rus``).  For each
    fold the predicted probabilities and feature importances are saved as
    JSON below ``RESULT_PATH``.
    """

    def read_all(key):
        # Concatenate every shard of ``key`` and index it by (l0, l1).
        shards = [pd.read_json(XY_PATH[key].format(i), orient='records')
                  for i in range(1, 163)]
        return pd.concat(shards).set_index(['l0', 'l1'])

    def save_outputs(model, X_test, y_test, subdir, fold):
        # Probabilities keep the test index; importances are a plain series.
        proba = pd.DataFrame(model.predict_proba(X_test.to_numpy()),
                             index=y_test.index)
        proba.reset_index().to_json(
            os.path.join(RESULT_PATH, subdir, f'y_prob_{fold}.json'),
            orient='records')
        pd.Series(model.feature_importances_).to_json(
            os.path.join(RESULT_PATH, subdir,
                         f'feature_importance_{fold}.json'))

    print('Spliting challenges')
    split_challenges()

    print('Reading X...')
    X = read_all('X')
    print('Reading y...')
    y = read_all('y')

    print('\nTraining Inner sampler RFC')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)
        balanced_rfc = BalancedRandomForestClassifier(n_estimators=100,
                                                      random_state=0)
        balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel())
        save_outputs(balanced_rfc, X_test, y_test, 'brf', fold)

    print('\nTraining RandomUnderSampler')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)
        rfc = RandomForestClassifier(n_estimators=100, random_state=0)
        rus = RandomUnderSampler(random_state=0)
        X_resample, y_resample = rus.fit_resample(
            X_train.to_numpy(), y_train.to_numpy().ravel())
        rfc.fit(X_resample, y_resample)
        save_outputs(rfc, X_test, y_test, 'rus', fold)
def get_balanced_models():
    """Return ``(name, estimator)`` pairs of classifiers for imbalanced data.

    Reads the module-level ``X`` (for its row count) and ``weights``.
    MultinomialNB and GaussianProcessClassifier are intentionally not part
    of the list.

    Returns:
        List of ``(name, unfitted estimator)`` tuples.
    """
    models = [
        # LR
        ('LR_Bal', LogisticRegression(solver='lbfgs',
                                      class_weight='balanced')),
        # LDA
        ('LDA', LinearDiscriminantAnalysis()),
        # KNN
        ('KNN', KNeighborsClassifier()),
        # NB
        ('NB', GaussianNB()),
    ]

    # NOTE(review): read as "only the two SVMs are guarded by the size
    # check" - confirm against the original indentation.
    if X.shape[0] < 100000:
        # SVM Balanced
        models.append(('SVM_Bal', SVC(gamma='scale',
                                      class_weight='balanced')))
        # SVM Weight
        models.append(('SVM_W', SVC(gamma='scale', class_weight=weights)))

    models += [
        # Balanced RF
        ('Bal_RF', BalancedRandomForestClassifier(n_estimators=1000)),
        # RF
        ('RF_Bal', RandomForestClassifier(n_estimators=1000,
                                          class_weight='balanced')),
        # DT
        ('DT_Bal', DecisionTreeClassifier(class_weight='balanced')),
        # Bag
        ('BAG', BaggingClassifier(n_estimators=1000)),
        # XGB
        ('XGB_W', XGBClassifier(scale_pos_weight=weights)),
    ]
    return models
def __init__(self,
             iterations=1,
             transform_first=False,
             untrained_model=BalancedRandomForestClassifier(random_state=42,n_jobs=40),
             max_train_test_samples=100,
             mode_interaction_extract='knee',
             include_self_interactions=False,
             penalty=3,
             pelt_model='l2',
             no_changepoint_strategy='median'):
    """Build a pipeline alternating interaction extraction and SAFE transforms.

    For each iteration ``i`` two steps are added: ``interaction{i}`` (an
    ``InteractionTransformer``) followed by ``transformer{i}`` (a
    ``SafeTransformer``).  Every step receives its own deep copy of
    ``untrained_model``.  ``transform_first`` is accepted but not used here.

    Reference:
    https://github.com/ModelOriented/SAFE/blob/master/SafeTransformer/SafeTransformer.py
    """
    steps = []
    for iteration in range(iterations):
        interaction_step = InteractionTransformer(
            copy.deepcopy(untrained_model),
            max_train_test_samples,
            mode_interaction_extract,
            include_self_interactions)
        steps.append([f'interaction{iteration}', interaction_step])

        safe_step = SafeTransformer(
            penalty=penalty,
            model=copy.deepcopy(untrained_model),
            pelt_model=pelt_model,
            no_changepoint_strategy=no_changepoint_strategy)
        steps.append([f'transformer{iteration}', safe_step])

    self.pipeline = Pipeline(steps)
def fourth_test(X_train, y_train, X_test, y_test):
    """Compare a balanced random forest with a balanced bagging classifier.

    Each model is cross-validated (10 folds) on the training data with ROC
    AUC and average precision; the mean scores are printed.  The fold
    estimator with the highest ROC AUC is then used to predict the test set
    and its confusion matrix is printed.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
    """
    scoring = ('roc_auc', 'average_precision')

    print("Test with BalancedRandomForestClassifier or BalancedBaggingClassifier\n")
    print("BalancedRandomForestClassifier")
    scores = cross_validate(
        BalancedRandomForestClassifier(max_depth=None, n_estimators=500,
                                       random_state=0, n_jobs=2,
                                       max_features='log2', oob_score=False),
        X_train, y_train, cv=10, scoring=scoring, return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())

    # Fold estimator with the best ROC AUC.
    brf_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_brf_pred = brf_model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_brf_pred)
    print("confusion_matrix:\n", conf_mat)
    print()

    print("BalancedBaggingClassifier")
    # 'sqrt' is what the former 'auto' setting meant for classifiers.
    tree = DecisionTreeClassifier(max_features='sqrt')
    # ``estimator`` is the current name of the former ``base_estimator``.
    resample_bagging = BalancedBaggingClassifier(
        estimator=tree, n_estimators=100, random_state=0, n_jobs=2,
        oob_score=True)
    scores = cross_validate(resample_bagging, X_train, y_train, cv=10,
                            scoring=scoring, return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())

    bagging_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_bagging_pred = bagging_model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_bagging_pred)
    print("confusion_matrix:\n", conf_mat)
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Check that ``samplers_``, ``estimators_`` and ``pipelines_`` agree.

    For every fitted member, the stand-alone sampler and the sampler stored
    inside the pipeline must resample identically, and refitting the tree on
    the resampled data must predict the same labels and probabilities as
    refitting the whole pipeline on the original data.
    """
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0
    )
    brf.fit(X, y)

    for idx in range(n_estimators):
        sampler = brf.samplers_[idx]
        tree = brf.estimators_[idx]
        pipeline = brf.pipelines_[idx]

        # Resampling: stand-alone sampler vs. the pipeline's sampler step.
        X_res, y_res = sampler.fit_resample(X, y)
        pipeline_sampler = pipeline.named_steps["randomundersampler"]
        X_res_2, y_res_2 = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        # Hard predictions: tree on resampled data vs. full pipeline.
        tree_labels = tree.fit(X_res, y_res).predict(X)
        pipeline_labels = pipeline.fit(X, y).predict(X)
        assert_array_equal(tree_labels, pipeline_labels)

        # Probabilities: same comparison after refitting both again.
        tree_proba = tree.fit(X_res, y_res).predict_proba(X)
        pipeline_proba = pipeline.fit(X, y).predict_proba(X)
        assert_array_equal(tree_proba, pipeline_proba)
def apply_ml_model(X_train_input, y_train_input, X_test_input, y_test_input):
    """Fit four classifiers and print an evaluation report for each.

    The models (logistic regression, random forest, decision tree and
    balanced random forest) are trained on the training inputs and evaluated
    on the test inputs.  For each model the confusion matrix, the balanced
    accuracy and the imbalanced classification report are printed.

    Args:
        X_train_input, y_train_input: Training features and labels.
        X_test_input, y_test_input: Test features and labels.

    Returns:
        List with the balanced accuracy of each model, in the order
        LREG, RFC, Tree, Balanced RFC.
    """
    candidates = [
        ('LREG', LogisticRegression(solver='lbfgs', max_iter=500,
                                    random_state=78)),
        ('RFC', RandomForestClassifier(n_estimators=128, random_state=78)),
        ('Tree', tree.DecisionTreeClassifier(random_state=78)),
        ('Balanced RFC', BalancedRandomForestClassifier(n_estimators=128,
                                                        random_state=78)),
    ]
    scores = []

    for model, model_select in candidates:
        model_select.fit(X_train_input, y_train_input)
        y_pred = model_select.predict(X_test_input)

        cm = confusion_matrix(y_test_input, y_pred)

        # Score against the labels passed in, not a module-level ``y_test``.
        acc_score = balanced_accuracy_score(y_test_input, y_pred)
        scores.append(acc_score)

        print(f"Model: {model}")

        # Displaying results
        print("Confusion Matrix")
        # The 2x2 labelling assumes a binary problem.
        cm_df = pd.DataFrame(
            cm,
            index=["Actual 0", "Actual 1"],
            columns=["Predicted 0", "Predicted 1"])
        print(cm_df)
        print(f"Accuracy Score : {acc_score}\n")
        print("Classification Report")
        print(classification_report_imbalanced(y_test_input, y_pred))

    return scores
def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the out-of-bag score and the warning for too few estimators.

    The forest is fitted on the first half of the data; its OOB score must be
    within 0.1 of the accuracy on the second half.  A single-tree forest with
    ``oob_score=True`` must then emit a ``UserWarning`` while fitting.
    """
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    half = X.shape[0] // 2
    est.fit(X[:half, :], y[:half])
    test_score = est.score(X[half:, :], y[half:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(
        oob_score=True, random_state=0, n_estimators=1, bootstrap=True
    )
    # Both context managers must be entered.  ``with a and b:`` evaluates the
    # boolean expression first and enters only one of the two objects.
    with pytest.warns(UserWarning), np.errstate(divide="ignore", invalid="ignore"):
        est.fit(X, y)
def do_CV_grid(LS, cv=10):
    """Grid-search an ADASYN + balanced random forest pipeline.

    Fingerprints are computed from ``LS["SMILES"]``, duplicated fingerprint
    rows are dropped, and the data is split 75/25.  The grid search (scored
    by ROC AUC) runs on the training part; the refitted best pipeline is
    evaluated on the hold-out part.

    Args:
        LS: Table with a ``"SMILES"`` column and an ``"ACTIVE"`` label column.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        None.  Search results, the confusion matrix and the classification
        report are printed.
    """
    nBits = 1250
    with measure_time("Creating fingerprint"):
        X_LS = create_fingerprints(LS["SMILES"].values, nBits=nBits)

    # Drop duplicated fingerprints.
    data = pd.DataFrame(X_LS).drop_duplicates()
    X_LS = data.values

    # ``data`` carries a fresh 0..n-1 index, so its surviving index values are
    # row *positions* in LS.  Select labels by position (``iloc``) so the
    # alignment is also right when LS has a non-default index.
    # assumes create_fingerprints returns one row per SMILES, in order
    y_LS = LS["ACTIVE"].iloc[data.index].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_LS, y_LS, test_size=0.25, train_size=0.75, random_state=1)

    pipeline = Pipeline([
        ('ada', ADASYN(sampling_strategy=0.25, random_state=64, n_jobs=-1)),
        ('BRF', BalancedRandomForestClassifier(n_estimators=500,
                                               random_state=18, n_jobs=-1,
                                               bootstrap=False)),
    ])

    # The split criterion ('gini' / 'entropy') is deliberately not searched.
    param = {
        'BRF__n_estimators': [500],
        'BRF__max_features': [None, 'log2'],
    }

    # Honour the caller's ``cv`` instead of a hard-coded fold count.
    clf = GridSearchCV(pipeline, param, scoring='roc_auc', n_jobs=2, cv=cv)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    print()

    y_pred = clf.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print("confusion_matrix:\n", conf_mat)
    print("Classification report")
    print(classification_report(y_true=y_test, y_pred=y_pred))
def objective(trial):
    """Hyper-parameter search objective for a balanced random forest.

    ``self.X`` / ``self.y`` are split 80/20 into train and validation parts,
    missing values are median-imputed, and a forest configured from the
    trial's suggestions is fitted.  The AUC and accuracy on
    ``self.X_validation`` / ``self.y_validation`` are printed and a confusion
    matrix is plotted as diagnostics.

    Args:
        trial: Trial object providing ``suggest_categorical``,
            ``suggest_uniform`` and ``suggest_int``.

    Returns:
        ROC AUC of the fitted forest on the 20 % validation split.
    """
    train_X, val_X, train_y, val_y = train_test_split(self.X, self.y,
                                                      test_size=0.2)

    # Medians are learned from the training split only and then applied to
    # the other sets; re-fitting on them would leak their statistics.
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    v_train_X = median_imputer.fit_transform(train_X)
    v_val_X = median_imputer.transform(val_X)
    v_test_X = median_imputer.transform(self.X_validation)

    train_X = pd.DataFrame(v_train_X, columns=train_X.columns,
                           index=train_X.index)
    val_X = pd.DataFrame(v_val_X, columns=val_X.columns, index=val_X.index)
    test_X = pd.DataFrame(v_test_X, columns=self.X_validation.columns,
                          index=self.X_validation.index)

    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    brf_n_estimators = trial.suggest_categorical('n_estimators', list_trees)
    # NOTE(review): suggest_uniform may be deprecated in recent Optuna
    # releases - confirm against the installed version.
    brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
    brf_min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
    brf_min_weight_fraction_leaf = trial.suggest_uniform(
        'min_weight_fraction_leaf', 0, 0.5)
    brf_max_depth = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(
        n_estimators=brf_n_estimators,
        max_features=brf_max_features,
        min_samples_split=brf_min_samples_split,
        min_samples_leaf=brf_min_samples_leaf,
        max_depth=brf_max_depth,
        min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
        bootstrap=True)
    brfmodel.fit(train_X, train_y)

    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    aucbrf_test = roc_auc_score(self.y_validation,
                                brfmodel.predict_proba(test_X)[:, 1])
    print('Accuracy test ' + str(
        accuracy_score(self.y_validation, brfmodel.predict(test_X))))

    plt.figure()
    # NOTE(review): plot_confusion_matrix may be unavailable in recent
    # scikit-learn releases - confirm against the installed version.
    plot_confusion_matrix(brfmodel, test_X, self.y_validation,
                          cmap=plt.cm.Blues, normalize=None)
    plt.show()
    print(aucbrf_test)

    return aucbrf
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    """Smoke test: the balanced forest can be tuned with ``GridSearchCV``."""
    param_grid = {"n_estimators": (1, 2), "max_depth": (1, 2)}
    search = GridSearchCV(BalancedRandomForestClassifier(), param_grid, cv=3)
    search.fit(*imbalanced_dataset)
def test_balanced_random_forest(imbalanced_dataset):
    """Check the sizes of the fitted attributes of a balanced forest.

    ``samplers_``, ``estimators_`` and ``pipelines_`` must each hold one
    entry per estimator, and ``feature_importances_`` one value per feature.
    """
    n_estimators = 10
    brf = BalancedRandomForestClassifier(n_estimators=n_estimators,
                                         random_state=0)
    brf.fit(*imbalanced_dataset)

    for attribute in ("samplers_", "estimators_", "pipelines_"):
        assert len(getattr(brf, attribute)) == n_estimators

    n_features = imbalanced_dataset[0].shape[1]
    assert len(brf.feature_importances_) == n_features
def test_balanced_random_forest_grid_search(imbalanced_dataset):
    """Smoke test: the balanced forest can be tuned with ``GridSearchCV``."""
    brf = BalancedRandomForestClassifier()
    # ``iid`` is intentionally not passed: the argument no longer exists in
    # current scikit-learn releases.
    grid = GridSearchCV(brf, {
        'n_estimators': (1, 2),
        'max_depth': (1, 2)
    }, cv=3)
    grid.fit(*imbalanced_dataset)
def evaluate(X_train, y_train, X_test, y_test):
    """Return confusion matrices for the top-1 and top-2 predicted classes.

    A 500-tree balanced random forest (seeded with the module-level
    ``seed``) is fitted on the training data.  For every test sample the
    classes are ranked by predicted probability.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        Tuple ``(cm_first, cm_second)``: confusion matrices of ``y_test``
        against the most probable and the second most probable class.
    """
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf.fit(X_train, y_train)

    # Column indices of predict_proba sorted by ascending probability.
    ranked = clf.predict_proba(X_test).argsort(axis=1)

    # Columns follow ``clf.classes_``; map indices back to real labels so
    # the result is right even when labels are not exactly 0..k-1.
    y_pred1 = clf.classes_[ranked[:, -1]]
    y_pred2 = clf.classes_[ranked[:, -2]]

    return (metrics.confusion_matrix(y_test, y_pred1),
            metrics.confusion_matrix(y_test, y_pred2))
def model_checking(self):
    """Compare several imbalance-aware pipelines on ``self.df``.

    Features are ``self.df[self.features]`` and the target is
    ``self.df[self.target]``.  After a stratified 75/25 split, each pipeline
    is cross-validated on the training part (precision, 5 stratified folds),
    refitted on the whole training part and scored on the test part.  All
    results are printed.
    """
    X = self.df[self.features]
    Y = self.df[self.target]

    def xgb(**extra):
        # Every XGBoost variant shares these two settings.
        return XGBClassifier(n_estimators=1000, reg_alpha=1, **extra)

    pipelines = [
        Pipeline(steps=[
            ('classifier', BalancedRandomForestClassifier(n_estimators=200)),
        ]),
        Pipeline(steps=[
            ('classifier', BalancedBaggingClassifier(n_estimators=200)),
        ]),
        # The resampling steps below are registered under the name 'rfe'.
        Pipeline(steps=[('rfe', SMOTE()), ('classifier', xgb())]),
        Pipeline(steps=[('rfe', BorderlineSMOTE()), ('classifier', xgb())]),
        Pipeline(steps=[('classifier', xgb(scale_pos_weight=3))]),
        Pipeline(steps=[
            ('rfe', RFE(XGBClassifier())),
            ('classifier', xgb(scale_pos_weight=3)),
        ]),
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.25, stratify=Y)
    train_values = X_train.values
    test_values = X_test.values

    for pipe in pipelines:
        scores = cross_val_score(pipe, train_values, y_train,
                                 scoring='precision',
                                 cv=StratifiedKFold(5))
        print("cross val scores")
        # Mean over the five folds.
        print(sum(scores) / 5)

        pipe.fit(train_values, y_train.values)
        y_pred = pipe.predict(test_values)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        print("test scores")
        print(
            f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}"
        )
def evaluate_model(self):
    """Estimate the ROC AUC of the stored forest configuration.

    Parameters are read from ``param_RF_<epoch>.json`` in
    ``self.result_folder`` (the ``'value'`` entry is removed first).  The
    model is evaluated with 20 repetitions of shuffled, stratified 5-fold
    cross-validation on ``self.X`` / ``self.y``; missing values are
    median-imputed inside each fold.

    Returns:
        List with one ROC AUC per fold (100 values in total).  Each fold
        score and the overall mean are also printed.
    """
    with open(self.result_folder +
              '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)

    # NOTE(review): read as "the last entry of the file defines the model" -
    # confirm against the original indentation.
    for data in dati:
        del data['value']
        rf_model = BalancedRandomForestClassifier(**data)

    rf_auc = []
    for i in tqdm(range(20)):
        cv = StratifiedKFold(n_splits=5, shuffle=True,
                             random_state=i + 187462)
        for train_index, test_index in cv.split(self.X, self.y):
            trainX = self.X.iloc[train_index]
            testX = self.X.iloc[test_index]
            trainy = np.take(self.y, train_index)
            testy = np.take(self.y, test_index)

            # Medians come from the training fold only; re-fitting on the
            # test fold would leak its statistics into the evaluation.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            vtrainX = median_imputer.fit_transform(trainX)
            vtestX = median_imputer.transform(testX)
            trainX = pd.DataFrame(vtrainX, columns=trainX.columns,
                                  index=trainX.index)
            testX = pd.DataFrame(vtestX, columns=testX.columns,
                                 index=testX.index)

            # Compute the AUC for this fold.
            rf_model.fit(trainX, trainy)
            roc_rf = roc_auc_score(testy,
                                   rf_model.predict_proba(testX)[:, 1])
            rf_auc.append(roc_rf)
            print(roc_rf)

    print(statistics.mean(rf_auc))
    return rf_auc
def find_clf_parameters(self, train_x, train_y, clf_type):
    """Grid-search two balanced ensembles and return the best one.

    Args:
        train_x: Training features.
        train_y: Training labels.
        clf_type: ``'binary'`` or ``'multi-class'``; the search is only run
            for these two values.

    Returns:
        Tuple ``(best_estimator, best_estimator_params)`` taken from the
        first row of the helper's score summary.
    """
    n_estimators = [100, 300]
    sampling_strategy = ['auto', 'majority', 'not majority']

    forest_grid = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': n_estimators,
        'max_depth': [2, 4, 6],
        'min_samples_leaf': np.arange(1, 4),
        'min_samples_split': np.arange(2, 5),
        'sampling_strategy': sampling_strategy,
    }
    easy_ensemble_grid = {
        'n_estimators': n_estimators,
        'sampling_strategy': sampling_strategy,
    }

    models1 = {
        'BalancedRandomForestClassifier':
            BalancedRandomForestClassifier(random_state=42),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(random_state=42),
    }
    params1 = {
        'BalancedRandomForestClassifier': [forest_grid],
        'EasyEnsembleClassifier': [easy_ensemble_grid],
    }
    helper1 = EstimatorSelectionHelper(models1, params1)

    # Both supported problem types use the same search settings.
    if clf_type in ('binary', 'multi-class'):
        helper1.fit(train_x, train_y, cv=5, scoring='balanced_accuracy',
                    n_jobs=-1)

    summary = helper1.score_summary()
    top_row = summary.iloc[0]
    return top_row['estimator'], top_row['params']
def _plot_championship_importance(all_res, save_directory, top = 6):
    """Plot box plots of feature importances for predicting champions.

    Each team-season becomes one sample built from the ``off_norm`` /
    ``def_norm`` values of its ``top`` rows with the largest ``TIME``; the
    label is 1 when the team is in that season's champion entry.  For
    several forest sizes and for every champion sample left out in turn, a
    balanced random forest is fitted and its feature importances collected.
    Nothing is done when the output file already exists.

    Args:
        all_res: Mapping season -> sequence whose items 0 and 2 are the team
            table and the champion(s).
        save_directory: Prefix to which 'championship_importance.png' is
            appended.
        top: Number of rows kept per team.
    """
    save_file = save_directory + 'championship_importance.png'
    if os.path.exists(save_file):
        return

    xs, ys, teams = [], [], []
    for season in all_res:
        season_res = all_res[season]
        team_df = season_res[0]
        champion = season_res[2]
        for team, group in team_df.groupby('TEAM'):
            top_rows = group.nlargest(top, 'TIME')[['off_norm', 'def_norm']]
            xs.append(top_rows.unstack().values)
            ys.append(1 if team in champion else 0)
            teams.append(team + '_' + season)
    xs = np.vstack(xs)
    ys = np.array(ys)

    champion_rows = np.where(ys == 1)[0]
    fts = []
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):
        for left_out in champion_rows:
            # Leave exactly one champion sample out of the training data.
            xs_temp = np.delete(xs, left_out, axis=0)
            ys_temp = np.delete(ys, left_out)
            forest = BalancedRandomForestClassifier(n_estimators=ntree)
            forest.fit(xs_temp, ys_temp)
            fts.append(forest.feature_importances_)
    fts = np.vstack(fts)

    feature_names = ['off' + str(rank + 1) for rank in range(top)]
    feature_names += ['def' + str(rank + 1) for rank in range(top)]

    fig, ax = plt.subplots(figsize=(8,6))
    for position, _name in enumerate(feature_names):
        ax.boxplot(fts[:, position], positions=[position])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')
    plt.savefig(save_file)
    plt.close()
def test_balanced_random_forest_oob_binomial(ratio):
    """Regression test for #655: OOB score stays close to 0.5 on noise.

    The labels are drawn from a binomial distribution with probability
    ``ratio`` while the single feature is just the sample position, so the
    out-of-bag score must be within 0.1 of 0.5.
    """
    n_samples = 1000
    rng = np.random.RandomState(42)

    positions = np.arange(n_samples).reshape(-1, 1)
    labels = rng.binomial(1, ratio, size=n_samples)

    forest = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    forest.fit(positions, labels)

    assert np.abs(forest.oob_score_ - 0.5) < 0.1
def do_sampling_research(X_train, y_train, X_test, y_test, nBits=124, info_features=False):
    """Compare two ADASYN + balanced-forest configurations.

    Two pipelines are evaluated one after the other with
    ``display_confusion_matrix``: ADASYN at 0.25 with 100 trees, then ADASYN
    at 0.28 with 50 trees.  Output file names and titles contain ``nBits``.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        nBits: Fingerprint size, used only for naming the outputs.
        info_features: If True, also call ``get_info_features`` on the
            training labels before each run.
    """
    runs = (
        # Labelled "before sampling" in the output files.
        {
            "banner": "First tests:",
            "count_file": "Count_before_{}.pdf",
            "adasyn_ratio": 0.25,
            "n_trees": 100,
            "matrix_file": "confusion_matrix_before_{}.pdf",
            "title": "Confusion Matrix before sampling with {} nBits",
        },
        # Labelled "after sampling" in the output files.
        {
            "banner": "\nSecond test:",
            "count_file": "Count_after_{}.pdf",
            "adasyn_ratio": 0.28,
            "n_trees": 50,
            "matrix_file": "confusion_matrix_after_{}.pdf",
            "title": "Confusion Matrix after sampling with {} nBits",
        },
    )

    for run in runs:
        print(run["banner"])

        if info_features:
            get_info_features(y_train, pd.DataFrame({'ACTIVE': y_train}),
                              save=run["count_file"].format(nBits))

        pipeline = make_pipeline(
            ADASYN(sampling_strategy=run["adasyn_ratio"], random_state=64,
                   n_jobs=-1),
            BalancedRandomForestClassifier(n_estimators=run["n_trees"],
                                           random_state=18, n_jobs=-1))

        display_confusion_matrix(pipeline, X_train, y_train, X_test, y_test,
                                 save=run["matrix_file"].format(nBits),
                                 title=run["title"].format(nBits))
def evaluate_on_validation_or_test(self, test=False):
    """Fit the stored forest configuration and print its ROC AUC.

    Parameters are read from ``param_RF_<epoch>.json`` in
    ``self.result_folder`` (the ``'value'`` entry is removed first).  Missing
    values are imputed with medians learned from ``self.X``.

    Args:
        test: If falsy, train on ``self.X`` and report the AUC on the
            validation set.  If truthy, train on training + validation data
            and report the AUC on ``self.X_test``.

    Returns:
        None.  The AUC is printed.
    """
    with open(self.result_folder +
              '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)

    # NOTE(review): read as "the last entry of the file defines the model" -
    # confirm against the original indentation.
    for data in dati:
        del data['value']
        rf_model = BalancedRandomForestClassifier(**data)

    trainX = self.X
    trainy = self.y
    valx = self.X_validation
    valy = self.y_validation

    # The imputer is fitted once, on the training features only.
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer = median_imputer.fit(trainX)

    vtrainX = imputer.transform(trainX)
    trainX = pd.DataFrame(vtrainX, columns=trainX.columns,
                          index=trainX.index)
    vvalX = imputer.transform(valx)
    valx = pd.DataFrame(vvalX, columns=valx.columns, index=valx.index)

    if test:
        testx = self.X_test
        testy = self.y_test
        vtest = imputer.transform(testx)
        testx = pd.DataFrame(vtest, columns=testx.columns,
                             index=testx.index)
        # NOTE(review): read as "validation data joins the training set only
        # for the final test run" - confirm against the original indentation.
        trainX = pd.concat([trainX, valx])
        trainy = np.concatenate((trainy, valy))

    rf_model.fit(trainX, trainy)

    if test:
        roc_rf = roc_auc_score(testy, rf_model.predict_proba(testx)[:, 1])
        print("Test AUC: {}".format(str(roc_rf)))
    else:
        roc_rf = roc_auc_score(valy, rf_model.predict_proba(valx)[:, 1])
        print("Validation AUC: {}".format(str(roc_rf)))
def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator, params, clf_type, question):
    """Fit the selected balanced ensemble and score it on the test data.

    Args:
        train_x, train_y: Training features and labels.
        test_x, test_y: Test features and labels.
        estimator: One of ``'BalancedRandomForestClassifier'``,
            ``'BalancedBaggingClassifier'`` or ``'EasyEnsembleClassifier'``.
        params: Dict with the constructor parameters read for the chosen
            estimator (``n_estimators``, ``sampling_strategy`` and, for
            bagging, ``bootstrap`` and ``max_samples``).
        clf_type: Forwarded to ``self.calc_cross_val_scores``.
        question: Identifier stored in the result and forwarded to
            ``self.calc_cross_val_scores``.

    Returns:
        Tuple ``(cross_val_scores, estimator_scores)`` where the second item
        is a dict of test metrics (percentages rounded to two decimals,
        except F1 and ROC AUC which are rounded fractions).

    Raises:
        ValueError: If ``estimator`` is not one of the supported names.
    """
    if estimator == 'BalancedRandomForestClassifier':
        clf = BalancedRandomForestClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'BalancedBaggingClassifier':
        clf = BalancedBaggingClassifier(
            n_estimators=params['n_estimators'],
            bootstrap=params['bootstrap'],
            max_samples=params['max_samples'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'EasyEnsembleClassifier':
        clf = EasyEnsembleClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(f"Unsupported estimator: {estimator!r}")

    clf.fit(train_x, train_y)
    cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                  clf_type, question)

    predicted_labels = clf.predict(test_x)
    # Unpacking into four cells assumes a binary confusion matrix.
    tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
    specificity = round((tn / (tn + fp)) * 100, 2)

    # Probability of the second class column.
    predicted_prob_true = clf.predict_proba(test_x)[:, 1]

    estimator_scores = {
        'Question': question,
        'Accuracy': round(
            accuracy_score(test_y, predicted_labels) * 100, 2),
        'Balanced Accuracy': round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2),
        'Precision': round(
            precision_score(test_y, predicted_labels) * 100, 2),
        'Recall': round(
            recall_score(test_y, predicted_labels) * 100, 2),
        'Specificity': specificity,
        'F1': round(f1_score(test_y, predicted_labels), 2),
        'ROC AUC': round(roc_auc_score(test_y, predicted_prob_true), 2),
    }

    return cross_val_scores, estimator_scores
def get_classifiers():
    """Return one default-configured instance of every classifier compared.

    Returns:
        List of unfitted estimators, created in the order listed below.
    """
    classifier_types = (
        DummyClassifier,
        LogisticRegression,
        PassiveAggressiveClassifier,
        RidgeClassifier,
        SGDClassifier,
        KNeighborsClassifier,
        MLPClassifier,
        LinearSVC,
        NuSVC,
        SVC,
        DecisionTreeClassifier,
        ExtraTreeClassifier,
        AdaBoostClassifier,
        BaggingClassifier,
        ExtraTreesClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
        GaussianProcessClassifier,
        EasyEnsembleClassifier,
        BalancedBaggingClassifier,
        BalancedRandomForestClassifier,
        XGBClassifier,
    )
    return [make_classifier() for make_classifier in classifier_types]