# Experiment script: evaluate a tuned GradientBoostingClassifier on the
# processed credit-default dataset ('SeriousDlqin2yrs' is the target column).
import pandas as pd  # fix: pd was used below (set_option / read_csv) but never imported here

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from comparator import Comparator as Tester

# Show every column when DataFrames are printed to the console.
pd.set_option('display.max_columns', None)

df = pd.read_csv(r"./data/processed_data.csv", engine="python")

tester1 = Tester('SeriousDlqin2yrs')
tester1.addDataset('processed_data', df)

# Earlier experiments, kept for reference:
# tester1.addModel('1', RandomForestClassifier(n_estimators=100, max_depth=16, max_features='auto', min_samples_leaf=100))
# tester1.addModel('2', RandomForestClassifier(n_estimators=100, max_depth=12, max_features='auto', min_samples_leaf=150))
# tester1.addModel('3', RandomForestClassifier(n_estimators=100, max_depth=8, max_features='auto'))
# tester1.addModel('1', GradientBoostingClassifier(n_estimators=200, learning_rate=0.08, subsample=0.95, max_depth=5, min_samples_leaf=43))

tester1.addModel('2', GradientBoostingClassifier(n_estimators=200,
                                                learning_rate=0.08,
                                                subsample=0.85,
                                                max_depth=5,
                                                min_samples_leaf=550))
tester1.runTests()

# Scratch plotting code for the n_estimators / learning_rate sweep,
# kept for reference (requires matplotlib.pyplot as plt):
# n_estimators = [30, 50, 70, 90, 110, 130]
# learning_rate = [0.03, 0.05, 0.08, 0.1, 0.13, 0.15]
# plt.subplot(1, 2, 1)
# line1 = plt.plot(n_estimators, learning_rate, 'y', label='time_spent')
# plt.subplot(1, 2, 2)
# line2 = plt.plot(n_estimators, learning_rate, 'y', label='time_spent')
# plt.ylabel('time_spent')
# plt.ylim([0.84, 0.87])
# df = pd.read_csv(r"./data/processed_data.csv", engine="python")
# print(df.shape[0])
# Benchmark one RandomForest configuration across several preprocessing
# variants of the dataset. The variant DataFrames (removed_debt_outliers,
# repalace_debt_ratio, repalace98, dfus, drop98, add_outliers, best_data)
# are built earlier in the file.
tester = Comparator('SeriousDlqin2yrs')

# Register each preprocessing variant under a human-readable label.
tester.addDataset('missing data processed', df)
tester.addDataset('debt ratio outliers removed', removed_debt_outliers)  # 164 removed
tester.addDataset('debt ratio outliers replaced', repalace_debt_ratio)  # 164 removed
tester.addDataset('overdue outliers replaced', repalace98)  # 269 removed
tester.addDataset('utilization outliers removed', dfus)  # 241 removed
tester.addDataset('overdue outliers removed', drop98)
tester.addDataset('outliers added', add_outliers)
tester.addDataset('best_data', best_data)

# Baseline (default-parameter) models, kept for reference:
# rf_default = RandomForestClassifier()
# dbdt_default = GradientBoostingClassifier()
# tester.addModel('default RF', rf_default)
# tester.addModel('default GBDT ', dbdt_default)

# NOTE(review): max_features='auto' is deprecated/removed in newer
# scikit-learn releases for RandomForest — confirm the pinned version.
rf = RandomForestClassifier(n_estimators=32,
                            max_depth=8,
                            random_state=0,
                            max_features='auto',
                            oob_score=True)
# dbdt = GradientBoostingClassifier(n_estimators=250, subsample=0.8, min_samples_split=1000, learning_rate=0.06, max_depth=6)

tester.addModel('RF', rf)
# tester.addModel('GBDT', dbdt)
# tester.addModel('Simple SVM', svm.LinearSVC())

test_auc, train_auc, time_spent = tester.runTests()
# Robustness experiment: inject artificial DebtRatio outliers into ~5% of
# rows and compare model performance on the clean vs. corrupted data.
import numpy

import pandas as pd  # fix: pd was used below but never imported in this script

from sklearn.ensemble import GradientBoostingClassifier  # fix: used below but not imported
from sklearn.ensemble import RandomForestClassifier
from comparator import Comparator

df = pd.read_csv(r"./data/processed_data.csv", engine="python")

# Corrupt a copy of the data: draw ~5% of row positions (duplicates
# possible, so the effective corruption rate is at most 5%) and overwrite
# DebtRatio with a huge random value.
add_outliers = df.copy()
outlier_count = int(df.shape[0] * 0.05)
index = numpy.random.randint(0, df.shape[0], outlier_count)
# Reset so positional draws 0..n-1 line up with the index labels .at uses.
add_outliers.reset_index(drop=True, inplace=True)
for i in index:
    add_outliers.at[i, 'DebtRatio'] = numpy.random.randint(3000, 30000)

comparator = Comparator('SeriousDlqin2yrs')
comparator.addDataset('data', df)
comparator.addDataset('outliers added', add_outliers)

# RandomForest runs, kept for reference:
# comparator.addModel('tuned RF', RandomForestClassifier(n_estimators=100, max_depth=16, max_features='auto', min_samples_leaf=100))
# comparator.addModel('default RF', RandomForestClassifier())

comparator.addModel(
    'tuned GBDT',
    GradientBoostingClassifier(n_estimators=200,
                               learning_rate=0.05,
                               subsample=0.85,
                               max_depth=5,
                               min_samples_leaf=500))
comparator.addModel('default GBDT', GradientBoostingClassifier())
comparator.runTests()
# Hyper-parameter sweep: hold the other RandomForest settings fixed and
# vary min_samples_leaf, then plot train/test AUC against it.
# NOTE(review): `paras`, the candidate-value lists, `tester1`, `plt` and
# HandlerLine2D are assumed to be defined earlier in the file — confirm.
paras["n_estimators"] = n_estimators
paras["max_depth"] = max_depth
paras["max_features"] = max_features
paras["min_samples_split"] = min_samples_split
paras["min_samples_leaf"] = min_samples_leaf

to_tuning = 'min_samples_leaf'

# Build one model per candidate value and register it under its index.
rfc = []
for i, leaf in enumerate(paras[to_tuning]):
    model = RandomForestClassifier(n_estimators=100,
                                   max_depth=16,
                                   max_features='auto',
                                   min_samples_leaf=int(leaf))
    rfc.append(model)
    tester1.addModel(i, model)

test_auc, train_auc, time_spent = tester1.runTests()

# Plot AUC curves over the tuned parameter.
# plt.subplot(121)
line1, = plt.plot(paras[to_tuning], train_auc, 'b', label='Train AUC')
line2, = plt.plot(paras[to_tuning], test_auc, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel(to_tuning)
plt.ylim([0.85, 0.9])

# Timing plot, kept for reference:
# plt.subplot(122)
# line3, = plt.plot(paras[to_tuning], time_spent)
# plt.ylabel('time spent')
# plt.xlabel(to_tuning)
# # plt.show()
# Grid search: for each learning_rate, sweep n_estimators and plot the
# train/test AUC curves, one subplot per learning rate.
# NOTE(review): `paras`, `learning_rate`, `n_estimators`, `plt` and
# HandlerLine2D are assumed to be defined earlier in the file — confirm.

# Hoisted out of the loop: the CSV does not change between iterations.
df = pd.read_csv(r"./data/processed_data.csv", engine="python")

to_tuning = 'n_estimators'

# Fix: plt.subplot() requires integer row/column counts; in Python 3
# `len(learning_rate) / 2` yields a float and modern matplotlib rejects it.
# Ceiling division also keeps enough rows when the list length is odd.
n_rows = (len(learning_rate) + 1) // 2

for j in range(len(learning_rate)):
    comparator = Comparator('SeriousDlqin2yrs')
    comparator.addDataset('processed_data', df)

    # One GBDT per n_estimators candidate, all at this learning rate.
    models = []
    for i in range(len(paras[to_tuning])):
        models.append(
            GradientBoostingClassifier(n_estimators=n_estimators[i],
                                       learning_rate=learning_rate[j],
                                       subsample=0.85,
                                       max_depth=5,
                                       min_samples_leaf=550))
        comparator.addModel(i, models[i])

    test_auc, train_auc, time_spent = comparator.runTests()

    plt.subplot(n_rows, 2, j + 1)
    plt.title("learning_rate=" + str(learning_rate[j]))
    line1, = plt.plot(paras[to_tuning], train_auc, 'b', label='Train AUC')
    line2, = plt.plot(paras[to_tuning], test_auc, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    plt.xlabel(to_tuning)
    plt.ylim([0.85, 0.875])

plt.show()