"""Quick benchmark: score a tuned GBDT on the processed credit-scoring dataset."""
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from comparator import Comparator as Tester

# Show every column when DataFrames are printed.
pd.set_option('display.max_columns', None)

dataset = pd.read_csv(r"./data/processed_data.csv", engine="python")

# 'SeriousDlqin2yrs' is the label column the Tester evaluates against.
bench = Tester('SeriousDlqin2yrs')
bench.addDataset('processed_data', dataset)

# Earlier experiments, kept for reference:
# bench.addModel('1', RandomForestClassifier(n_estimators=100, max_depth=16, max_features='auto', min_samples_leaf=100))
# bench.addModel('2', RandomForestClassifier(n_estimators=100, max_depth=12, max_features='auto', min_samples_leaf=150))
# bench.addModel('3', RandomForestClassifier(n_estimators=100, max_depth=8, max_features='auto'))
# bench.addModel('1', GradientBoostingClassifier(n_estimators=200, learning_rate=0.08, subsample=0.95, max_depth=5, min_samples_leaf=43))

bench.addModel(
    '2',
    GradientBoostingClassifier(n_estimators=200, learning_rate=0.08,
                               subsample=0.85, max_depth=5,
                               min_samples_leaf=550))
bench.runTests()

# Plotting scaffold from an earlier tuning session, kept for reference:
# n_estimators = [30, 50, 70, 90, 110, 130]
# learning_rate = [0.03, 0.05, 0.08, 0.1, 0.13, 0.15]
# plt.subplot(1, 2, 1)
# line1 = plt.plot(n_estimators, learning_rate, 'y', label='time_spent')
#
# plt.subplot(1, 2, 2)
# line2 = plt.plot(n_estimators, learning_rate, 'y', label='time_spent')
# plt.ylabel('time_spent')
#
# plt.ylim([0.84, 0.87])
"""Measure model sensitivity to injected DebtRatio outliers.

Trains/evaluates each model on the clean dataset and on a copy in which
roughly 5% of the rows have an extreme DebtRatio value.
"""
import numpy

import pandas as pd  # fix: pd.read_csv below raised NameError — pandas was never imported

from sklearn.ensemble import GradientBoostingClassifier  # fix: used below but was never imported
from sklearn.ensemble import RandomForestClassifier

from comparator import Comparator

df = pd.read_csv(r"./data/processed_data.csv", engine="python")

# Inject synthetic outliers into ~5% of rows.  randint may draw duplicate
# indices, so the effective number of modified rows can be slightly lower
# than outlier_count.
add_outliers = df.copy()
outlier_count = int(df.shape[0] * 0.05)
index = numpy.random.randint(0, df.shape[0], outlier_count)
add_outliers.reset_index(drop=True, inplace=True)
for i in index:
    add_outliers.at[i, 'DebtRatio'] = numpy.random.randint(3000, 30000)

# Compare each model on both datasets; 'SeriousDlqin2yrs' is the label column.
comparator = Comparator('SeriousDlqin2yrs')
comparator.addDataset('data', df)
comparator.addDataset('outliers added', add_outliers)

# Earlier RF experiments, kept for reference:
# comparator.addModel('tuned RF', RandomForestClassifier(n_estimators=100, max_depth=16, max_features='auto', min_samples_leaf=100))
# comparator.addModel('default RF', RandomForestClassifier())

comparator.addModel(
    'tuned GBDT',
    GradientBoostingClassifier(n_estimators=200, learning_rate=0.05,
                               subsample=0.85, max_depth=5,
                               min_samples_leaf=500))
comparator.addModel('default GBDT', GradientBoostingClassifier())
comparator.runTests()
# NOTE(review): this chunk is truncated — the opening of the first statement
# (presumably `best_data.loc[best_data['NumberOfTime30-59DaysPastDueNotWorse'] > 90,`,
# by analogy with the two statements below) lies in the preceding, unseen portion.
'NumberOfTime30-59DaysPastDueNotWorse'] = 18
# Overdue-count columns use values > 90 as sentinel codes; clamp them to 18.
best_data.loc[best_data['NumberOfTime60-89DaysPastDueNotWorse'] > 90, 'NumberOfTime60-89DaysPastDueNotWorse'] = 18
best_data.loc[best_data['NumberOfTimes90DaysLate'] > 90, 'NumberOfTimes90DaysLate'] = 18
# Drop rows whose revolving-utilization ratio is implausibly large (> 10).
best_data = best_data.drop(
    best_data[best_data['RevolvingUtilizationOfUnsecuredLines'] > 10].index)
# 'Unnamed: 0' is presumably a stray index column from an earlier to_csv
# without index=False — TODO confirm against the file on disk.
df = df.drop('Unnamed: 0', axis=1)
df.to_csv(r"./data/processed_data.csv")
# Verification: compare every cleaning variant against the baseline.
tester = Comparator('SeriousDlqin2yrs')
tester.addDataset('missing data processed', df)
tester.addDataset('debt ratio outliers removed', removed_debt_outliers)  # 164 removed
# NOTE(review): 'repalace...' names are typos for 'replace...' but are defined
# in the unseen portion of this file — do not rename here alone.
tester.addDataset('debt ratio outliers replaced', repalace_debt_ratio)  # 164 removed
tester.addDataset('overdue outliers replaced', repalace98)  # 269 removed
tester.addDataset('utilization outliers removed', dfus)  # 241 removed
tester.addDataset('overdue outliers removed', drop98)
tester.addDataset('outliers added', add_outliers)
tester.addDataset('best_data', best_data)
# rf_default = RandomForestClassifier()
# dbdt_default = GradientBoostingClassifier()
# tester.addModel('default RF', rf_default)
# tester.addModel('default GBDT ', dbdt_default)
# Candidate hyperparameter values for GBDT tuning.
loss = ['deviance', 'exponential']
# NOTE(review): linspace returns floats; sklearn treats min_samples_leaf
# floats as fractions only in (0, 1), so values like 1500.0 would raise —
# confirm they are cast to int before being passed to the estimator.
min_samples_leaf = np.linspace(1, 1500, 15, endpoint=True)
# NOTE(review): `paras` and the n_estimators / learning_rate / max_depth /
# max_features / min_samples_split / subsample grids are defined before this
# chunk and are not visible here.
paras["n_estimators"] = n_estimators
paras["learning_rate"] = learning_rate
paras["max_depth"] = max_depth
paras["max_features"] = max_features
paras["min_samples_split"] = min_samples_split
paras["min_samples_leaf"] = min_samples_leaf
paras["loss"] = loss
paras["subsample"] = subsample
# Default parameters: ('default GBDT', 'processed_data') --> AUC: 0.8638 (+/- 0.0059)
comparator = Comparator('SeriousDlqin2yrs')
df = pd.read_csv(r"./data/processed_data.csv", engine="python")
comparator.addDataset('processed_data', df)
# Name of the hyperparameter currently being swept.
to_tuning = 'loss'
# Sweep + plotting scaffold from an earlier tuning session, kept for reference:
# rfc = []
# for i in range(0, len(paras[to_tuning])):
#     rfc.append(GradientBoostingClassifier(n_estimators=200, learning_rate=0.08, subsample=0.85, max_depth=5, min_samples_leaf=550, loss = loss[i]))
#     comparator.addModel(i, rfc[i])
# test_auc, train_auc, time_spent = comparator.runTests()
# line1, = plt.plot(paras[to_tuning], train_auc, 'b', label='Train AUC')
# line2, = plt.plot(paras[to_tuning], test_auc, 'r', label='Test AUC')
# plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
# plt.ylabel('AUC score')
# plt.xlabel(to_tuning)
# plt.ylim([0.845, 0.9])
#