# Example 1
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from comparator import Comparator as Tester
pd.set_option('display.max_columns', None)


# Example 1: score a tuned GBDT classifier on the preprocessed credit dataset
# using the project's Comparator (aliased as Tester) harness.
df = pd.read_csv(r"./data/processed_data.csv", engine="python")

tester1 = Tester('SeriousDlqin2yrs')
tester1.addDataset('processed_data', df)

# NOTE: earlier experiments (kept out of the run) tried several RandomForest
# configurations and a GBDT with subsample=0.95 / min_samples_leaf=43, plus a
# matplotlib sketch plotting time spent against n_estimators/learning_rate.
# The configuration below is the one currently under evaluation.
model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.08,
    subsample=0.85,
    max_depth=5,
    min_samples_leaf=550,
)
tester1.addModel('2', model)
tester1.runTests()
# Example 2
from sklearn.ensemble import RandomForestClassifier
from comparator import Comparator
import numpy

# Example 2: measure model sensitivity to outliers by corrupting DebtRatio on
# a random ~5% of rows and comparing scores on the clean vs. corrupted data.
# FIX: this standalone snippet used `pd` and `GradientBoostingClassifier`
# without importing them (its own imports only cover numpy, Comparator and
# RandomForestClassifier) — import them here so the example runs on its own.
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

df = pd.read_csv(r"./data/processed_data.csv", engine="python")

# Build the corrupted copy: overwrite DebtRatio with an extreme random value
# in [3000, 30000) at randomly sampled row positions. Sampled positions may
# repeat, so at most (not exactly) 5% of rows are corrupted.
add_outliers = df.copy()
outlier_count = int(df.shape[0] * 0.05)
index = numpy.random.randint(0, df.shape[0], outlier_count)
# Reset to a positional RangeIndex so `.at[i, ...]` addresses rows by position.
add_outliers.reset_index(drop=True, inplace=True)
for i in index:
    add_outliers.at[i, 'DebtRatio'] = numpy.random.randint(3000, 30000)

comparator = Comparator('SeriousDlqin2yrs')

comparator.addDataset('data', df)
comparator.addDataset('outliers added', add_outliers)

# Earlier runs also compared RandomForest configurations:
# comparator.addModel('tuned RF', RandomForestClassifier(n_estimators=100, max_depth=16, max_features='auto', min_samples_leaf=100))
# comparator.addModel('default RF', RandomForestClassifier())
comparator.addModel(
    'tuned GBDT',
    GradientBoostingClassifier(n_estimators=200,
                               learning_rate=0.05,
                               subsample=0.85,
                               max_depth=5,
                               min_samples_leaf=500))
comparator.addModel('default GBDT', GradientBoostingClassifier())

comparator.runTests()
# Example 3
# NOTE(review): the line below is the tail of a truncated `best_data.loc[...]`
# assignment — its opening line is missing from this excerpt.
              'NumberOfTime30-59DaysPastDueNotWorse'] = 18
# Cap out-of-range past-due counts (> 90) at 18 — the > 90 values are
# presumably sentinel codes rather than real counts; TODO confirm.
best_data.loc[best_data['NumberOfTime60-89DaysPastDueNotWorse'] > 90,
              'NumberOfTime60-89DaysPastDueNotWorse'] = 18
best_data.loc[best_data['NumberOfTimes90DaysLate'] > 90,
              'NumberOfTimes90DaysLate'] = 18

# Drop rows with extreme revolving-utilization ratios (> 10).
best_data = best_data.drop(
    best_data[best_data['RevolvingUtilizationOfUnsecuredLines'] > 10].index)

# Persist the processed frame. 'Unnamed: 0' looks like a leftover index
# column from an earlier to_csv/read_csv round trip.
df = df.drop('Unnamed: 0', axis=1)
df.to_csv(r"./data/processed_data.csv")

# Validation: compare model quality across the candidate cleaned datasets.
# NOTE(review): removed_debt_outliers, repalace_debt_ratio, repalace98, dfus,
# drop98 and add_outliers are defined earlier, outside this excerpt.
tester = Comparator('SeriousDlqin2yrs')

tester.addDataset('missing data processed', df)
tester.addDataset('debt ratio outliers removed',
                  removed_debt_outliers)  # 164 removed
tester.addDataset('debt ratio outliers replaced',
                  repalace_debt_ratio)  # 164 removed
tester.addDataset('overdue outliers replaced', repalace98)  #269 removed
tester.addDataset('utilization outliers removed', dfus)  # 241 removed
tester.addDataset('overdue outliers removed', drop98)
tester.addDataset('outliers added', add_outliers)
tester.addDataset('best_data', best_data)

# rf_default = RandomForestClassifier()
# dbdt_default = GradientBoostingClassifier()
# tester.addModel('default RF', rf_default)
# tester.addModel('default GBDT ', dbdt_default)
# Example 4
# Candidate hyper-parameter grids for GBDT tuning.
# NOTE(review): n_estimators, learning_rate, max_depth, max_features,
# min_samples_split, subsample and the `paras` dict are assigned earlier,
# outside this excerpt.
# NOTE(review): loss='deviance' was renamed to 'log_loss' in newer
# scikit-learn releases — verify against the pinned sklearn version.
loss = ['deviance', 'exponential']
min_samples_leaf = np.linspace(1, 1500, 15, endpoint=True)

# Register every grid under its parameter name so the parameter under study
# can be selected by a single key (see `to_tuning` below).
paras["n_estimators"] = n_estimators
paras["learning_rate"] = learning_rate
paras["max_depth"] = max_depth
paras["max_features"] = max_features
paras["min_samples_split"] = min_samples_split
paras["min_samples_leaf"] = min_samples_leaf
paras["loss"] = loss
paras["subsample"] = subsample

# Baseline with default parameters:
# ('default GBDT', 'processed_data') --> AUC: 0.8638 (+/- 0.0059)
comparator = Comparator('SeriousDlqin2yrs')
df = pd.read_csv(r"./data/processed_data.csv", engine="python")
comparator.addDataset('processed_data', df)

# Name of the parameter whose grid will be swept.
to_tuning = 'loss'

# Commented-out sweep (kept for reference): build one model per candidate
# value of `to_tuning`, run the comparison, and plot train/test AUC.
# rfc = []
# for i in range(0, len(paras[to_tuning])):
#     rfc.append(GradientBoostingClassifier(n_estimators=200, learning_rate=0.08, subsample=0.85, max_depth=5, min_samples_leaf=550, loss = loss[i]))
#     comparator.addModel(i, rfc[i])
# test_auc, train_auc, time_spent = comparator.runTests()
# line1, = plt.plot(paras[to_tuning], train_auc, 'b', label='Train AUC')
# line2, = plt.plot(paras[to_tuning], test_auc, 'r', label='Test AUC')
# plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
# plt.ylabel('AUC score')
# plt.xlabel(to_tuning)
# plt.ylim([0.845, 0.9])
#