# Example 1
# Example 1: evaluate a tuned GradientBoostingClassifier on the processed
# credit-scoring data via the Comparator test harness.
import pandas as pd  # fix: `pd` was used below but never imported in this section

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from comparator import Comparator as Tester

# Show every column when DataFrames are printed for inspection.
pd.set_option('display.max_columns', None)


df = pd.read_csv(r"./data/processed_data.csv", engine="python")

# 'SeriousDlqin2yrs' is the target (delinquency) column in this dataset.
tester1 = Tester('SeriousDlqin2yrs')
tester1.addDataset('processed_data', df)

# Best GBDT configuration found so far; earlier RF/GBDT candidates and
# plotting scratch code removed as dead commented-out code.
tester1.addModel('2', GradientBoostingClassifier(n_estimators=200,
                                                 learning_rate=0.08,
                                                 subsample=0.85,
                                                 max_depth=5,
                                                 min_samples_leaf=550))
tester1.runTests()
# Example 2
# Example 2: benchmark one tuned RandomForest across multiple
# outlier-treatment variants of the credit dataset.
# The dataset variables (df, removed_debt_outliers, repalace_debt_ratio,
# repalace98, dfus, drop98, add_outliers, best_data) and Comparator are
# assumed to be defined earlier in the file — not visible in this chunk.
# NOTE(review): the "repalace*" names look like typos of "replace*"; they are
# defined elsewhere, so they are left unchanged here.
tester = Comparator('SeriousDlqin2yrs')

tester.addDataset('missing data processed', df)
tester.addDataset('debt ratio outliers removed',
                  removed_debt_outliers)  # 164 removed
tester.addDataset('debt ratio outliers replaced',
                  repalace_debt_ratio)  # 164 removed
tester.addDataset('overdue outliers replaced', repalace98)  #269 removed
tester.addDataset('utilization outliers removed', dfus)  # 241 removed
tester.addDataset('overdue outliers removed', drop98)
tester.addDataset('outliers added', add_outliers)
tester.addDataset('best_data', best_data)

# rf_default = RandomForestClassifier()
# dbdt_default = GradientBoostingClassifier()
# tester.addModel('default RF', rf_default)
# tester.addModel('default GBDT ', dbdt_default)

# Single tuned RF evaluated on every dataset variant registered above.
rf = RandomForestClassifier(n_estimators=32,
                            max_depth=8,
                            random_state=0,
                            max_features='auto',
                            oob_score=True)
# dbdt = GradientBoostingClassifier(n_estimators=250, subsample=0.8, min_samples_split=1000, learning_rate=0.06, max_depth=6 )
tester.addModel('RF', rf)
# tester.addModel('GBDT', dbdt)

# tester.addModel('Simple SVM', svm.LinearSVC())

# runTests() is unpacked as (test AUC, train AUC, elapsed time) — see Example 4.
test_auc, train_auc, time_spent = tester.runTests()
# Example 3
# Example 3: measure the impact of artificially injected DebtRatio outliers
# on GBDT performance by comparing clean vs. corrupted copies of the data.
import pandas as pd  # fix: `pd` was used below but never imported in this section
import numpy

# fix: GradientBoostingClassifier is used below but was not imported here
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from comparator import Comparator

df = pd.read_csv(r"./data/processed_data.csv", engine="python")

# Corrupt ~5% of rows with extreme DebtRatio values in [3000, 30000).
# NOTE(review): randint can repeat indices, so slightly fewer than 5% of
# distinct rows may be touched; the RNG is unseeded, so runs are not
# reproducible — confirm whether that is intended.
add_outliers = df.copy()
outlier_count = int(df.shape[0] * 0.05)
index = numpy.random.randint(0, df.shape[0], outlier_count)
# Reset to a 0..n-1 positional index so `.at[i, ...]` hits the sampled rows.
add_outliers.reset_index(drop=True, inplace=True)
for i in index:
    add_outliers.at[i, 'DebtRatio'] = numpy.random.randint(3000, 30000)

comparator = Comparator('SeriousDlqin2yrs')

comparator.addDataset('data', df)
comparator.addDataset('outliers added', add_outliers)

# Compare a tuned GBDT against sklearn defaults on both dataset variants.
comparator.addModel(
    'tuned GBDT',
    GradientBoostingClassifier(n_estimators=200,
                               learning_rate=0.05,
                               subsample=0.85,
                               max_depth=5,
                               min_samples_leaf=500))
comparator.addModel('default GBDT', GradientBoostingClassifier())

comparator.runTests()
# Example 4
# Example 4: sweep one RandomForest hyperparameter (min_samples_leaf here)
# while holding the rest fixed, then plot train vs. test AUC over the sweep.
# Assumes `paras` (dict), the candidate lists (n_estimators, max_depth,
# max_features, min_samples_split, min_samples_leaf), `tester1`, `plt`,
# `HandlerLine2D` and RandomForestClassifier are defined earlier in the
# file — not visible in this chunk.
paras["n_estimators"] = n_estimators
paras["max_depth"] = max_depth
paras["max_features"] = max_features
paras["min_samples_split"] = min_samples_split
paras["min_samples_leaf"] = min_samples_leaf

# Name of the parameter being swept; used to index `paras` and label the plot.
to_tuning = 'min_samples_leaf'

rfc = []
for i in range(0, len(paras[to_tuning])):
    # One model per candidate value; all other hyperparameters held fixed.
    rfc.append(
        RandomForestClassifier(n_estimators=100,
                               max_depth=16,
                               max_features='auto',
                               min_samples_leaf=int(min_samples_leaf[i])))
    tester1.addModel(i, rfc[i])

# Per-model test AUC, train AUC and elapsed time, aligned with the sweep.
test_auc, train_auc, time_spent = tester1.runTests()

# plt.subplot(121)
line1, = plt.plot(paras[to_tuning], train_auc, 'b', label='Train AUC')
line2, = plt.plot(paras[to_tuning], test_auc, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel(to_tuning)
plt.ylim([0.85, 0.9])

# plt.subplot(122)
# line3, = plt.plot(paras[to_tuning], time_spent)
# plt.ylabel('time spent')
# plt.xlabel(to_tuning)
# Example 5
# Example 5: grid-search learning_rate x n_estimators for a GBDT, drawing one
# subplot of train/test AUC curves per learning-rate value.
# Relies on names defined earlier in the file: learning_rate, n_estimators,
# paras, pd, plt, HandlerLine2D, Comparator, GradientBoostingClassifier.

# fix: plt.subplot requires integer row/col counts; the original passed
# `len(learning_rate) / 2` (a float under true division). Ceiling division
# also keeps an odd number of learning rates from overflowing the grid.
n_rows = (len(learning_rate) + 1) // 2

for j in range(0, len(learning_rate)):
    models = []  # renamed from `rfc`: these are gradient-boosting models
    comparator = Comparator('SeriousDlqin2yrs')
    # Reload the data fresh for every learning-rate setting so each run
    # starts from an identical dataset.
    df = pd.read_csv(r"./data/processed_data.csv", engine="python")
    comparator.addDataset('processed_data', df)

    to_tuning = 'n_estimators'

    # One model per n_estimators candidate at the current learning rate.
    for i in range(0, len(paras[to_tuning])):
        models.append(
            GradientBoostingClassifier(n_estimators=n_estimators[i],
                                       learning_rate=learning_rate[j],
                                       subsample=0.85,
                                       max_depth=5,
                                       min_samples_leaf=550))
        comparator.addModel(i, models[i])
    test_auc, train_auc, time_spent = comparator.runTests()

    # Two-column grid; one panel per learning rate.
    plt.subplot(n_rows, 2, j + 1)
    plt.title("learning_rate=" + str(learning_rate[j]))
    line1, = plt.plot(paras[to_tuning], train_auc, 'b', label='Train AUC')
    line2, = plt.plot(paras[to_tuning], test_auc, 'r', label='Test AUC')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    plt.xlabel(to_tuning)

    plt.ylim([0.85, 0.875])
plt.show()