Example #1
"""
@author: 87875
"""

# Traditional Credit Scoring Using Logistic Regression
import scorecardpy as sc
import matplotlib.pyplot as plt

# data preparation ------
# load germancredit data
dat = sc.germancredit()

# filter variables by missing rate, IV, and identical-value rate
dt_s = sc.var_filter(dat, y="creditability")

# split the data into train and test sets
train, test = sc.split_df(dt_s, 'creditability').values()

# woe binning ------
bins = sc.woebin(dt_s, y="creditability")
print(type(bins))
for k, v in bins.items():
    print(k)

print(bins["purpose"])
print(bins["purpose"].columns)
print(type(bins["purpose"]))
# sc.woebin_plot(bins["purpose"])
# plt.show()

# =============================================================================
# print("qq: 1467288927")
Example #2
import scorecardpy as sc
import statsmodels.api as sm

# Re-bin JOB with manual breaks via ChiMerge (df and break_list are defined earlier)
job_bins = sc.woebin(df, y='BAD',
                     method='chimerge',
                     x=['JOB'],
                     breaks_list=break_list)
bins['JOB'] = job_bins['JOB']

# Plot WOE bins
# fig, axs = plt.subplots(ncols=2)
# sc.woebin_plot(bins, figsize=[8,5])

# Print results of binning
# for k, bin_ in bins.items():
#     print(bins[k].iloc[:,0:-2].round(2).to_latex(index=False))

# split into train and test set
train, test = sc.split_df(df, 'BAD').values()

# Convert values into woe
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)

# Add constant
train_woe = sm.add_constant(train_woe)
test_woe = sm.add_constant(test_woe)

y_train = train_woe.loc[:, 'BAD']
X_train = train_woe.loc[:, train_woe.columns != 'BAD']
y_test = test_woe.loc[:, 'BAD']
X_test = test_woe.loc[:, test_woe.columns != 'BAD']

# Fit logit model
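# A minimal sketch of the fit via a binomial GLM in statsmodels (the original
# excerpt stops at the comment above); reuses X_train and y_train as defined:
lr = sm.GLM(y_train, X_train, family=sm.families.Binomial())
lr_fit = lr.fit()
print(lr_fit.summary())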
Example #3
def split(self):
    train, test = sc.split_df(self.df,
                              y='DEFAULT_FLAG',
                              ratio=0.8,
                              seed=186).values()
    return train, test
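
# Note: sc.split_df returns a dict of the two splits ({'train': ..., 'test': ...}),
# which is why .values() is unpacked above. An equivalent, more explicit sketch
# (df is a hypothetical dataframe standing in for self.df):
splits = sc.split_df(df, y='DEFAULT_FLAG', ratio=0.8, seed=186)
train, test = splits['train'], splits['test']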
Example #4
for e in SERVICES:
    # Filter the rows for this particular service
    COEF2 = pd.DataFrame()
    data = data_x.copy()
    data = data.loc[data['SERVICES'] == e]
    data.drop(['SERVICES'], axis=1, inplace=True)
    # ------------------ Group rare zones together when there is not enough data ------------------
    Clases_UPZ = np.unique(data['ZONE'])
    Data_UPZ = data.groupby('ZONE').groups
    for i in Clases_UPZ:
        numero_clases = Data_UPZ[i]
        if len(numero_clases) < 10:
            data.loc[numero_clases, 'ZONE'] = 'ZONE_Other'

#-------------------- 1. OBTAINING BINS---------------------------------------
    train_b, test_b = sc.split_df(data, y='OUTCOME', ratio=0.7,
                                  seed=100).values()
    bins = sc.woebin(
        train_b,
        y='OUTCOME',
        min_perc_fine_bin=0.01,  # Minimum proportion per initial fine bin
        min_perc_coarse_bin=0.05,  # Minimum proportion per final (coarse) bin
        stop_limit=0.2,  # Stop splitting when the IV gain ratio drops below this
        max_num_bin=10,  # Maximum number of bins
        method='tree')

    #Transforming variables to dummies
    train, test, deleted_var = dummies_on(train_b, test_b, bins, continuous)
    #Defining Train Data and Test Data
    X_train = train[train.columns.difference(['OUTCOME'])]
    Y_train = train['OUTCOME']
    Y_train = Y_train.astype('int')
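
    # The excerpt ends before the model is fit; a minimal sketch of the usual
    # next step for each service, assuming
    # `from sklearn.linear_model import LogisticRegression` at the top of the file:
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, Y_train)
    print(e, 'train accuracy:', model.score(X_train, Y_train))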
Example #5


# By default, drop variables with information value < 0.02, missing rate > 95%,
# or identical-value rate > 95%
dt_s = sc.var_filter(data, y='status')
print('Shape before/after variable filtering:', data.shape, '->', dt_s.shape)
#print(data.columns)
#print(dt_s.columns)



# Binning and WOE transformation
bins = sc.woebin(dt_s, y='status')
# bins

train, test = sc.split_df(dt_s, 'status').values()
print('Train/test split sizes:', train.shape[0], ':', test.shape[0])

train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)
#train_woe.head()

y_train = train_woe.loc[:,'status']
X_train = train_woe.loc[:, train_woe.columns != 'status']
y_test = test_woe.loc[:, 'status']
X_test = test_woe.loc[:, test_woe.columns != 'status']

lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(X_train, y_train)

train_pred = lr.predict_proba(X_train)[:, 1]
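
# A minimal sketch of the usual evaluation step (KS / ROC) with scorecardpy,
# reusing the variables defined above:
test_pred = lr.predict_proba(X_test)[:, 1]
train_perf = sc.perf_eva(y_train, train_pred, title='train')
test_perf = sc.perf_eva(y_test, test_pred, title='test')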
Example #6
df = df[cols]

df.drop(['diabetes'], axis=1, inplace=True)

bins = sc.woebin(df, 'outcome', method='chimerge')

cols = df.iloc[:, 2:].columns
break_list = {}
for col in cols:
    break_list[col] = [1.0]
    
bins.update(sc.woebin(df, 'outcome', method='chimerge', x=cols.tolist(), 
                      breaks_list=break_list))

# split into train and test set
train, test = sc.split_df(df, 'outcome').values()

# Convert values into woe
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)

train_woe = sm.add_constant(train_woe)
test_woe = sm.add_constant(test_woe)

y_train = train_woe.loc[:,'outcome']
X_train = train_woe.loc[:,train_woe.columns != 'outcome']
y_test = test_woe.loc[:,'outcome']
X_test = test_woe.loc[:, test_woe.columns != 'outcome']

# Fit logit model
lr = sm.GLM(y_train, X_train, family=sm.families.Binomial())
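
# The snippet ends before the GLM is actually fit; a minimal sketch of the
# remaining steps (fit, predict, evaluate), reusing the variables above:
lr_fit = lr.fit()
train_pred = lr_fit.predict(X_train)
test_pred = lr_fit.predict(X_test)
train_perf = sc.perf_eva(y_train, train_pred, title='train')
test_perf = sc.perf_eva(y_test, test_pred, title='test')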