@author: 87875 """ # Traditional Credit Scoring Using Logistic Regression import scorecardpy as sc import matplotlib.pyplot as plt # data prepare ------ # load germancredit data dat = sc.germancredit() # filter variable via missing rate, iv, identical value rate dt_s = sc.var_filter(dat, y="creditability") # breaking dt into train and test train, test = sc.split_df(dt_s, 'creditability').values() # woe binning ------ bins = sc.woebin(dt_s, y="creditability") print(type(bins)) for k, v in bins.items(): print(k) print(bins["purpose"]) print(bins["purpose"].columns) print(type(bins["purpose"])) # sc.woebin_plot(bins["purpose"]) # plt.show() # ============================================================================= # print("qq: 1467288927")
'BAD', method='chimerge', x=['JOB'], breaks_list=break_list) bins['JOB'] = job_bins['JOB'] # Plot WOE bins # fig, axs = plt.subplots(ncols=2) # sc.woebin_plot(bins, figsize=[8,5]) # Print results of binning # for k, bin_ in bins.items(): # print(bins[k].iloc[:,0:-2].round(2).to_latex(index=False)) # split into train and test set train, test = sc.split_df(df, 'BAD').values() # Convert values into woe train_woe = sc.woebin_ply(train, bins) test_woe = sc.woebin_ply(test, bins) # Add constant train_woe = sm.add_constant(train_woe) test_woe = sm.add_constant(test_woe) y_train = train_woe.loc[:, 'BAD'] X_train = train_woe.loc[:, train_woe.columns != 'BAD'] y_test = test_woe.loc[:, 'BAD'] X_test = test_woe.loc[:, train_woe.columns != 'BAD'] # Fit logit model
def split(self):
    """Split ``self.df`` into a train and a test part.

    Delegates to ``sc.split_df`` with ``y='DEFAULT_FLAG'``, ``ratio=0.8``
    and the fixed ``seed=186``, so repeated calls use the same arguments.

    Returns:
        A ``(train, test)`` tuple holding the two values of the mapping
        returned by ``sc.split_df``, in that order.
    """
    parts = sc.split_df(self.df, y='DEFAULT_FLAG', ratio=0.8, seed=186)
    train, test = parts.values()
    return train, test
for e in SERVICES:
    # Detecting a particular service
    COEF2 = pd.DataFrame()
    # Keep only the rows of the current service; the SERVICES column is then
    # the same for every row, so it is dropped. `.drop` returns a new frame,
    # leaving data_x untouched without copying all of it first.
    data = data_x.loc[data_x['SERVICES'] == e].drop(['SERVICES'], axis=1)

    # ------------------ Grouping zone if there is not enough data ------------------------
    Clases_UPZ = np.unique(data['ZONE'])
    Data_UPZ = data.groupby('ZONE').groups
    for i in Clases_UPZ:
        numero_clases = Data_UPZ[i]  # row labels belonging to zone `i`
        if len(numero_clases) < 10:
            # Single .loc call on the frame itself: the chained form
            # data['ZONE'].loc[...] = ... may write to a temporary object
            # and leave `data` unchanged.
            data.loc[numero_clases, 'ZONE'] = 'ZONE_Other'

    # -------------------- 1. OBTAINING BINS ---------------------------------------
    train_b, test_b = sc.split_df(data, y='OUTCOME', ratio=0.7, seed=100).values()
    bins = sc.woebin(
        train_b,
        y='OUTCOME',
        min_perc_fine_bin=0.01,    # Minimum share of rows per initial (fine) bin
        min_perc_coarse_bin=0.05,  # Minimum share of rows per final bin
        stop_limit=0.2,            # Stopping threshold for further splitting (see woebin docs)
        max_num_bin=10,            # Maximum number of bins
        method='tree')

    # Transforming variables to dummies
    train, test, deleted_var = dummies_on(train_b, test_b, bins, continuous)

    # Defining Train Data and Test Data
    X_train = train[train.columns.difference(['OUTCOME'])]
    Y_train = train['OUTCOME'].astype('int')
# By default this drops variables with information value < 0.02,
# missing rate > 95%, or a single-category share > 95%.
dt_s = sc.var_filter(data, y='status')
print('变量预处理前后变化:', data.shape, '->', dt_s.shape)
# print(data.columns)
# print(dt_s.columns)

# WOE binning and transformation
# NOTE(review): the bins are fitted on the whole filtered frame (dt_s),
# not only on `train` -- confirm this is intended.
bins = sc.woebin(dt_s, y='status')
# bins

train, test = sc.split_df(dt_s, 'status').values()
print('训练集、测试集划分比例为:', train.shape[0], ':', test.shape[0])

train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)
# train_woe.head()

# Separate the target column ('status') from the predictors.
y_train = train_woe.loc[:, 'status']
X_train = train_woe.loc[:, train_woe.columns != 'status']
y_test = test_woe.loc[:, 'status']
# Build the mask from the test frame's own columns: a mask derived from
# train_woe is only valid when both frames have identical column layouts.
X_test = test_woe.loc[:, test_woe.columns != 'status']

# L1-penalised logistic regression fitted on the WOE-encoded training data.
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(X_train, y_train)
# Keep column 1 of predict_proba as the training-set score.
train_pred = lr.predict_proba(X_train)[:, 1]
# Keep the selected columns and remove 'diabetes'. A non-inplace drop is used
# so the result is a fresh frame rather than an in-place edit of a selection.
df = df[cols].drop(['diabetes'], axis=1)

# First pass: chi-merge binning of all variables against 'outcome'.
bins = sc.woebin(df, 'outcome', method='chimerge')

# Second pass: every column from the third one onwards is re-binned with a
# single manual break at 1.0, overriding its entry from the first pass.
cols = df.iloc[:, 2:].columns
break_list = {col: [1.0] for col in cols}
bins.update(sc.woebin(df, 'outcome', method='chimerge',
                      x=cols.tolist(), breaks_list=break_list))

# split into train and test set
train, test = sc.split_df(df, 'outcome').values()

# Convert values into woe
train_woe = sc.woebin_ply(train, bins)
test_woe = sc.woebin_ply(test, bins)

# Add the constant column to both design matrices.
train_woe = sm.add_constant(train_woe)
test_woe = sm.add_constant(test_woe)

# Separate the target column ('outcome') from the predictors.
y_train = train_woe.loc[:, 'outcome']
X_train = train_woe.loc[:, train_woe.columns != 'outcome']
y_test = test_woe.loc[:, 'outcome']
# Build the mask from the test frame's own columns: a mask derived from
# train_woe is only valid when both frames have identical column layouts.
X_test = test_woe.loc[:, test_woe.columns != 'outcome']

# Fit logit model
lr = sm.GLM(y_train, X_train, family=sm.families.Binomial())