def woe_transform(self, train, test):
    """Fit WOE bins on the training data and apply them to train and test.

    The training frame is passed through ``sc.var_filter`` (target
    ``'DEFAULT_FLAG'``, with ``var_kp='INDUSTRY'``) and then through
    ``self.encode_categorical``. WOE bins are fitted on the result with
    ``sc.woebin`` and applied with ``sc.woebin_ply``. The test frame is
    restricted to a fixed list of columns, passed through
    ``self.encode_categorical`` and transformed with the same bins.

    Args:
        train: Training data containing a ``'DEFAULT_FLAG'`` column.
        test: Test data containing every column in the fixed list below.

    Returns:
        A ``(train_woe, test_woe)`` tuple of the WOE-transformed data.
    """
    # Columns taken from the test frame before encoding / WOE transformation.
    selected_columns = [
        'ACCESS_CREDIT',
        'ASSESSMENT_YEAR',
        'MEDIUM_TERM_LIQUIDITY',
        'OWNERS_MANAGEMENT',
        'PRODUCT_DEMAND',
        'PROFITABILITY',
        'SHORT_TERM_LIQUIDITY',
        'TURNOVER',
        'DEFAULT_FLAG',
        'INDUSTRY',
    ]

    # Variable filtering; 'INDUSTRY' is passed as var_kp.
    train = sc.var_filter(train, 'DEFAULT_FLAG', var_kp='INDUSTRY')
    # NOTE(review): the return value is discarded, so encode_categorical
    # presumably encodes the frame in place - confirm against its definition.
    self.encode_categorical(train)

    woe_bins = sc.woebin(train, 'DEFAULT_FLAG')
    train_woe = sc.woebin_ply(train, woe_bins)

    test_subset = test[selected_columns]
    self.encode_categorical(test_subset)

    return train_woe, sc.woebin_ply(test_subset, woe_bins)
# Coarse screening: drop every feature whose most frequent non-null value
# accounts for more than mode_ratio_threshold of its non-null observations.
mode_ratio_threshold = 0.95  # cut-off for the share of the most frequent value
raw_feature_num = len(X.columns)
# One row per feature; 1 marks a feature to delete, 0 a feature to keep.
if_delete_feature = np.zeros([raw_feature_num, 1])
for i in range(0, raw_feature_num):
    # value_counts() excludes NaN and sorts by descending frequency, so its
    # first entry is the count of the mode and its sum is the number of
    # non-null observations.
    value_counts = X.iloc[:, i].value_counts()
    if value_counts.empty:
        # A column with no non-null values has no mode; leave it unflagged
        # here so the missing-value filter below decides its fate instead of
        # failing on a zero denominator.
        continue
    if_delete_feature[i] = (
        value_counts.iloc[0] / value_counts.sum() > mode_ratio_threshold
    )
X = X.iloc[:, np.where(if_delete_feature == 0)[0]]


# #### Compare scorecard's built-in filter with the manual screening on the same variables

# In[32]:


# Lixin sample: result of the coarse screening done by scorecard's built-in function
dt_s = sc.var_filter(data1, y="flagy")
print(dt_s.shape)


# In[33]:


# Lixin sample
# 1. Coarse screening: drop the variables whose share of missing values
#    exceeds nan_ratio_threshold.
nan_ratio_threshold = 0.95  # cut-off for the share of missing values
# 0/1 indicator with the same shape as X: 1.0 where the value is missing.
count_null = X.isnull().values.astype(float)
# Share of missing values per variable (explicit column-wise sum).
count_null_sumfactor = count_null.sum(axis=0) / np.shape(X)[0]
# Keep the variables whose missing share is at most nan_ratio_threshold.
X = X.iloc[:, np.where(count_null_sumfactor <= nan_ratio_threshold)[0]]
"""
Created on Tue Aug 4 20:03:21 2020

@author: 87875
"""
# Traditional Credit Scoring Using Logistic Regression
import scorecardpy as sc
import matplotlib.pyplot as plt

# ------ data preparation ------
# Load the germancredit data set shipped with scorecardpy.
dat = sc.germancredit()

# Filter variables via missing rate, iv and identical value rate.
dt_s = sc.var_filter(dat, y="creditability")

# Break the filtered data into train and test parts.
train, test = sc.split_df(dt_s, 'creditability').values()

# ------ woe binning ------
bins = sc.woebin(dt_s, y="creditability")
print(type(bins))

# List the keys of the binning result, one per line.
for variable_name in bins:
    print(variable_name)

# Inspect the binning entry stored under "purpose".
purpose_bins = bins["purpose"]
print(purpose_bins)
print(purpose_bins.columns)
print(type(purpose_bins))

# sc.woebin_plot(bins["purpose"])
# plt.show()
# Explanatory variables: every column of data_dateDiff except the first.
data_X = data_dateDiff.iloc[:, 1:]
# Join the explanatory variables with the response and drop the
# '最近一次贷款时间' column.
data = pd.concat([data_X, data_Y], axis=1).drop(columns='最近一次贷款时间')

# Encode the response variable as integer labels.
labelEncoder = LabelEncoder()
status_codes = labelEncoder.fit_transform(data['status'].values)
data['status'] = status_codes
# Cast every column (including the encoded response) to string.
data = data.astype(str)

# Per the original note, var_filter by default removes variables with
# information value < 0.02, missing rate > 95% or a single-category share > 95%.
dt_s = sc.var_filter(data, y='status')
print('变量预处理前后变化:', data.shape, '->', dt_s.shape)
#print(data.columns)
#print(dt_s.columns)

# Binning and WOE transformation; the bins are fitted on the full filtered
# data set before it is split.
bins = sc.woebin(dt_s, y='status')
# bins

train, test = sc.split_df(dt_s, 'status').values()
print('训练集、测试集划分比例为:', len(train), ':', len(test))

# Apply the same bins to both parts.
train_woe, test_woe = (sc.woebin_ply(part, bins) for part in (train, test))