def Select_Feature(feature_name, threshold, train_source, data):
    """Return the features whose KS statistic on `data` meets `threshold`.

    Parameters
    ----------
    feature_name : iterable of str
        Candidate feature column names.
    threshold : float
        Minimum KS value (inclusive) for a feature to be kept.
    train_source : unused
        Kept only for backward interface compatibility with existing callers.
    data : pandas.DataFrame
        Must contain a 'label' column plus every column in `feature_name`.

    Returns
    -------
    list of str
        Features with ks(data['label'], data[feature])['ks'] >= threshold,
        in the original iteration order.
    """
    # Idiomatic replacement for the original append loop; `ks` is the
    # project-local KS helper used throughout this module.
    return [feature for feature in feature_name
            if ks(data['label'], data[feature])['ks'] >= threshold]
def get_feature_nan_and_ks(feature_X, Y):
    """Compute, per column of `feature_X`, its missing-value ratio and its
    KS statistic against the target `Y`.

    Returns
    -------
    (dict, dict)
        nan_dict : column name -> missing ratio, rounded to 5 decimals.
        ks_dict  : column name -> KS value (rows with NaN dropped first),
                   rounded to 5 decimals.
    """
    n_rows = len(feature_X)
    nan_dict = {}
    ks_dict = {}
    for col in feature_X.columns:
        # Share of missing values in this column.
        nan_dict[col] = round(feature_X[col].isnull().sum() * 1.0 / n_rows, 5)
        # Pair the column with the target, drop incomplete rows, then
        # score KS on the remaining (value, label) pairs.
        pairs = pd.DataFrame(zip(feature_X[col], Y)).dropna().values
        ks_dict[col] = round(ks(pairs[:, 1], pairs[:, 0])['ks'], 5)
    return nan_dict, ks_dict
def _feature_stats(frame, feature_use, tag):
    """Per-feature NaN ratio, zero ratio and KS for one data slice.

    Also plots the KS curve for each feature via `plot_ks`, labelled `tag`
    (same side effect as the original inline loops).
    """
    n = len(frame)
    nan_list = []
    zero_list = []
    ks_list = []
    for f in feature_use:
        nan_list.append(sum(frame[f].isnull()) * 1.0 / n)
        zero_list.append(sum(frame[f] == 0) * 1.0 / n)
        ks_dict = ks(frame['label'], frame[f])
        ks_list.append(ks_dict['ks'])
        plot_ks(ks_dict, f, tag)
    return nan_list, zero_list, ks_list


def feature_summary(data, feature_use, split_source=False):
    """Summarize missing ratio, zero ratio and KS for each feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'label', every column in `feature_use`, and (when
        `split_source` is True) a 'source' column.
    feature_use : list of str
        Feature columns to summarize.
    split_source : bool, optional
        When True, additionally compute the same statistics per value of
        `data.source`, one output column per source.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        NaN-ratio, zero-ratio and KS tables; one row per feature, column
        'all' plus one column per source when `split_source` is set.
    """
    feature_use_nan = pd.DataFrame(feature_use)
    feature_zero = pd.DataFrame(feature_use)
    feature_ks = pd.DataFrame(feature_use)

    # Whole-dataset statistics (the original duplicated this loop verbatim
    # for the per-source case; both now share _feature_stats).
    nan_l, zero_l, ks_l = _feature_stats(data, feature_use, 'all')
    feature_use_nan['all'] = nan_l
    feature_zero['all'] = zero_l
    feature_ks['all'] = ks_l

    if split_source:
        for s in data.source.unique():
            data_s = data[data.source == s]
            nan_l, zero_l, ks_l = _feature_stats(data_s, feature_use, s)
            feature_use_nan[s] = nan_l
            feature_zero[s] = zero_l
            feature_ks[s] = ks_l
    return feature_use_nan, feature_zero, feature_ks
def evaluate_cv(X_train, y_train, model, pars, fold_num=5, to_balance=False, num_round=100):
    """Stratified K-fold cross-validation for one model family.

    Parameters
    ----------
    X_train, y_train : array-like or pandas objects
        Features and binary labels; pandas inputs are converted via `.values`.
    model : str
        One of 'gbdt', 'xgb', 'lgb', 'gbdt_lr' — selects the project fit /
        predict helper pair.
    pars : dict
        Hyper-parameters forwarded to the fit helper.
    fold_num : int, optional
        Number of stratified folds (fixed random_state=310 for reproducibility).
    to_balance : bool, optional
        When True, rebalance each training fold with `balance_data`.
    num_round : int, optional
        Boosting rounds for the 'xgb' / 'lgb' helpers.

    Returns
    -------
    str
        Summary line with mean/std of KS and AUC across folds.

    Raises
    ------
    ValueError
        If `model` is not one of the four supported names (the original
        silently fell through and died with a NameError).
    """
    try:
        X_train = X_train.values
        y_train = y_train.values
    except AttributeError:
        # Already plain numpy arrays — nothing to convert.
        pass
    skf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=310)
    ks_value_list = []
    auc_value_list = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        train_x = X_train[train_index]
        train_y = y_train[train_index]
        if to_balance:
            train_x, train_y = balance_data(train_x, train_y)
        test_x = X_train[test_index]
        test_y = y_train[test_index]
        if model == 'gbdt':
            gbdt = GBDT_Fit(train_x, train_y, pars)
            test_y_predict = GBDT_Predict(gbdt, test_x)
        elif model == 'xgb':
            # The held-out fold doubles as the early-stopping validation set.
            xgb_model = XGBoost_Fit(train_x, train_y, pars, num_round=num_round,
                                    X_val=test_x, y_val=test_y)
            test_y_predict = XGBoost_Predict(xgb_model, test_x)
        elif model == 'lgb':
            lgb_model = LightGBM_Fit(train_x, train_y, pars, num_round=num_round,
                                     X_val=test_x, y_val=test_y)
            test_y_predict = LightGBM_Predict(lgb_model, test_x)
        elif model == 'gbdt_lr':
            gbdt_lr, model_onehot, gbdt = GBDTLR_Fit(train_x, train_y, pars)
            test_y_predict = GBDTLR_Predict(gbdt_lr, model_onehot, gbdt, test_x)
        else:
            raise ValueError('unknown model: %r' % (model,))
        ks_value = ks(test_y, test_y_predict)['ks']
        auc_value = roc_auc_score(test_y, test_y_predict)
        ks_value_list.append(ks_value)
        auc_value_list.append(auc_value)
        # Parenthesized print works identically under Python 2 and 3 for a
        # single argument; message text kept byte-for-byte (incl. 'flods').
        print('now fold %d , all %d flods , ks : %.3f , auc : %.3f' % (i + 1, fold_num, ks_value, auc_value))
    ks_mean = np.mean(ks_value_list)
    ks_std = np.std(ks_value_list)
    auc_mean = np.mean(auc_value_list)
    auc_std = np.std(auc_value_list)
    cv_result = 'cv | ks mean : %.3f , ks std : %.3f , auc mean : %.4f , auc std : %.4f' % (ks_mean, ks_std, auc_mean, auc_std)
    return cv_result
def fill_nan(X_train, y_train, way):
    """Fill missing values of every column of X_train, in place.

    way : 'dis'  -> pick the midpoint of the KS bin whose overdue ratio is
                    closest to the overdue ratio observed among the rows
                    where this feature is missing;
          'avg'  -> column mean;
          'mid'  -> column median;
          int/float -> that constant;
          anything else -> prints an error and returns (None, None).

    Returns the mutated X_train and a dict {column -> fill value used}
    (value is None for columns that had no missing entries).
    NOTE(review): X_train is modified in place as well as returned.
    """
    fill_nan_val = []
    for feature in X_train.columns:
        index_null = pd.isnull(X_train[feature])
        if index_null.sum() > 0:
            if way == 'dis':
                index_null = pd.isnull(X_train[feature])
                label_miss = y_train[index_null == True]
                # Overdue rate (in percent) among rows missing this feature;
                # the tiny additive term guards against an empty slice.
                # NOTE(review): `10e-8` is 1e-7 — possibly intended as 1e-8.
                miss_overdue_ratio = sum(label_miss) * 100 / (
                    float(len(label_miss)) + 10e-8)
                # KS on the non-missing rows, binned into 20 spans.
                ks_info = ks(y_train[index_null == False],
                             X_train[feature][index_null == False], 20)
                # Bin whose overdue ratio best matches the missing-row ratio
                # (assumes ks_info['overdue_ratio'] is also percent-scaled —
                # TODO confirm against the ks helper).
                delta_list = abs(ks_info['overdue_ratio'] - miss_overdue_ratio)
                span = ks_info['span_list'][delta_list.argmin()]
                # Spans are strings like "(a, b]" or "[a, b]"; parse both forms.
                try:
                    val1 = float(span.strip().split(',')[0].split('(')[1])
                except:
                    val1 = float(span.strip().split(',')[0].split('[')[1])
                val2 = float(span.strip().split(',')[1].split(']')[0])
                # Fill with the bin midpoint.
                val = (val1 + val2) / 2.0
            elif way == 'avg':
                val = X_train[feature].mean()
            elif way == 'mid':
                val = X_train[feature].median()
            elif isinstance(way, int) or isinstance(way, float):
                val = way
            else:
                print 'error input , try again'
                return None, None
        else:
            # No missing values in this column; record None so the returned
            # dict still covers every column.
            val = None
        X_train[feature] = X_train[feature].fillna(value=val)
        fill_nan_val.append(val)
    fill_nan_dict = dict(zip(X_train.columns, fill_nan_val))
    return X_train, fill_nan_dict
def evaluate_stack_cv(X_train, y_train, gbdt_pars, xgb_pars, lgb_pars, stacking_model, to_balance=False, fold_num=5, stack_fold=2):
    """Stratified K-fold cross-validation for the stacked ensemble.

    Each outer fold fits the full stack (GBDT + XGBoost + LightGBM base
    learners with `stack_fold` inner folds, blended by `stacking_model`)
    via `StackModel_Fit`, then scores KS and AUC on the held-out fold.

    Parameters mirror `evaluate_cv`; `stacking_model` is rebound to the
    fitted blender on each fold, as in the original.

    Returns
    -------
    str
        Summary line with mean/std of KS and AUC across folds.
    """
    try:
        X_train = X_train.values
        y_train = y_train.values
    except AttributeError:
        # Already plain numpy arrays — nothing to convert.
        pass
    skf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=310)
    ks_value_list = []
    auc_value_list = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        train_x = X_train[train_index]
        train_y = y_train[train_index]
        if to_balance:
            train_x, train_y = balance_data(train_x, train_y)
        test_x = X_train[test_index]
        test_y = y_train[test_index]
        gbdt_model_list, xgb_model_list, lgb_model_list, stacking_model = StackModel_Fit(
            train_x, train_y, gbdt_pars, xgb_pars, lgb_pars, stacking_model,
            stack_fold=stack_fold)
        test_y_predict = StackModel_Predict(gbdt_model_list, xgb_model_list,
                                            lgb_model_list, stacking_model, test_x)
        ks_value = ks(test_y, test_y_predict)['ks']
        auc_value = roc_auc_score(test_y, test_y_predict)
        ks_value_list.append(ks_value)
        auc_value_list.append(auc_value)
        # Parenthesized print works identically under Python 2 and 3 for a
        # single argument; message text kept byte-for-byte (incl. 'flods').
        print('now fold %d , all %d flods , ks : %.3f , auc : %.3f' % (i + 1, fold_num, ks_value, auc_value))
    ks_mean = np.mean(ks_value_list)
    ks_std = np.std(ks_value_list)
    auc_mean = np.mean(auc_value_list)
    auc_std = np.std(auc_value_list)
    cv_result = 'cv | ks mean : %.3f , ks std : %.3f , auc mean : %.4f , auc std : %.4f' % (ks_mean, ks_std, auc_mean, auc_std)
    return cv_result
cv_result = evaluate_cv(X_train, y_train, model='xgb', pars=xgb_best_pars, fold_num=5, to_balance=False, num_round=60) print cv_result xgb_model = XGBoost_Fit(X_train, y_train, xgb_best_pars, num_round=60) ks_all = {} for test_source in source_list: print test_source X_test, y_test = get_X_y(data, feature_use, test_source) pred = XGBoost_Predict(xgb_model, X_test) ks_result = ks(y_test, pred) ks_all[test_source] = ks_result for source, ks_value in ks_all.items(): print source print ks_value['ks'] gc.collect() # GBDT print '-------------------------GBDT Result-------------------------' gbdt_best_pars = { 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8,
feature_use_nan[s] = feature_use_nan_list feature_zero[s] = feature_zero_list feature_ks[s] = feature_ks_list return feature_use_nan, feature_zero, feature_ks feature_use_nan, feature_zero, feature_ks = feature_summary(data, feature_use, split_source=True) feature_use_nan.to_csv('../Data/feature_use_nan.csv') feature_zero.to_csv('../Data/feature_zero.csv') feature_ks.to_csv('../Data/feature_ks.csv') feature_ks = open('../Data/Feature_KS_Detail.txt', 'w') for feature in feature_use: ks_dict = ks(data['label'], data[feature]) line = print_ks(ks_dict, feature) feature_ks.write(line) feature_ks.write( '---------------------------------------------------------------------\n' ) feature_ks.close() ''' import seaborn as sns import matplotlib.pyplot as plt for i,s in enumerate(source_list): sns.kdeplot(pred_list[i],label=s) plt.legend() '''
str(test_data_describe_select[test_data_describe_select.source == s].t_max.values[0])[:10] ) log_text = log_text + '----- source : %s cnt : %d overdue_rate : %.3f start_day : %s end_day : %s -----'%(s, test_data_describe_select[test_data_describe_select.source == s].sample_cnt.values[0], test_data_describe_select[test_data_describe_select.source == s].overdue_rate.values[0], str(test_data_describe_select[test_data_describe_select.source == s].t_min.values[0])[:10], str(test_data_describe_select[test_data_describe_select.source == s].t_max.values[0])[:10] ) + '\n' data_temp = data_test_select[(data_test_select.source == s)] X_test,y_test = get_X_y(data_temp) #self_cv = evaluate_cv(X_test,y_test,model='xgb',pars=xgb_pars,fold_num = 5,num_round=60) #print 'self cv :' #log_text = log_text + 'self cv : \n' #print self_cv #log_text = log_text + str(self_cv) + '\n' pred_xgb = XGBoost_Predict(xgb_model,X_test) pred_list.append(pred_xgb) ks_result = ks(y_test,pred_xgb) auc_value = roc_auc_score(y_test, pred_xgb) print 'cross sample :' print 'ks : %.3f auc : %.3f '%(ks_result['ks'],auc_value) log_text = log_text + 'cross sample : \n' log_text = log_text + 'ks : %.3f auc : %.3f '%(ks_result['ks'],auc_value) + '\n' print print_ks(ks_result) np.save(pred_path,pred_list) f = open(log_path,'w') f.write(log_text) f.close()