def drop_feature(X_train, y_train, X_test='', coverage_threshold=0.1, ks_threshold=0.05):
    """Drop low-quality feature columns from X_train (and X_test, if given).

    A column is removed when any of the following holds:
      * it has fewer than 2 distinct non-null values,
      * its non-null coverage ratio is below ``coverage_threshold``,
      * its KS statistic is below ``ks_threshold`` (KS is only computed
        for columns with more than 2 distinct values).

    Passing falsy values for both thresholds disables filtering entirely.
    Returns the (possibly reduced) ``(X_train, X_test)`` pair.
    """
    if not coverage_threshold and not ks_threshold:
        # Filtering disabled: hand back the inputs untouched.
        return (X_train, X_test)

    total = len(X_train)
    for feature in X_train.columns:
        missing = pd.isnull(X_train[feature])
        values = X_train[feature][~missing]
        labels = y_train[~missing]
        coverage = len(values) / float(total)
        distinct = len(set(values))
        # KS is only meaningful for columns with more than two levels;
        # otherwise use 1 so the ks_threshold check never triggers.
        if distinct > 2:
            ks_stat = max(ks.ks_analysis(values.values, labels.values)['ks_value'])
        else:
            ks_stat = 1
        should_drop = (distinct < 2
                       or coverage < coverage_threshold
                       or ks_stat < ks_threshold)
        if should_drop:
            X_train = X_train.drop([feature], axis=1)
            if len(X_test):
                X_test = X_test.drop([feature], axis=1)
    return (X_train, X_test)
def ks_analysises(self):
    """Run ks.ks_analysis on every feature and return the stacked results.

    For each feature in self.__features the KS table against the 'label'
    column of self.__dfdata is computed and stacked into one DataFrame
    with a (feature, ks, seq) MultiIndex, sorted by KS value descending.
    Features whose analysis fails are skipped (best-effort behaviour kept
    from the original).  Progress is printed every 100 features.

    Fixes vs. original: narrowed the bare ``except:``; the final progress
    line no longer raises NameError when the feature list is empty; the
    per-feature tables are concatenated once instead of re-concatenating
    inside the loop (avoids O(n^2) copying).
    """
    print('start ks_analysis...')
    print('[total|done|todo]')
    features_num = len(self.__features)
    frames = []   # per-feature KS tables, concatenated once after the loop
    done = 0
    for i, col in enumerate(self.__features):
        done = i + 1
        try:
            dfcol = ks.ks_analysis(self.__dfdata[col].values,
                                   self.__dfdata['label'].values)
            ks_value = max(dfcol['ks_value'])
            # MultiIndex rows: (feature name, its max KS, row sequence)
            dfcol.index = [[col] * len(dfcol), [ks_value] * len(dfcol),
                           range(len(dfcol))]
            frames.append(dfcol)
        except Exception:
            # Narrowed from a bare `except:`; analysis stays best-effort.
            pass
        if np.mod(done, 100) == 0:
            print('[{}|{}|{}]'.format(features_num, done, features_num - done))
    print('[{}|{}|{}]'.format(features_num, done, features_num - done))
    dfks = pd.concat(frames) if frames else pd.DataFrame()
    if len(dfks):
        dfks.index.names = ['feature', 'ks', 'seq']
        # Highest KS first, then by feature name and original row order.
        dfks = dfks.sort_index(axis=0, level=[1, 0, 2],
                               ascending=[False, True, True])
    return (dfks)
def test(self, clf, dftest=None):
    """Evaluate a fitted sklearn-style classifier on the test set.

    If ``dftest`` is given (non-empty), it is first preprocessed via
    self.preprocess_data (with stdout suppressed) and the resulting
    splits are cached on the instance; otherwise the existing
    self.X_test / self.y_test are scored.  Prints the KS / AUC report
    and appends it to self.report_info.

    Fixes vs. original: the os.devnull handle is now closed and stdout
    restored even if preprocessing raises (the original leaked the file
    object and could leave stdout redirected on error); the mutable
    default argument ``dftest=pd.DataFrame()`` became None (behaviour is
    unchanged for all callers).
    """
    info = "\nstart test model ... "
    print(info)
    self.report_info = self.report_info + info + '\n'
    # A freshly supplied dftest must be preprocessed before scoring.
    if dftest is not None and len(dftest) > 0:
        print('preprocessing test data...\n')
        # Silence all printing during preprocessing.
        stdout = sys.stdout
        devnull = open(os.devnull, 'w')
        sys.stdout = devnull
        try:
            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)
        finally:
            # Always restore stdout and release the devnull handle.
            sys.stdout = stdout
            devnull.close()
        # Cache the preprocessed train / test splits.
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
    # Probability of the positive (last) class.
    y_test_hat = clf.predict_proba(self.X_test)[:, -1]
    dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
    ks_test = max(dfks_test['ks_value'])
    auc_test = metrics.roc_auc_score(np.ravel(self.y_test), y_test_hat)
    info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
    prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
    info = info + str(prettyks) + '\n'
    print(info)
    self.report_info = self.report_info + info + '\n'
def test(self, bst, dftest=None):
    """Evaluate a trained xgboost booster on the test set.

    If ``dftest`` is given (non-empty), it is first preprocessed (with
    stdout suppressed) and the resulting test split plus its DMatrix are
    cached on the instance; otherwise the existing self.dtest is scored.
    Prints the KS / AUC report and appends it to self.report_info.

    Fixes vs. original: the os.devnull handle is now closed and stdout
    restored even if preprocessing raises; the mutable default argument
    ``dftest=pd.DataFrame()`` became None (behaviour unchanged for all
    callers).
    """
    info = "\nstart test xgboost model ... \n"
    print(info)
    self.report_info = self.report_info + info + '\n'
    # A freshly supplied dftest must be preprocessed before scoring.
    if dftest is not None and len(dftest) > 0:
        print('preprocessing test data...')
        # Silence all printing during preprocessing.
        stdout = sys.stdout
        devnull = open(os.devnull, 'w')
        sys.stdout = devnull
        try:
            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)
        finally:
            # Always restore stdout and release the devnull handle.
            sys.stdout = stdout
            devnull.close()
        # Cache the preprocessed test split and its DMatrix.
        # NOTE(review): without a new dftest, self.dtest is assumed to
        # have been built by earlier preprocessing — confirm against the
        # class initializer.
        self.X_test, self.y_test = X_test, y_test
        self.dtest = xgb.DMatrix(self.X_test, self.y_test['label'])
    y_test_hat = bst.predict(self.dtest)
    dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
    ks_test = max(dfks_test['ks_value'])
    auc_test = auc(np.ravel(self.y_test), y_test_hat)
    info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
    prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
    info = info + str(prettyks) + '\n'
    print(info)
    self.report_info = self.report_info + info + '\n'
def ks_analysis(self, data, label):
    """Thin wrapper around ks.ks_analysis.

    The returned table contains the columns:
        feature_interval   -- value interval (bucket)
        order_num          -- number of orders in the bucket
        order_ratio        -- share of all orders
        overdue_num        -- number of overdue orders
        overdue_ratio      -- share of overdue orders
        normal_num         -- number of normal orders
        normal_ratio       -- share of normal orders
        overdue_cum_ratio  -- cumulative overdue order ratio
        normal_cum_ratio   -- cumulative normal order ratio
        ks_value           -- KS statistic
    """
    return ks.ks_analysis(data, label)
def _fill_column(X_train, X_test, X_train_new, X_test_new, col, fill_value):
    """Fill one column with fill_value (train and, if given, test) and
    append the <col>_isnan indicator columns when train has missing rows."""
    nan_index = pd.isnull(X_train[col])
    X_train_new[col] = X_train[col].fillna(value=fill_value)
    if len(X_test):
        X_test_new[col] = X_test[col].fillna(value=fill_value)
    if any(nan_index):
        X_train_new[col + '_isnan'] = nan_index.astype('int').values
        if len(X_test):
            X_test_new[col + '_isnan'] = pd.isnull(
                X_test[col]).astype('int').values


def _infer_fill_value(x, y_train, nan_index):
    """Infer a fill value for a column with NaNs: find the KS bucket whose
    overdue ratio is closest to the overdue ratio of the missing rows and
    return the mean of the non-null values inside that bucket's interval."""
    x_col = x[~nan_index]
    nan_labels = y_train[nan_index].values
    nan_overdue_ratio = sum(nan_labels) / float(len(nan_labels))
    dfks = ks.ks_analysis(x.values, y_train.values)
    # Bucket with overdue ratio closest to that of the missing rows.
    g = np.abs(dfks['overdue_ratio'].values - nan_overdue_ratio).argmin()
    str_interval = dfks['feature_interval'][g]
    p, q = [
        float(v)
        for v in re.sub(r'\[|\]|\)', '', str_interval).split(',')
    ]
    # ')' marks a half-open interval [p, q); otherwise it is closed [p, q].
    if ')' in str_interval:
        members = [v for v in x_col if v >= p and v < q]
    else:
        members = [v for v in x_col if v >= p and v <= q]
    return np.mean(members)


def fill_nan(X_train, y_train, X_test='', method='infer'):
    """Fill missing values in X_train (and X_test, if given).

    method:
        '0' / '-1'                 -- fill with that constant
        'mean' / 'median' / 'most' -- fill with that statistic of the
                                      column's non-null training values
        'infer'                    -- fill with the mean of the KS bucket
                                      whose overdue ratio best matches the
                                      missing rows (columns without NaNs
                                      are copied through unchanged)

    For every training column that had NaNs, a <col>_isnan indicator
    column is appended.  Test columns that still contain NaNs afterwards
    (complete in train but missing in test) are filled with the median of
    the filled training column.  Returns (X_train_new, X_test_new); with
    an unrecognized method both frames come back empty (original
    behaviour preserved).
    """
    X_train_new, X_test_new = pd.DataFrame(), pd.DataFrame()
    if method in ('0', '-1'):
        fill_value = int(method)
        for col in X_train.columns:
            _fill_column(X_train, X_test, X_train_new, X_test_new,
                         col, fill_value)
    elif method in ('mean', 'median', 'most'):
        # Resolve the statistic lazily so 'mean'/'median' do not depend
        # on the project-level most_value helper.
        func = most_value if method == 'most' else \
            {'mean': np.mean, 'median': np.median}[method]
        for col in X_train.columns:
            fill_value = func(X_train[col].dropna().values)
            _fill_column(X_train, X_test, X_train_new, X_test_new,
                         col, fill_value)
    elif method == 'infer':
        for col in X_train.columns:
            nan_index = pd.isnull(X_train[col])
            if not any(nan_index):
                # Nothing to fill: copy the column through unchanged.
                X_train_new[col] = X_train[col]
                if len(X_test):
                    X_test_new[col] = X_test[col]
                continue
            fill_value = _infer_fill_value(X_train[col], y_train, nan_index)
            _fill_column(X_train, X_test, X_train_new, X_test_new,
                         col, fill_value)
    # Columns complete in train but missing in test: median fallback.
    for col in X_test_new.columns:
        nan_index = pd.isnull(X_test_new[col])
        if not any(nan_index):
            continue
        fill_value = np.median(X_train_new[col].values)
        X_test_new[col] = X_test_new[col].fillna(value=fill_value)
    return (X_train_new, X_test_new)
def train(self, clf, cv=5, model_idx=5):
    """Train a sklearn-style classifier, optionally with cross-validation.

    Parameters
    ----------
    clf : estimator implementing fit / predict_proba.
    cv : number of stratified folds; falsy (0/None) fits once on the
        whole training set without cross-validation.
    model_idx : 1-based fold index whose fitted model is returned when
        cv is used (must satisfy 1 <= model_idx <= cv, else KeyError —
        original behaviour kept).

    Prints per-fold KS / AUC reports, appends them to self.report_info,
    and returns the selected fitted classifier.

    BUGFIX vs. original: ``models[k] = clf`` stored the *same* estimator
    object for every fold (sklearn refits in place), so
    ``models[model_idx]`` always held the last fold's fit.  Each fold's
    model is now snapshotted with copy.deepcopy.
    """
    import copy  # local import: only needed to snapshot per-fold models

    if cv:
        k = 0
        ks_mean_train, auc_mean_train = 0, 0
        ks_mean_validate, auc_mean_validate = 0, 0
        models = {}
        for train_index, validate_index in stratified_kfold(
                self.X_train, np.ravel(self.y_train), nfolds=cv):
            k = k + 1
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n{}: k = {}'.format(nowtime, k)
            print(info)
            self.report_info = self.report_info + info + '\n'
            X_train_k = self.X_train.iloc[train_index, :]
            y_train_k = self.y_train.iloc[train_index, :]
            X_validate_k = self.X_train.iloc[validate_index, :]
            y_validate_k = self.y_train.iloc[validate_index, :]
            clf.fit(X_train_k, np.ravel(y_train_k))
            # Probability of the positive (last) class.
            predict_train_k = clf.predict_proba(X_train_k)[:, -1]
            predict_validate_k = clf.predict_proba(X_validate_k)[:, -1]
            dfks_train = ks.ks_analysis(predict_train_k, y_train_k.values)
            dfks_validate = ks.ks_analysis(predict_validate_k,
                                           y_validate_k.values)
            ks_train = max(dfks_train['ks_value'])
            ks_validate = max(dfks_validate['ks_value'])
            auc_train = metrics.roc_auc_score(np.ravel(y_train_k),
                                              predict_train_k)
            auc_validate = metrics.roc_auc_score(np.ravel(y_validate_k),
                                                 predict_validate_k)
            # Accumulate fold metrics; averaged after the loop.
            ks_mean_train = ks_mean_train + ks_train
            auc_mean_train = auc_mean_train + auc_train
            ks_mean_validate = ks_mean_validate + ks_validate
            auc_mean_validate = auc_mean_validate + auc_validate
            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train, auc_train)
            prettyks = ks.print_ks(predict_train_k, y_train_k.values)
            info = info + '\n' + str(prettyks) + '\n'
            info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                ks_validate, auc_validate) + '\n'
            prettyks = ks.print_ks(predict_validate_k, y_validate_k.values)
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info
            # Snapshot this fold's fitted model (see BUGFIX note above).
            models[k] = copy.deepcopy(clf)
        ks_mean_train = ks_mean_train / float(k)
        auc_mean_train = auc_mean_train / float(k)
        ks_mean_validate = ks_mean_validate / float(k)
        auc_mean_validate = auc_mean_validate / float(k)
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_train, auc_mean_train) + '\n'
        info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_validate, auc_mean_validate) + '\n'
        print(info)
        self.report_info = self.report_info + info
        clf = models[model_idx]
    # cv = 0 or cv = None: no cross-validation needed.
    else:
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        print(info)
        self.report_info = self.report_info + info
        clf.fit(self.X_train, np.ravel(self.y_train))
        predict_train = clf.predict_proba(self.X_train)[:, -1]
        dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
        ks_train = max(dfks_train['ks_value'])
        auc_train = metrics.roc_auc_score(np.ravel(self.y_train),
                                          predict_train)
        info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                      auc_train) + '\n'
        prettyks = ks.print_ks(predict_train, self.y_train.values)
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info
    return (clf)
def train(self, cv=5, model_idx=5, params_dict=params_dict, n_jobs=4, verbose_eval=20):
    """Train an xgboost model, optionally with stratified cross-validation.

    Parameters
    ----------
    cv : number of folds; falsy (0/None) trains once on self.dtrain.
    model_idx : 1-based fold index whose booster is returned when cv is
        used (must satisfy 1 <= model_idx <= cv, else KeyError).
    params_dict : xgboost parameter dict.  NOTE(review): the default is a
        module-level name captured at definition time; it is copied before
        modification, so the shared dict itself is never mutated.
    n_jobs : thread count, written into the copied params as 'nthread'.
    verbose_eval : passed through to train_xgb.

    Side effects: prints progress, appends KS/AUC reports to
    self.report_info, and stores feature importances in self.dfimportance.
    Returns the selected booster.
    """
    info = "start train xgboost model ..."
    print(info)
    self.report_info = self.report_info + info + '\n'
    # Work on a copy so the caller's params_dict is left untouched.
    params_dict_copy = params_dict.copy()
    params_dict_copy.update({'nthread': n_jobs})
    if cv:
        # Running totals for per-fold metrics; averaged after the loop.
        k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0
        models = {}
        for train_index, validate_index in stratified_kfold(
                self.X_train, np.ravel(self.y_train), nfolds=cv):
            k = k + 1
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(
                '\n================================================================================ %s\n'
                % nowtime)
            info = 'k = {}'.format(k)
            print(info)
            self.report_info = self.report_info + info + '\n'
            # Split this fold's train / validate rows.
            X_train_k, y_train_k = self.X_train.iloc[
                train_index, :], self.y_train.iloc[train_index, :]
            X_validate_k, y_validate_k = self.X_train.iloc[
                validate_index, :], self.y_train.iloc[validate_index, :]
            dtrain_k = xgb.DMatrix(X_train_k, y_train_k['label'])
            dvalid_k = xgb.DMatrix(X_validate_k, y_validate_k['label'])
            # train_xgb is a project helper; presumably returns
            # (booster, evals_result) — confirm against its definition.
            bst, _ = train_xgb(params_dict_copy, dtrain_k, dvalid_k, None,
                               verbose_eval)
            predict_train_k = bst.predict(dtrain_k)
            predict_validate_k = bst.predict(dvalid_k)
            dfks_train = ks.ks_analysis(predict_train_k,
                                        dtrain_k.get_label())
            dfks_validate = ks.ks_analysis(predict_validate_k,
                                           dvalid_k.get_label())
            ks_train, ks_validate = max(dfks_train['ks_value']), max(
                dfks_validate['ks_value'])
            auc_train = auc(dtrain_k.get_label(), predict_train_k)
            auc_validate = auc(dvalid_k.get_label(), predict_validate_k)
            # Accumulate metrics for the cross-validation averages.
            ks_mean_train = ks_mean_train + ks_train
            auc_mean_train = auc_mean_train + auc_train
            ks_mean_validate = ks_mean_validate + ks_validate
            auc_mean_validate = auc_mean_validate + auc_validate
            info = '\ntrain: ks = {} \t auc = {} '.format(
                ks_train, auc_train)
            prettyks = ks.print_ks(predict_train_k, dtrain_k.get_label())
            info = info + '\n' + str(prettyks) + '\n'
            info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                ks_validate, auc_validate) + '\n'
            prettyks = ks.print_ks(predict_validate_k,
                                   dvalid_k.get_label())
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info
            # Each fold stores its own booster returned by train_xgb.
            models[k] = bst
        ks_mean_train = ks_mean_train / float(k)
        auc_mean_train = auc_mean_train / float(k)
        ks_mean_validate = ks_mean_validate / float(k)
        auc_mean_validate = auc_mean_validate / float(k)
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_train, auc_mean_train) + '\n'
        info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
            ks_mean_validate, auc_mean_validate) + '\n'
        print(info)
        self.report_info = self.report_info + info
        bst = models[model_idx]
    # Handle cv = 0 or cv = None: no cross-validation logic needed.
    else:
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        info = '\n================================================================================ %s\n' % nowtime
        print(info)
        self.report_info = self.report_info + info
        bst, _ = train_xgb(params_dict_copy, self.dtrain, None, None,
                           verbose_eval)
        predict_train = bst.predict(self.dtrain)
        dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
        ks_train = max(dfks_train['ks_value'])
        auc_train = auc(self.dtrain.get_label(), predict_train)
        info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                      auc_train) + '\n'
        prettyks = ks.print_ks(predict_train, self.y_train.values)
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info
    # Compute feature importance from the selected booster.
    feature_scores = bst.get_score()
    dfimportance = pd.DataFrame({
        'feature': feature_scores.keys(),
        'importance': feature_scores.values()
    })
    try:
        dfimportance = dfimportance.sort_values('importance',
                                                ascending=False)
    except AttributeError as err:
        # Fallback for very old pandas where sort_values does not exist.
        dfimportance = dfimportance.sort('importance', ascending=False)
    dfimportance.index = range(len(dfimportance))
    self.dfimportance = dfimportance
    return (bst)