def test_stepwise_zero():
    """Check `stepwise` output columns when one feature is all zeros.

    Builds a 500-row frame with a constant-zero column ``X``, a uniform
    random column ``Z`` and a random binary target ``Y``, then expects the
    result of `stepwise` to contain exactly the columns ``Z`` and ``Y``.

    NOTE(review): the data comes from the global NumPy RNG; whether this is
    deterministic depends on seeding done elsewhere -- confirm.
    """
    n_rows = 500
    # Keep this column order: it fixes the order of the RNG draws.
    data = pd.DataFrame({
        'X': np.zeros(n_rows),
        'Z': np.random.rand(n_rows),
        'Y': np.random.randint(2, size=n_rows),
    })

    selected = stepwise(data, target='Y')

    assert selected.columns.tolist() == ['Z', 'Y']
def test_stepwise_return_drop():
    """Check the drop list returned by `stepwise` with ``return_drop=True``.

    The call returns a pair; only the second element (the dropped names) is
    inspected and must equal ``['B', 'A', 'D']``.
    """
    filled = frame.fillna(-1)

    # The selected frame itself is not checked here.
    _, dropped = stepwise(filled, target='target', return_drop=True)

    assert dropped == ['B', 'A', 'D']
def test_stepwise_exclude():
    """Check `stepwise` output columns when ``exclude='B'`` is passed.

    The expected result contains ``B`` together with ``C``, ``E``, ``F`` and
    the target column.
    """
    filled = frame.fillna(-1)

    selected = stepwise(filled, target='target', exclude='B')

    expected_columns = ['B', 'C', 'E', 'F', 'target']
    assert selected.columns.tolist() == expected_columns
def test_stepwise_forward():
    """Check `stepwise` output columns with ``direction='forward'``.

    The expected result contains ``C``, ``E``, ``F`` and the target column.
    """
    filled = frame.fillna(-1)

    selected = stepwise(filled, target='target', direction='forward')

    expected_columns = ['C', 'E', 'F', 'target']
    assert selected.columns.tolist() == expected_columns
def test_stepwise_forward_when_best_is_first():
    """Check forward `stepwise` on a frame whose columns are reordered.

    The input columns are rearranged to ``E, F, B, A, D, C, target`` before
    selection; the expected output columns are ``E``, ``F``, ``C`` and the
    target, in that order.
    """
    column_order = ['E', 'F', 'B', 'A', 'D', 'C', 'target']
    reordered = frame[column_order]

    selected = stepwise(
        reordered.fillna(-1),
        target='target',
        direction='forward',
    )

    assert selected.columns.tolist() == ['E', 'F', 'C', 'target']
def test_stepwise_ks():
    """Check forward `stepwise` output columns with ``criterion='ks'``.

    The expected result contains ``A``, ``C`` and the target column.
    """
    filled = frame.fillna(-1)

    selected = stepwise(
        filled,
        target='target',
        criterion='ks',
        direction='forward',
    )

    assert selected.columns.tolist() == ['A', 'C', 'target']
def test_stepwise_lr():
    """Check forward `stepwise` output columns with ``estimator='lr'``.

    The expected result contains only ``C`` and the target column.
    """
    filled = frame.fillna(-1)

    selected = stepwise(
        filled,
        target='target',
        estimator='lr',
        direction='forward',
    )

    assert selected.columns.tolist() == ['C', 'target']
# Load the training and test CSVs.
train_w = pd.read_csv('train_w.csv')
test_w = pd.read_csv('test_w.csv')

# First screening pass on the training set with `select` (iv=0.005,
# corr=0.8); the returned drop lists are kept for the report below.
train_s2, drops = select(
    train_w,
    target='loan_status',
    iv=0.005,
    corr=0.8,
    return_drop=True,
)
# Keep only the columns of the test set that remain in the training set.
test_s2 = test_w[train_s2.columns]
print(
    'IV筛选不通过的特征为:\n', drops['iv'], '\n',
    'corr筛选不通过的特征为:\n', drops['corr'],
)
print('处理完成,剩余{}特征'.format(train_s2.shape[1]))

# Second pass: stepwise selection on the screened training set.
# NOTE(review): the progress message says "Logistic" while estimator='ols'
# is passed -- confirm which one is intended.
print('Logistic逐步回归筛选中')
train_step = stepwise(
    train_s2,
    target='loan_status',
    estimator='ols',
    direction='both',
    criterion='aic',
)
# Align the test set with the columns that remain after stepwise selection.
test_step = test_s2[train_step.columns]
print('处理完成,剩余{}特征'.format(train_step.shape[1]), '\n' * 2)

# Disabled alternative: export train and test as one combined file.
# data_step = pd.concat([train_step, test_step], join='inner')
# data_step.to_csv('data_step.csv', index=False)
train_step.to_csv('train_step.csv', index=False)
test_step.to_csv('test_step.csv', index=False)

# ----------------------------------------------------------------------------
# Model training
print('模型训练'.center(60, '—'))
# Reload the frames from the files written above.
train_step = pd.read_csv('train_step.csv')
test_step = pd.read_csv('test_step.csv')
# Print the number of rows for each value of the target column.
print(train_step['loan_status'].groupby(train_step['loan_status']).count())