Ejemplo n.º 1
0
# 时间:2020/8/13 15:23
# IDE:PyCharm

from Tools.data import get_train_test_data
from Tools.feature_transforme import min_max_scale
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import toad
from Tools.functions import find_best_columns_by_correlation
from sklearn.model_selection import KFold
import pandas as pd
from Tools.feature_selection import research_feature_by_forward_search, research_feature_by_backward_search

if __name__ == '__main__':
    # Load the training and test data sets.
    train_data, test_data = get_train_test_data()

    # Min-max scale both data sets. `phone_no_m` and `label` are passed as
    # `exclude` (presumably so they are left unscaled -- confirm in
    # Tools.feature_transforme).
    non_feature_cols = ["phone_no_m", "label"]
    new_train_data, new_test_data = min_max_scale(
        train_data, test_data, exclude=non_feature_cols)

    # Feature screening with toad on the scaled training data.
    # Thresholds passed below: empty=0.5 (missing rate), iv=0.05 (information
    # value) and corr=0.7 (correlation); `phone_no_m` is excluded from the
    # screening. return_drop=True makes the call also return the dropped
    # columns alongside the selected frame.
    select_kwargs = dict(
        target='label',
        empty=0.5,
        iv=0.05,
        corr=0.7,
        return_drop=True,
        exclude=['phone_no_m'],
    )
    selected_data_tmp, drop_lst = toad.selection.select(
        new_train_data, **select_kwargs)

    # Logistic regression estimator created with scikit-learn's defaults.
    log_reg = LogisticRegression()
Ejemplo n.º 2
0
# 时间:2020/8/17 11:03
# IDE:PyCharm

from Tools.data import get_train_test_data
from Tools.feature_transforme import min_max_scale
import toad
from Tools.functions import find_best_columns_by_correlation
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

if __name__ == '__main__':
    # Load the training, test and final-test data sets.
    train_data, test_data, final_test_data = get_train_test_data()

    # Min-max scale the training and test data. `phone_no_m` and `label` are
    # passed as `exclude` (presumably so they are left unscaled -- confirm in
    # Tools.feature_transforme).
    new_train_data, new_test_data = min_max_scale(
        train_data, test_data, exclude=["phone_no_m", "label"])

    # Feature screening with toad on the scaled training data, using the
    # thresholds empty=0.5 (missing rate), iv=0.05 (information value) and
    # corr=0.7 (correlation). NOTE(review): the original comment quoted a
    # correlation threshold of 0.9, but the call passes corr=0.7.
    # return_drop=True makes the call also return the dropped columns.
    selected_data_tmp, drop_lst = toad.selection.select(new_train_data,
                                                        target='label',
                                                        empty=0.5,
                                                        iv=0.05,
                                                        corr=0.7,
                                                        return_drop=True,
                                                        exclude=['phone_no_m'])

    # Select features whose correlation (presumably with the label) is
    # greater than 0.1 and whose correlation with each other is below 0.5.