# Time: 2020/8/13 15:23
# IDE: PyCharm
#
# Script: min-max scale the train/test sets, run toad's feature screening on
# the scaled training set, and create a logistic-regression estimator.
import pandas as pd
import toad
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

from Tools.data import get_train_test_data
from Tools.feature_selection import (
    research_feature_by_backward_search,
    research_feature_by_forward_search,
)
from Tools.feature_transforme import min_max_scale
from Tools.functions import find_best_columns_by_correlation

if __name__ == '__main__':
    # Load the training and test sets. Only the first two returned values are
    # used here; star-unpacking discards any extras, so the script does not
    # crash with "too many values to unpack" if the helper also returns an
    # additional data set.
    train_data, test_data, *_ = get_train_test_data()

    # Min-max normalise both sets, leaving the "phone_no_m" and "label"
    # columns out of the scaling.
    new_train_data, new_test_data = min_max_scale(
        train_data, test_data, exclude=["phone_no_m", "label"])

    # Feature screening on the scaled training set with the thresholds
    # empty=0.5 (missing rate), iv=0.05 (information value) and corr=0.7
    # (correlation; of a correlated pair the higher-valued feature is kept).
    # "phone_no_m" is excluded from screening, and the dropped columns are
    # returned alongside the filtered frame.
    # NOTE(review): the earlier comment quoted a 0.9 correlation cut-off, but
    # the call passes corr=0.7 - confirm which value is intended.
    selected_data_tmp, drop_lst = toad.selection.select(
        new_train_data,
        target='label',
        empty=0.5,
        iv=0.05,
        corr=0.7,
        return_drop=True,
        exclude=['phone_no_m'],
    )

    # Estimator with scikit-learn's default settings.
    log_reg = LogisticRegression()
# Time: 2020/8/17 11:03
# IDE: PyCharm
#
# Script: min-max scale the train/test sets and run toad's feature screening
# on the scaled training set.
import pandas as pd
import toad
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from Tools.data import get_train_test_data
from Tools.feature_transforme import min_max_scale
from Tools.functions import find_best_columns_by_correlation

if __name__ == '__main__':
    # Load the training set, the test set and the final test set.
    train_data, test_data, final_test_data = get_train_test_data()

    # Min-max normalise the training and test sets; the columns listed here
    # are left out of the scaling.
    unscaled_columns = ["phone_no_m", "label"]
    new_train_data, new_test_data = min_max_scale(
        train_data, test_data, exclude=unscaled_columns)

    # Screening thresholds handed to toad: missing rate, information value
    # and correlation (of a correlated pair the higher-valued feature is
    # kept).
    # NOTE(review): the earlier comment quoted a 0.9 correlation cut-off, but
    # the value actually passed is 0.7 - confirm which one is intended.
    missing_rate_limit = 0.5
    iv_limit = 0.05
    corr_limit = 0.7
    unscreened_columns = ['phone_no_m']

    # Returns the filtered frame together with the dropped columns.
    selected_data_tmp, drop_lst = toad.selection.select(
        new_train_data,
        target='label',
        empty=missing_rate_limit,
        iv=iv_limit,
        corr=corr_limit,
        return_drop=True,
        exclude=unscreened_columns,
    )

    # Next step: keep features whose correlation exceeds 0.1 (presumably with
    # the label - confirm) while the correlation between features stays
    # below 0.5.