def get_mul_features(top_features):
    """Rank multiplicative feature crosses and cache the best ones on disk.

    For every name in ``top_features`` the full training frame
    (``c_data_sample.train_selected``) is copied, every column that is
    neither ``'id'`` nor a top feature is multiplied by that top feature and
    renamed ``<top>_mul_<col>``, a model is fitted with ``get_trained_xgb``
    and the 15 highest-scoring crossed columns are stored under the top
    feature's name.  A final pass does the same for the pairwise products of
    the top features themselves, stored under ``'top_mul_internal'``.

    Results are persisted to ``./results/mul_features_top.kv`` after every
    fit, and keys already present in that file are skipped, so an
    interrupted run resumes where it stopped.

    Args:
        top_features: Column names of ``c_data_sample.train_selected`` to
            cross with the remaining columns.

    Returns:
        None.  The result is the dictionary written to disk.

    NOTE(review): a second ``get_mul_features`` is defined later in this
    file and rebinds the name; confirm which definition is meant to be live.
    """
    print("===============正在进行特征乘法组合尝试========")
    cache_path = './results/mul_features_top.kv'
    if os.path.exists(cache_path):
        mul_features_top = joblib.load(cache_path)
    else:
        mul_features_top = {}

    for top_col in top_features:
        if top_col in mul_features_top:
            # Already computed in an earlier run.
            continue

        # The label column must not be present before merging with the
        # labels frame below; dropping a missing column raises KeyError.
        try:
            c_data_sample.train_selected = c_data_sample.train_selected.drop(
                ['target'], axis=1)
        except KeyError:
            print("drop train_selected.target")

        x_train = c_data_sample.train_selected.copy()
        # Passed straight through to get_trained_xgb (presumably the
        # positive-class weight -- confirm against that helper).
        scale_pos_weight = 4
        print(x_train.head(1), b_feature_engineering.labels.head(1))
        ytrain = pd.merge(x_train, b_feature_engineering.labels,
                          on='id').target

        # Cross every non-top column with the current top feature, then
        # rename all crossed columns in a single pass.
        renamed = {}
        for col in x_train.columns:
            if col not in top_features and col != 'id':
                x_train[col] = x_train[top_col] * x_train[col]
                renamed[col] = top_col + '_mul_' + col
        x_train.rename(columns=renamed, inplace=True)
        x_train = x_train.drop(top_features, axis=1)

        clf = get_trained_xgb(x_train, ytrain.values, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        # get_score() returns an unordered name -> score mapping, so rank
        # explicitly before keeping the 15 best.
        ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
        mul_features_top[top_col] = ranked[:15]

        del x_train
        gc.collect()
        joblib.dump(mul_features_top, cache_path)
        print(mul_features_top[top_col])

    # Crosses among the top features themselves.
    if 'top_mul_internal' not in mul_features_top:
        print("mul 内部组合")
        x_train = c_data_sample.train_selected.copy()
        scale_pos_weight = 4
        df = pd.DataFrame()
        df['id'] = x_train.id
        ytrain = pd.merge(df, b_feature_engineering.labels, on='id').target
        for left, right in combinations(top_features, 2):
            df[left + '_mul_' + right] = x_train[left] * x_train[right]
        print(df.shape)

        clf = get_trained_xgb(df, ytrain.values, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
        mul_features_top['top_mul_internal'] = ranked[:15]

        del x_train
        gc.collect()
        joblib.dump(mul_features_top, cache_path)
def get_add_features(top_features):
    """Rank additive feature crosses and cache the best ones on disk.

    For every name in ``top_features`` the full training frame
    (``c_data_sample.train_selected``) is copied, every column that is
    neither ``'id'`` nor a top feature has that top feature added to it and
    is renamed ``<top>_add_<col>``, a model is fitted with
    ``get_trained_xgb`` and the 15 highest-scoring crossed columns are
    stored under the top feature's name.  A final pass does the same for the
    pairwise sums of the top features themselves, stored under
    ``'top_add_internal'``.

    Results are persisted to ``./results/add_features_top.kv`` after every
    fit, and keys already present in that file are skipped, so an
    interrupted run resumes where it stopped.

    Args:
        top_features: Column names of ``c_data_sample.train_selected`` to
            cross with the remaining columns.

    Returns:
        None.  The result is the dictionary written to disk.
    """
    print("===============正在进行特征加法组合尝试========")
    cache_path = './results/add_features_top.kv'
    if os.path.exists(cache_path):
        add_features_top = joblib.load(cache_path)
    else:
        add_features_top = {}

    for top_col in top_features:
        if top_col in add_features_top:
            # Already computed in an earlier run.
            continue

        x_train = c_data_sample.train_selected.copy()
        # Passed straight through to get_trained_xgb (presumably the
        # positive-class weight -- confirm against that helper).
        scale_pos_weight = 4
        ytrain = pd.merge(x_train, b_feature_engineering.labels,
                          on='id').target

        # Cross every non-top column with the current top feature, then
        # rename all crossed columns in a single pass.
        renamed = {}
        for col in x_train.columns:
            if col not in top_features and col != 'id':
                x_train[col] = x_train[top_col] + x_train[col]
                renamed[col] = top_col + '_add_' + col
        x_train.rename(columns=renamed, inplace=True)
        x_train = x_train.drop(top_features, axis=1)

        clf = get_trained_xgb(x_train, ytrain.values, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        # get_score() returns an unordered name -> score mapping, so rank
        # explicitly before keeping the 15 best.
        ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
        add_features_top[top_col] = ranked[:15]

        del x_train
        gc.collect()
        joblib.dump(add_features_top, cache_path)
        print(add_features_top[top_col])

    # Crosses among the top features themselves.
    if 'top_add_internal' not in add_features_top:
        print("add 内部组合")
        x_train = c_data_sample.train_selected.copy()
        scale_pos_weight = 4
        df = pd.DataFrame()
        df['id'] = x_train.id
        ytrain = pd.merge(df, b_feature_engineering.labels, on='id').target
        for left, right in combinations(top_features, 2):
            df[left + '_add_' + right] = x_train[left] + x_train[right]
        print(df.shape)

        clf = get_trained_xgb(df, ytrain.values, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
        add_features_top['top_add_internal'] = ranked[:15]

        del x_train
        gc.collect()
        joblib.dump(add_features_top, cache_path)
def apply_combine_features():
    """Materialise the cached feature crosses, then train and predict.

    Loads the dictionaries written by ``get_add_features`` and
    ``get_mul_features``, rebuilds every listed ``a_add_b`` / ``a_mul_b``
    column on both ``c_data_sample.train_selected`` and
    ``b_feature_engineering.test`` (mutating both frames in place), and --
    unless ``./results/clf_1.model`` already exists -- fits a model on the
    enlarged training frame, saves it, and writes test-set probabilities to
    ``./results/submission_2.csv``.

    Returns:
        None.
    """
    print("正在应用组合特征")
    add_features_top = joblib.load('./results/add_features_top.kv')
    mul_features_top = joblib.load('./results/mul_features_top.kv')

    # Aliases of the shared frames; column assignment below mutates them.
    train = c_data_sample.train_selected
    test = b_feature_engineering.test

    # The two operand names are recovered from the crossed column's name.
    for names in add_features_top.values():
        for col in names:
            cols = str(col).split('_add_')
            train[col] = train[cols[0]] + train[cols[1]]
            test[col] = test[cols[0]] + test[cols[1]]
    for names in mul_features_top.values():
        for col in names:
            cols = str(col).split('_mul_')
            train[col] = train[cols[0]] * train[cols[1]]
            test[col] = test[cols[0]] * test[cols[1]]
    print("组合特征应用完毕")

    print("特征排序:")
    if os.path.exists('./results/clf_1.model'):
        print("检测到已完成排序")
        return

    print("全部训练")
    # Passed straight through to get_trained_xgb (presumably the
    # positive-class weight -- confirm against that helper).
    scale_pos_weight = 4
    ytrain = pd.merge(train, b_feature_engineering.labels, on='id').target
    clf = get_trained_xgb(train, ytrain.values, scale_pos_weight)

    weight_kv = clf.get_booster().get_score()
    # get_score() is an unordered name -> score mapping; rank it so the
    # printed list really is the 30 highest-scoring features.
    ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
    print(ranked[:30])
    joblib.dump(clf, './results/clf_1.model')
    print("特征排序结束")

    y_predprob = clf.predict_proba(test.drop(['id'], axis=1))[:, 1]
    df = pd.DataFrame()
    df['id'] = test.id
    df['target'] = y_predprob
    df.to_csv('./results/submission_2.csv', index=False, header=True)
def get_mul_features(top_features):
    """Rank multiplicative feature crosses on sampled data and cache them.

    For every name in ``top_features`` a fresh sample is drawn with
    ``c_data_sample.get_sampled_data(train_selected, 0.6, 0.9)``, every
    column that is neither ``'id'`` nor a top feature is multiplied by that
    top feature and renamed ``<top>_mul_<col>``, a model is fitted with
    ``get_trained_xgb`` and the 15 highest-scoring crossed columns are
    stored under the top feature's name.  A final pass does the same for the
    pairwise products of the top features themselves, stored under
    ``'top_mul_internal'``.

    Results are persisted to ``./results/mul_features_top.kv`` after every
    fit, and keys already present in that file are skipped, so an
    interrupted run resumes where it stopped.

    Args:
        top_features: Column names of the sampled training frame to cross
            with the remaining columns.

    Returns:
        None.  The result is the dictionary written to disk.

    NOTE(review): a function with the same name is defined earlier in this
    file; this later definition replaces it when the module is loaded.  It
    calls ``get_trained_xgb`` with two arguments whereas the earlier
    definition passes three -- confirm which variant matches the helper.
    """
    cache_path = './results/mul_features_top.kv'
    if os.path.exists(cache_path):
        mul_features_top = joblib.load(cache_path)
    else:
        mul_features_top = {}

    for top_col in top_features:
        if top_col in mul_features_top:
            # Already computed in an earlier run.
            continue

        # A new sample is drawn for every top feature.
        x_train, scale_pos_weight = c_data_sample.get_sampled_data(
            c_data_sample.train_selected, 0.6, 0.9)

        # Cross every non-top column with the current top feature, then
        # rename all crossed columns in a single pass.
        renamed = {}
        for col in x_train.columns:
            if col not in top_features and col != 'id':
                x_train[col] = x_train[top_col] * x_train[col]
                renamed[col] = top_col + '_mul_' + col
        x_train.rename(columns=renamed, inplace=True)
        x_train = x_train.drop(top_features, axis=1)

        clf = get_trained_xgb(x_train, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        # get_score() returns an unordered name -> score mapping, so rank
        # explicitly before keeping the 15 best.
        ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
        mul_features_top[top_col] = ranked[:15]

        del x_train
        gc.collect()
        joblib.dump(mul_features_top, cache_path)
        print(mul_features_top[top_col])

    # Crosses among the top features themselves.
    if 'top_mul_internal' not in mul_features_top:
        print("mul 内部组合")
        x_train, scale_pos_weight = c_data_sample.get_sampled_data(
            c_data_sample.train_selected, 0.6, 0.9)
        df = pd.DataFrame()
        df['id'] = x_train.id
        for left, right in combinations(top_features, 2):
            df[left + '_mul_' + right] = x_train[left] * x_train[right]
        print(df.shape)

        clf = get_trained_xgb(df, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        ranked = sorted(weight_kv, key=weight_kv.get, reverse=True)
        mul_features_top['top_mul_internal'] = ranked[:15]

        del x_train
        gc.collect()
        joblib.dump(mul_features_top, cache_path)
def main():
    """Run the pipeline: preprocess, baseline model, feature crosses, final fit.

    Steps, in order: preprocessing, feature construction and one-hot
    encoding, ``eliminate_different_data()``, loading or training the
    baseline model ``./results/clf_0.model``, ranking its features, and
    handing the seven highest-scoring ones to ``get_mul_features``,
    ``get_add_features`` and finally ``apply_combine_features``.

    Returns:
        None.
    """
    gc.collect()
    a_preprocessing.get_preprocessed_data()
    b_feature_engineering.make_features()
    b_feature_engineering.one_hot_data()
    eliminate_different_data()

    model_path = './results/clf_0.model'
    if os.path.exists(model_path):
        clf_0 = joblib.load(model_path)
        print("检测到初始模型已生成")
    else:
        # Sample positive/negative examples to adjust for class imbalance.
        x_train, scale_pos_weight = c_data_sample.get_sampled_data(
            c_data_sample.train_selected, 0.95, 0.95)
        # First baseline model, trained on the basic features only.
        clf_0 = get_trained_xgb(x_train, scale_pos_weight)
        joblib.dump(clf_0, model_path)
        get_model_metrics(clf_0, x_train)
        del x_train
        gc.collect()

    # Rank the baseline model's features.  get_score() is an unordered
    # name -> score mapping, so sort by score before slicing.
    weight_kv_0 = clf_0.get_booster().get_score()
    ranked = sorted(weight_kv_0, key=weight_kv_0.get, reverse=True)
    top_7 = ranked[:7]
    print(ranked[:20])

    # Build crosses between the top features and the other columns.
    get_mul_features(top_7)
    get_add_features(top_7)
    apply_combine_features()
def start(using_raw_features=False, nums=1200, scale_pos_weight_ctb=50,
          scale_pos_weight_xgb=8, neighbor_nums=50000):
    """Run the pipeline with synthetic samples and a CatBoost baseline.

    Steps, in order: preprocessing, feature construction and one-hot
    encoding, ``make_new_sample`` on the training data,
    ``eliminate_different_data``, then either loading
    ``./results/clf_0.model`` or training a CatBoost model and an XGBoost
    model (writing a submission file for each and saving the XGBoost one).
    The seven highest-scoring features of the XGBoost model are then handed
    to ``get_mul_features``, ``get_add_features`` and
    ``apply_combine_features``.

    Args:
        using_raw_features: Forwarded to
            ``b_feature_engineering.make_features``.
        nums: Forwarded to ``make_new_sample``.
        scale_pos_weight_ctb: Forwarded to ``get_trained_ctb``.
        scale_pos_weight_xgb: Forwarded to ``get_trained_xgb``.
        neighbor_nums: Forwarded to ``eliminate_different_data``.

    Returns:
        None.
    """
    gc.collect()
    a_preprocessing.get_preprocessed_data()  # preprocessing
    b_feature_engineering.make_features(
        using_raw_features=using_raw_features)  # build features
    b_feature_engineering.one_hot_data()
    # Generate new sample data.
    make_new_sample(b_feature_engineering.train,
                    b_feature_engineering.labels.target, nums=nums)
    eliminate_different_data(neighbor_nums=neighbor_nums)

    model_path = './results/clf_0.model'
    if os.path.exists(model_path):
        clf_0 = joblib.load(model_path)
        print("检测到初始模型已生成")
        output(clf_0, b_feature_engineering.test,
               './results/submission_xgb_1.csv')
    else:
        print("使用原始特征:{0},使用模型:{1},是否独热编码:{2},是否采用近邻点:{3},是否采用人工生成样本:{4}".format(
            using_raw_features, 'ctb', 'yes', 'yes', 'yes'))
        ytrain = pd.merge(c_data_sample.train_selected,
                          b_feature_engineering.labels, on='id').target
        print(c_data_sample.train_selected.shape, len(ytrain))

        # CatBoost result.
        ctb = get_trained_ctb(c_data_sample.train_selected, ytrain.values,
                              scale_pos_weight_ctb)
        output(ctb, b_feature_engineering.test,
               './results/submission_ctb_1.csv')

        # Make sure no label column is left on the training frame before it
        # is merged with the labels again; dropping a missing column raises
        # KeyError, which is the only failure tolerated here.
        try:
            c_data_sample.train_selected = c_data_sample.train_selected.drop(
                ['target'], axis=1)
        except KeyError:
            print("drop train_selected.target")

        # First baseline XGBoost model on the basic features.
        ytrain = pd.merge(c_data_sample.train_selected,
                          b_feature_engineering.labels, on='id').target
        clf_0 = get_trained_xgb(c_data_sample.train_selected, ytrain.values,
                                scale_pos_weight_xgb)
        output(clf_0, b_feature_engineering.test,
               './results/submission_xgb_1.csv')
        joblib.dump(clf_0, model_path)

    # Rank the baseline model's features.  get_score() is an unordered
    # name -> score mapping, so sort by score before slicing.
    weight_kv_0 = clf_0.get_booster().get_score()
    ranked = sorted(weight_kv_0, key=weight_kv_0.get, reverse=True)
    top_7 = ranked[:7]
    print(ranked[:20])

    # Build crosses between the top features and the other columns.
    get_mul_features(top_7)
    get_add_features(top_7)
    apply_combine_features()