def get_mul_features(top_features):
    """Search for useful multiplicative feature crosses and cache them.

    For every feature ``x`` in ``top_features`` each remaining column
    (everything except ``id`` and the top features themselves) is replaced
    by ``x * column`` and renamed ``'<x>_mul_<column>'``.  An XGBoost model
    is trained on those crosses and the 15 highest-scoring feature names
    are stored under key ``x``.  A final pass does the same for all
    pairwise products of the top features, stored under
    ``'top_mul_internal'``.

    Results are persisted to ``./results/mul_features_top.kv`` after every
    step, and keys already present in that file are skipped, so an
    interrupted run can resume.

    Side effect: a ``target`` column, if present, is dropped from
    ``c_data_sample.train_selected``.

    NOTE(review): a second ``get_mul_features`` is defined later in this
    file (it samples the data and calls ``get_trained_xgb`` with two
    arguments).  If both live in the same module, the later definition
    replaces this one at import time -- confirm which is intended.

    Args:
        top_features: names of the base features to cross with the rest.
    """
    print("===============正在进行特征乘法组合尝试========")
    cache_path = './results/mul_features_top.kv'
    # Resume from a previous run when a cache file exists.
    if os.path.exists(cache_path):
        mul_features_top = joblib.load(cache_path)
    else:
        mul_features_top = {}

    for x in top_features:
        if x not in mul_features_top:
            # Drop a left-over label column from the shared frame before the
            # label merge below.  Checked explicitly instead of swallowing
            # every exception with a bare ``except``.
            if 'target' in c_data_sample.train_selected.columns:
                c_data_sample.train_selected = (
                    c_data_sample.train_selected.drop(['target'], axis=1))
            else:
                print("drop train_selected.target")
            x_train = c_data_sample.train_selected.copy()
            scale_pos_weight = 4
            print(x_train.head(1), b_feature_engineering.labels.head(1))
            ytrain = pd.merge(x_train,
                              b_feature_engineering.labels, on='id').target

            # Overwrite every non-top column with its product with ``x`` and
            # collect the new names so the frame is renamed only once.
            renamed = {}
            for col in x_train.columns:
                if col not in top_features and col != 'id':
                    x_train[col] = x_train[x] * x_train[col]
                    renamed[col] = x + '_mul_' + col
            x_train.rename(columns=renamed, inplace=True)
            x_train = x_train.drop(top_features, axis=1)

            clf = get_trained_xgb(x_train, ytrain.values, scale_pos_weight)
            weight_kv = clf.get_booster().get_score()
            # get_score() returns a feature -> score dict; rank by score so
            # the slice really is the 15 most important crosses.
            mul_features_top[x] = sorted(
                weight_kv, key=weight_kv.get, reverse=True)[:15]
            del x_train
            gc.collect()
            joblib.dump(mul_features_top, cache_path)
        print(mul_features_top[x])

    # Pairwise products among the top features themselves.
    if 'top_mul_internal' not in mul_features_top:
        print("mul 内部组合")

        # Only read from here on, so no copy of the frame is needed.
        source = c_data_sample.train_selected
        scale_pos_weight = 4
        df = pd.DataFrame()
        df['id'] = source.id
        ytrain = pd.merge(df,
                          b_feature_engineering.labels, on='id').target
        for (x, y) in combinations(top_features, 2):
            df[x + '_mul_' + y] = source[x] * source[y]
        print(df.shape)
        clf = get_trained_xgb(df, ytrain.values, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        mul_features_top['top_mul_internal'] = sorted(
            weight_kv, key=weight_kv.get, reverse=True)[:15]
        gc.collect()
        joblib.dump(mul_features_top, cache_path)
def get_add_features(top_features):
    """Search for useful additive feature crosses and cache them.

    For every feature ``x`` in ``top_features`` each remaining column
    (everything except ``id`` and the top features themselves) is replaced
    by ``x + column`` and renamed ``'<x>_add_<column>'``.  An XGBoost model
    is trained on those crosses and the 15 highest-scoring feature names
    are stored under key ``x``.  A final pass does the same for all
    pairwise sums of the top features, stored under ``'top_add_internal'``.

    Results are persisted to ``./results/add_features_top.kv`` after every
    step, and keys already present in that file are skipped, so an
    interrupted run can resume.

    Args:
        top_features: names of the base features to cross with the rest.
    """
    print("===============正在进行特征加法组合尝试========")
    cache_path = './results/add_features_top.kv'
    # Resume from a previous run when a cache file exists.
    if os.path.exists(cache_path):
        add_features_top = joblib.load(cache_path)
    else:
        add_features_top = {}

    for x in top_features:
        if x not in add_features_top:
            x_train = c_data_sample.train_selected.copy()
            scale_pos_weight = 4
            ytrain = pd.merge(x_train,
                              b_feature_engineering.labels, on='id').target

            # Overwrite every non-top column with its sum with ``x`` and
            # collect the new names so the frame is renamed only once.
            renamed = {}
            for col in x_train.columns:
                if col not in top_features and col != 'id':
                    x_train[col] = x_train[x] + x_train[col]
                    renamed[col] = x + '_add_' + col
            x_train.rename(columns=renamed, inplace=True)
            x_train = x_train.drop(top_features, axis=1)

            clf = get_trained_xgb(x_train, ytrain.values, scale_pos_weight)
            weight_kv = clf.get_booster().get_score()
            # get_score() returns a feature -> score dict; rank by score so
            # the slice really is the 15 most important crosses.
            add_features_top[x] = sorted(
                weight_kv, key=weight_kv.get, reverse=True)[:15]
            del x_train
            gc.collect()
            joblib.dump(add_features_top, cache_path)
        print(add_features_top[x])

    # Pairwise sums among the top features themselves.
    if 'top_add_internal' not in add_features_top:
        print("add 内部组合")
        # Only read from here on, so no copy of the frame is needed.
        source = c_data_sample.train_selected
        scale_pos_weight = 4
        df = pd.DataFrame()
        df['id'] = source.id
        ytrain = pd.merge(df,
                          b_feature_engineering.labels, on='id').target
        for (x, y) in combinations(top_features, 2):
            df[x + '_add_' + y] = source[x] + source[y]
        print(df.shape)
        clf = get_trained_xgb(df, ytrain.values, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        add_features_top['top_add_internal'] = sorted(
            weight_kv, key=weight_kv.get, reverse=True)[:15]
        gc.collect()
        joblib.dump(add_features_top, cache_path)
def apply_combine_features():
    """Add the cached feature crosses to train/test, then train and predict.

    Loads ``./results/add_features_top.kv`` and
    ``./results/mul_features_top.kv``.  Every cached name is split on
    ``'_add_'`` / ``'_mul_'`` into its two operand columns, and the
    sum / product is written as a new column on both
    ``c_data_sample.train_selected`` and ``b_feature_engineering.test``.

    If ``./results/clf_1.model`` does not exist yet, an XGBoost model is
    trained on the extended training frame, its 30 highest-scoring features
    are printed, the model is dumped to that path and the test predictions
    are written to ``./results/submission_2.csv``.  Otherwise only a notice
    is printed.
    """
    print("正在应用组合特征")
    add_features_top = joblib.load('./results/add_features_top.kv')
    mul_features_top = joblib.load('./results/mul_features_top.kv')

    train = c_data_sample.train_selected
    test = b_feature_engineering.test

    # Rebuild each additive cross from its two operand columns.
    for names in add_features_top.values():
        for col in names:
            cols = str(col).split('_add_')
            train[col] = train[cols[0]] + train[cols[1]]
            test[col] = test[cols[0]] + test[cols[1]]

    # Rebuild each multiplicative cross from its two operand columns.
    for names in mul_features_top.values():
        for col in names:
            cols = str(col).split('_mul_')
            train[col] = train[cols[0]] * train[cols[1]]
            test[col] = test[cols[0]] * test[cols[1]]
    print("组合特征应用完毕")
    print("特征排序:")

    if os.path.exists('./results/clf_1.model'):
        print("检测到已完成排序")
        return

    print("全部训练")
    # The original made an unused full copy of the training frame here;
    # it only cost memory, so it is gone.
    scale_pos_weight = 4
    ytrain = pd.merge(c_data_sample.train_selected,
                      b_feature_engineering.labels, on='id').target
    clf = get_trained_xgb(c_data_sample.train_selected,
                          ytrain.values, scale_pos_weight)
    weight_kv = clf.get_booster().get_score()
    # Rank by score so the printout is an actual importance ranking rather
    # than the dict's insertion order.
    print(sorted(weight_kv, key=weight_kv.get, reverse=True)[:30])

    joblib.dump(clf, './results/clf_1.model')
    print("特征排序结束")

    # Probability of the positive class for every test row.
    y_predprob = clf.predict_proba(
        b_feature_engineering.test.drop(['id'], axis=1))[:, 1]
    submission = pd.DataFrame()
    submission['id'] = b_feature_engineering.test.id
    submission['target'] = y_predprob
    submission.to_csv('./results/submission_2.csv', index=False, header=True)
def get_mul_features(top_features):
    """Search for useful multiplicative crosses on freshly sampled data.

    For every feature ``x`` in ``top_features`` a new sample is drawn with
    ``c_data_sample.get_sampled_data``; each remaining column (everything
    except ``id`` and the top features) is replaced by ``x * column`` and
    renamed ``'<x>_mul_<column>'``.  An XGBoost model is trained on those
    crosses and the 15 highest-scoring feature names are stored under key
    ``x``.  A final pass does the same for all pairwise products of the top
    features, stored under ``'top_mul_internal'``.

    Results are persisted to ``./results/mul_features_top.kv`` after every
    step, and keys already present in that file are skipped.

    NOTE(review): this file also contains an earlier ``get_mul_features``
    (full data, three-argument ``get_trained_xgb`` call).  Within one
    module this later definition is the one callers get -- confirm which
    version, and which ``get_trained_xgb`` signature, is current.

    Args:
        top_features: names of the base features to cross with the rest.
    """
    cache_path = './results/mul_features_top.kv'
    # Resume from a previous run when a cache file exists.
    if os.path.exists(cache_path):
        mul_features_top = joblib.load(cache_path)
    else:
        mul_features_top = {}

    for x in top_features:
        if x not in mul_features_top:
            x_train, scale_pos_weight = c_data_sample.get_sampled_data(
                c_data_sample.train_selected, 0.6, 0.9)

            # Overwrite every non-top column with its product with ``x`` and
            # collect the new names so the frame is renamed only once.
            renamed = {}
            for col in x_train.columns:
                if col not in top_features and col != 'id':
                    x_train[col] = x_train[x] * x_train[col]
                    renamed[col] = x + '_mul_' + col
            x_train.rename(columns=renamed, inplace=True)
            x_train = x_train.drop(top_features, axis=1)

            clf = get_trained_xgb(x_train, scale_pos_weight)
            weight_kv = clf.get_booster().get_score()
            # get_score() returns a feature -> score dict; rank by score so
            # the slice really is the 15 most important crosses.
            mul_features_top[x] = sorted(
                weight_kv, key=weight_kv.get, reverse=True)[:15]
            del x_train
            gc.collect()
            joblib.dump(mul_features_top, cache_path)
        print(mul_features_top[x])

    # Pairwise products among the top features themselves.
    if 'top_mul_internal' not in mul_features_top:
        print("mul 内部组合")
        x_train, scale_pos_weight = c_data_sample.get_sampled_data(
            c_data_sample.train_selected, 0.6, 0.9)
        df = pd.DataFrame()
        df['id'] = x_train.id
        for (x, y) in combinations(top_features, 2):
            df[x + '_mul_' + y] = x_train[x] * x_train[y]
        print(df.shape)
        clf = get_trained_xgb(df, scale_pos_weight)
        weight_kv = clf.get_booster().get_score()
        mul_features_top['top_mul_internal'] = sorted(
            weight_kv, key=weight_kv.get, reverse=True)[:15]
        del x_train
        gc.collect()
        joblib.dump(mul_features_top, cache_path)
def main():
    """Run the pipeline: preprocess, build features, base model, crosses.

    Steps, in order: preprocessing, feature construction, one-hot encoding
    and ``eliminate_different_data``; then the base XGBoost model is loaded
    from ``./results/clf_0.model`` or trained on a sample and saved there;
    finally the 7 highest-scoring base features drive the multiplicative
    and additive cross search, and the selected crosses are applied.

    NOTE(review): ``get_trained_xgb`` is called here with two arguments,
    while other functions in this file pass three (frame, labels, weight)
    -- confirm which signature is current.
    """
    gc.collect()
    a_preprocessing.get_preprocessed_data()
    b_feature_engineering.make_features()
    b_feature_engineering.one_hot_data()
    eliminate_different_data()

    if os.path.exists('./results/clf_0.model'):
        clf_0 = joblib.load('./results/clf_0.model')
        print("检测到初始模型已生成")
    else:
        # Sample positives/negatives to adjust the class imbalance.
        x_train, scale_pos_weight = c_data_sample.get_sampled_data(
            c_data_sample.train_selected, 0.95, 0.95)
        # First model, trained on the base features only.
        clf_0 = get_trained_xgb(x_train, scale_pos_weight)
        joblib.dump(clf_0, './results/clf_0.model')
        get_model_metrics(clf_0, x_train)

        del x_train
        gc.collect()

    # Feature importance of the base model.  get_score() returns a
    # feature -> score dict, so rank by score before slicing; taking the
    # first keys would not give the most important features.
    weight_kv_0 = clf_0.get_booster().get_score()
    ranked = sorted(weight_kv_0, key=weight_kv_0.get, reverse=True)
    top_7 = ranked[:7]
    print(ranked[:20])

    # Cross the top features with the other columns (product and sum),
    # then apply the selected crosses.
    get_mul_features(top_7)
    get_add_features(top_7)
    apply_combine_features()
def start(using_raw_features=False, nums=1200, scale_pos_weight_ctb=50, scale_pos_weight_xgb=8, neighbor_nums=50000):
    """Run the pipeline with synthetic samples and CatBoost + XGBoost models.

    Steps, in order: preprocessing, feature construction, one-hot encoding,
    ``make_new_sample`` and ``eliminate_different_data``.  If
    ``./results/clf_0.model`` exists it is loaded and used to write
    ``./results/submission_xgb_1.csv``.  Otherwise a CatBoost model is
    trained (predictions to ``./results/submission_ctb_1.csv``), then an
    XGBoost model is trained, written to ``./results/submission_xgb_1.csv``
    and saved as ``./results/clf_0.model``.  Finally the 7 highest-scoring
    XGBoost features drive the multiplicative and additive cross search and
    the selected crosses are applied.

    Args:
        using_raw_features: forwarded to
            ``b_feature_engineering.make_features``.
        nums: forwarded to ``make_new_sample``.
        scale_pos_weight_ctb: weight passed to ``get_trained_ctb``.
        scale_pos_weight_xgb: weight passed to ``get_trained_xgb``.
        neighbor_nums: forwarded to ``eliminate_different_data``.
    """
    gc.collect()
    a_preprocessing.get_preprocessed_data()  # preprocessing
    b_feature_engineering.make_features(
        using_raw_features=using_raw_features)  # build features

    b_feature_engineering.one_hot_data()

    make_new_sample(b_feature_engineering.train,
                    b_feature_engineering.labels.target,
                    nums=nums)  # generate new sample data

    eliminate_different_data(neighbor_nums=neighbor_nums)

    if os.path.exists('./results/clf_0.model'):
        clf_0 = joblib.load('./results/clf_0.model')
        print("检测到初始模型已生成")
        output(clf_0, b_feature_engineering.test,
               './results/submission_xgb_1.csv')
    else:
        print("使用原始特征:{0},使用模型:{1},是否独热编码:{2},是否采用近邻点:{3},是否采用人工生成样本:{4}".format(
            using_raw_features, 'ctb', 'yes', 'yes', 'yes'))

        ytrain = pd.merge(c_data_sample.train_selected,
                          b_feature_engineering.labels, on='id').target

        print(c_data_sample.train_selected.shape, len(ytrain))
        # CatBoost model and its submission.
        ctb = get_trained_ctb(c_data_sample.train_selected,
                              ytrain.values, scale_pos_weight_ctb)
        output(ctb, b_feature_engineering.test,
               './results/submission_ctb_1.csv')

        # Drop a label column from the shared frame if one is present at
        # this point.  Checked explicitly instead of swallowing every
        # exception with a bare ``except``.
        if 'target' in c_data_sample.train_selected.columns:
            c_data_sample.train_selected = (
                c_data_sample.train_selected.drop(['target'], axis=1))
        else:
            print("drop train_selected.target")
        ytrain = pd.merge(c_data_sample.train_selected,
                          b_feature_engineering.labels, on='id').target

        # First XGBoost model, trained on the base features.
        clf_0 = get_trained_xgb(
            c_data_sample.train_selected, ytrain.values, scale_pos_weight_xgb)

        output(clf_0, b_feature_engineering.test,
               './results/submission_xgb_1.csv')
        joblib.dump(clf_0, './results/clf_0.model')

    # Feature importance of the base model.  get_score() returns a
    # feature -> score dict, so rank by score before slicing; taking the
    # first keys would not give the most important features.
    weight_kv_0 = clf_0.get_booster().get_score()
    ranked = sorted(weight_kv_0, key=weight_kv_0.get, reverse=True)
    top_7 = ranked[:7]
    print(ranked[:20])

    # Cross the top features with the other columns (product and sum),
    # then apply the selected crosses.
    get_mul_features(top_7)
    get_add_features(top_7)
    apply_combine_features()