def run(data):
    """Train a binary model on the labelled rows and label the remaining rows.

    Rows of ``data`` whose ``ret`` column equals -1 form the test set; every
    other row is used for training. The model is built by the project helper
    ``CV`` (defined elsewhere in the project) on a fixed list of feature
    columns, and its output for the test rows is binarised at 0.01.

    Args:
        data: DataFrame containing a ``ret`` column, a ``file_name`` column
            and the feature columns listed in ``feat_arr``.

    Returns:
        DataFrame with one row per test row and two columns: ``id`` (the
        test row's ``file_name``) and ``ret`` (1 where the prediction is
        greater than 0.01, 0 where it is less than or equal to 0.01).
    """
    train = data.loc[data.ret != -1].reset_index(drop=True)
    test = data.loc[data.ret == -1].reset_index(drop=True)
    feat_arr = [
        '162', '110', '86', '168', '8', '84', '113', '96', '60', '108', '194',
        '170', '66', '89', '165', '192', '24', '18', '366', '258', '354',
        '360', '11', '276', '120', '158', '270', '246', '372', '6', '12',
        '164', '342', '81', '57', '254', '252', '63', '176', '374', '77'
    ]
    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 150,
        'reg_alpha': 0.,
        'reg_lambda': 1,
        'n_estimators': 60,
        'objective': 'binary',
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'min_child_weight': 5
    }
    s = CV(_df=train[['ret'] + feat_arr],
           label_name='ret',
           random_state=3,
           is_val=False)
    s.CV(is_print=False, lgb_params=lgb_params, round_cv=3, n_splits=10)
    pred = s.get_result(test[feat_arr])

    result = test[['file_name']].reset_index(drop=True).copy()
    result['ret'] = pred

    # Binarise with a single .loc[rows, column] assignment per class.  The
    # previous chained form (result['ret'].loc[mask] = value) writes into a
    # temporary Series and is a silent no-op under pandas Copy-on-Write.
    # Both masks are computed before any write so that values which satisfy
    # neither comparison (NaN) are left untouched, exactly as before.
    above_cutoff = result['ret'] > 0.01
    at_or_below_cutoff = result['ret'] <= 0.01
    result.loc[above_cutoff, 'ret'] = 1
    result.loc[at_or_below_cutoff, 'ret'] = 0

    result = result.rename(columns={'file_name': 'id'})
    return result
def _band_mask(scores, low=None, high=None):
    """Return a boolean Series selecting ``low < score < high`` (strict bounds).

    A bound given as ``None`` is not applied.
    """
    mask = pd.Series(True, index=scores.index)
    if low is not None:
        mask = mask & (scores > low)
    if high is not None:
        mask = mask & (scores < high)
    return mask


def _score_region(train_frame, region_rows, feature_cols, lgb_params):
    """Fit the project ``CV`` helper on the training rows and score one region.

    Returns a DataFrame with a fresh 0..n-1 index and two columns:
    ``file_name`` (taken from ``region_rows``) and ``pred`` (the output of
    ``get_result`` for those rows).
    """
    model = CV(_df=train_frame[['ret'] + feature_cols],
               label_name='ret',
               random_state=3,
               is_val=False)
    model.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1)
    scored = region_rows[['file_name']].reset_index(drop=True).copy()
    scored['pred'] = model.get_result(region_rows[feature_cols])
    return scored


def _override_labels(result, row_of_id, ret_col, file_names, label):
    """Set ``ret`` to ``label`` for the submission rows matching ``file_names``.

    ``row_of_id`` maps a submission ``id`` to its row position in ``result``.
    A file name that is missing from the mapping raises ``KeyError``.
    """
    rows = [row_of_id[name] for name in file_names]
    if rows:
        # One positional assignment on the frame itself; the former chained
        # form (frame.ret[rows] = label) is a no-op under Copy-on-Write.
        result.iloc[rows, ret_col] = label


def run(data, result_best):
    """Post-process a submission by re-scoring hand-picked regions of the test set.

    The feature columns of ``data`` are renamed to readable names, the rows
    are split into training rows (``ret != -1``) and test rows
    (``ret == -1``), and training rows whose directory label is 14 are
    dropped.  Then, for each of twelve regions of the test set (defined by
    ranges of the x/y vibration means and the maximum hydraulic brake
    pressure), a model is trained via the project ``CV`` helper, the region's
    test rows are scored, and the submission's ``ret`` is overwritten for the
    rows whose score falls in the region's score band.  Regions are applied
    in order, so a later region wins where regions overlap.

    Differences from the previous implementation (final labels are meant to
    be unchanged):
      * labels are written with one positional ``iloc`` assignment instead
        of chained assignment, so the overrides also take effect under
        pandas Copy-on-Write;
      * all paths below ``config.TRAIN_PATH`` are built with
        ``os.path.join``; previously one lookup concatenated the directory
        name without a separator and only worked with a trailing slash;
      * computations whose results were never used were removed, including
        one extra training run whose predictions were discarded;
      * a region containing no test rows is skipped instead of being scored.

    NOTE(review): every region trains on identical inputs with identical
    parameters.  If ``CV`` is deterministic the model could be trained once
    and reused -- confirm against the ``CV`` implementation before changing.

    Args:
        data: DataFrame whose columns are the ``'<n>_new'`` features, in the
            order implied by the generated name list, followed by ``ret`` and
            ``file_name`` (``file_name`` presumably last -- the last column
            is excluded from renaming).
        result_best: current submission with columns ``id`` and ``ret``.  It
            is not modified.

    Returns:
        A copy of ``result_best`` with ``ret`` overridden for selected rows.

    Raises:
        KeyError: if a selected test ``file_name`` is not present in
            ``result_best['id']``, or a training ``file_name`` is not found
            under ``config.TRAIN_PATH``.
        IndexError: if ``data`` has fewer renameable columns than generated
            feature names.
    """
    feat_arr = [
        '185_new', '237_new', '176_new', '243_new', '544_new', '85_new',
        '245_new', '103_new', '249_new', '83_new', '545_new', '555_new',
        '183_new', '187_new', '135_new', '161_new', '89_new', '171_new',
        '242_new', '529_new', '91_new', '146_new', '547_new', '123_new',
        '576_new', '97_new', '447_new', '475_new', '141_new', '143_new',
        '159_new', '452_new', '540_new', '543_new', '239_new', '573_new',
        '145_new', '163_new', '181_new', '355_new'
    ]

    # Name conversion: rebuild the readable feature names from the column
    # header of one raw training file (any file; the first one listed).
    first_dir = os.path.join(config.TRAIN_PATH, os.listdir(config.TRAIN_PATH)[0])
    d = pd.read_csv(os.path.join(first_dir, os.listdir(first_dir)[0]))
    raw_cols = d.columns.tolist()

    name_lst = [col + '_var' for col in raw_cols]
    per_column_stats = ('_mean', '_min', '_max', '_ptp', '_median', '_sum')
    for col in raw_cols + ['_功角', '_视在功率', '_变频器出入口温差', '_变频器出入口压力']:
        name_lst.extend(col + suffix for suffix in per_column_stats)
    column_groups = [
        ['叶片1角度', '叶片2角度', '叶片3角度'],
        ['变桨电机1电流', '变桨电机2电流', '变桨电机3电流'],
        ['x方向振动值', 'y方向振动值'],
        [
            '发电机定子温度1', '发电机定子温度2', '发电机定子温度3', '发电机定子温度4', '发电机定子温度5',
            '发电机定子温度6'
        ],
        ['发电机空气温度1', '发电机空气温度2'],
        ['主轴承温度1', '主轴承温度2'],
        ['变桨电机1功率估算', '变桨电机2功率估算', '变桨电机3功率估算'],
        ['叶片1电池箱温度', '叶片2电池箱温度', '叶片3电池箱温度'],
        ['叶片1变桨电机温度', '叶片2变桨电机温度', '叶片3变桨电机温度'],
        ['叶片1变频器箱温度', '叶片2变频器箱温度', '叶片3变频器箱温度'],
        ['叶片1超级电容电压', '叶片2超级电容电压', '叶片3超级电容电压'],
        ['驱动1晶闸管温度', '驱动2晶闸管温度', '驱动3晶闸管温度'],
        ['驱动1输出扭矩', '驱动2输出扭矩', '驱动3输出扭矩'],
    ]
    for group in column_groups:
        joined = '_'.join(group)
        name_lst.extend(joined + suffix for suffix in ('_mean', '_sum', '_var'))

    # Map the i-th column of `data` (last column excluded) to the i-th
    # generated name.  Indexing col_lst keeps the IndexError when `data` has
    # fewer columns than generated names.
    col_lst = data.columns.tolist()[:-1]
    dict_name = {col_lst[i]: name for i, name in enumerate(name_lst)}

    # Keep the model features plus the brake-pressure maximum, which is
    # needed for region selection, and give them readable names.
    brake_col = str(name_lst.index('液压制动压力_max')) + '_new'
    kept_cols = feat_arr + [brake_col]
    data = data[kept_cols + ['ret', 'file_name']]
    data.columns = [dict_name[col] for col in kept_cols] + ['ret', 'file_name']

    test = data.loc[data.ret == -1].reset_index(drop=True)
    data = data.loc[data.ret != -1].reset_index(drop=True)

    # Each sub-directory of TRAIN_PATH is named after an integer label and
    # contains the files belonging to that label.
    file_name_dict = {}
    for f1 in os.listdir(config.TRAIN_PATH):
        for f2 in os.listdir(os.path.join(config.TRAIN_PATH, f1)):
            file_name_dict[f2] = int(f1)

    data['multi_label'] = data.file_name.apply(lambda x: file_name_dict[x])
    # Training rows with label 14 are excluded from every model.
    data = data.loc[data['multi_label'] != 14].reset_index(drop=True)

    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 8,
        'reg_alpha': 0.,
        'reg_lambda': 1,
        'n_estimators': 50,
        'objective': 'binary',
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'learning_rate': 0.1,
        'min_child_weight': 1
    }
    feat_arr = [dict_name[i] for i in feat_arr]

    x = test['x方向振动值_mean']
    y = test['y方向振动值_mean']
    brake = test['液压制动压力_max']

    # Each stage is (region mask over `test`, [(low, high, label), ...]).
    # Test rows in the region whose score satisfies low < score < high get
    # `label` written into the submission.  All bounds are strict.
    stages = [
        ((brake < 1.32) & (brake > 1) & (x < -1.5),
         [(None, 0.2, 0)]),
        ((x < 3.4) & (x > 1.2) & (y < 3) & (y > 2),
         [(None, 0.35, 0)]),
        # NOTE(review): the original condition also tested `brake > 1` and
        # `x < 22.05`, which are implied by the bounds below; they may have
        # been meant as the other end of a range.
        ((brake > 1.32) & (x < -0.3) & (y < .8) & (y > 0),
         [(None, 0.4, 0)]),
        # This stage zeroes the HIGH-scoring rows (score > 0.18), unlike the
        # other stages; kept as in the original.
        ((x < 0.34) & (x > -0.25) & (y > 0) & (y < 1.8)
         & ((x + 1.2 - y) < 0),
         [(0.18, None, 0)]),
        # Only the narrow band 0.34 < score < 0.35 is set to 1 here.
        ((x < 2) & (x > 1.25) & (y < 3.2) & (y > 2),
         [(0.34, 0.35, 1)]),
        ((y < 1.25) & (y > 0.75) & (x > 0.25) & (x < 0.6)
         & ((x + 0.75 - y) > 0),
         [(None, 0.4, 0)]),
        ((x < 1.7) & (x > 1.36) & (y > 1.4) & (y < 1.8),
         [(None, 0.27, 0)]),
        ((x < 1) & (x > 0.8) & (y > 0.9) & (y < 1.1),
         [(None, 0.39, 0)]),
        ((brake > 1) & (x < 0.4) & (x > -0.3) & (y > -0.3) & (y < 0.14)
         & ((x + 0.2 - y) > 0),
         [(None, 0.3, 0)]),
        ((x < 3) & (x > 1.8) & (y > 1) & (y < 2.1),
         [(None, 0.37, 0)]),
        ((x < -0.8) & (x > -2) & (y > -0.9) & (y < -0.2),
         [(None, 0.25, 0)]),
        ((x < 1) & (x > 0) & (y > 1.9) & (y < 2.5),
         [(None, 0.2, 0), (0.8, None, 1)]),
    ]

    # Work on a copy so the caller's submission is left untouched.
    result = result_best.copy()
    # id -> row position; for duplicated ids the last occurrence wins.
    row_of_id = {
        file_id: pos for pos, file_id in enumerate(result['id'].tolist())
    }
    ret_col = result.columns.get_loc('ret')

    for region_mask, rules in stages:
        region_rows = test.loc[region_mask]
        if region_rows.empty:
            # Nothing to re-score in this region.
            continue
        scored = _score_region(data, region_rows, feat_arr, lgb_params)
        for low, high, label in rules:
            chosen = scored.loc[_band_mask(scored['pred'], low, high),
                                'file_name']
            _override_labels(result, row_of_id, ret_col, chosen, label)

    return result
# Example #3
# 0
def run(data, result_temp):
    """Zero out positives in `result_temp` that a two-stage model is sure are negative.

    Stage 1 fits a LightGBM multiclass model on `multi_label` (the integer
    name of the sub-directory of `config.TRAIN_PATH` a training file lives
    in) and uses it to assign every test row a class and the probability of
    that class.  Stage 2 trains one binary `CV` model per class on the
    labelled rows of that class and scores the test rows assigned to it.
    Test files whose class probability exceeds 0.999 and whose binary score
    is below 0.1 get `ret` forced to 0; every other row keeps the value it
    has in `result_temp`.

    Args:
        data: DataFrame with `file_name`, `ret` and feature columns.  Rows
            with `ret == -1` are treated as the test set, the rest as
            training data.
        result_temp: DataFrame with `id` and `ret` columns.  It is not
            modified.

    Returns:
        DataFrame with columns `id` and `ret` (int), indexed like
        `result_temp`.

    Raises:
        KeyError: if a training `file_name` is not found under
            `config.TRAIN_PATH`.
    """
    test = data.loc[data.ret == -1].reset_index(drop=True)
    data = data.loc[data.ret != -1].reset_index(drop=True)

    # Map each training file to the integer name of the directory holding it.
    file_name_dict = {}
    for f1 in os.listdir(config.TRAIN_PATH):
        # os.path.join works whether or not TRAIN_PATH ends with a separator;
        # plain string concatenation silently built a wrong path without one.
        for f2 in os.listdir(os.path.join(config.TRAIN_PATH, f1)):
            file_name_dict[f2] = int(f1)
    data['multi_label'] = data.file_name.apply(lambda x: file_name_dict[x])
    # Class 14 is excluded from both stages; the reason is not visible here.
    data = data.loc[data['multi_label'] != 14].reset_index(drop=True)

    clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=10,
                             learning_rate=0.1, n_estimators=100,
                             subsample_for_bin=200000, objective='multiclass',
                             min_child_weight=1, min_child_samples=20,
                             subsample=0.7, subsample_freq=0,
                             colsample_bytree=0.7,
                             reg_alpha=0.0, reg_lambda=0.0,
                             random_state=3)

    train_x, val_x, train_y, val_y = train_test_split(
        data.drop(['file_name', 'ret', 'multi_label'], axis=1),
        data['multi_label'],
        random_state=3,
        test_size=0.3)
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # presumably only accepted by older LightGBM releases -- confirm the
    # pinned version before upgrading.
    clf.fit(train_x, train_y, verbose=False, early_stopping_rounds=100,
            eval_metric='logloss', eval_set=[(val_x, val_y)])

    # Assign each test row a class and the probability of the winning class.
    test_features = test[train_x.columns.tolist()]
    test['multi_label'] = clf.predict(test_features)
    test['prob'] = np.max(clf.predict_proba(test_features), axis=1)

    # Train one binary model per class and score the test rows of that class.
    print('training...')
    base_params = {'boosting_type': 'gbdt', 'num_leaves': 8,
                   'reg_alpha': 0., 'reg_lambda': 1,
                   'n_estimators': 30, 'objective': 'binary',
                   'subsample': 0.7, 'colsample_bytree': 0.6,
                   'learning_rate': 0.1, 'min_child_weight': 1}
    scores = {}  # file_name -> (binary score, class probability)
    c = Counter(data.multi_label)
    for class_ in tqdm(list(c.keys())):
        test_temp = test.loc[test.multi_label == class_].reset_index(drop=True)
        if test_temp.empty:
            # No test row was assigned to this class: nothing to score, and
            # predicting on a zero-row frame is not guaranteed to be supported.
            continue
        s = CV(_df=data.loc[data.multi_label == class_]
               .drop(['file_name', 'multi_label'], axis=1)
               .reset_index(drop=True),
               label_name='ret')
        # Each class gets its own copy of the parameters, as before.
        s.CV(is_print=False, lgb_params=dict(base_params), round_cv=3, n_splits=8)
        pred_temp = s.get_result(
            test_temp.drop(['file_name', 'multi_label', 'prob', 'ret'], axis=1))
        for name, score, prob in zip(test_temp['file_name'], pred_temp, test_temp['prob']):
            scores[name] = (score, prob)

    scored = pd.DataFrame({
        'id': list(scores),
        'ret': [score for score, _ in scores.values()],
        'prob': [prob for _, prob in scores.values()],
    })

    # Files that are confidently classified AND confidently scored negative.
    confident_ids = set(
        scored.loc[(scored.prob > 0.999) & (scored.ret < 0.1), 'id'])
    print(len(confident_ids))

    # Multiply the incoming prediction by 0 for confident negatives, 1 otherwise.
    keep = (~result_temp['id'].isin(confident_ids)).astype(int)
    r = result_temp[['id']].copy()
    r['ret'] = (keep * result_temp['ret']).astype(int)
    return r
Exemple #4
0
 def feed(self, arr, d=0.001):
     """Build sparse train/validation/test matrices for `arr` and run one CV round.

     Each column of `self.data` named in `arr` is routed by `config.type_dict`:
     columns mapped to 'onehot' are label-encoded in place and then one-hot
     encoded, columns mapped to 'cv' are cast to str in place and passed
     through a CountVectorizer(min_df=20), and columns missing from
     `config.type_dict` are used as raw values.  Rows whose label is -1 form
     the test matrix; the remaining rows go to the training matrix when
     `val_tags == 0` and to the train-validation matrix when `val_tags == 1`.
     Score columns stored by earlier calls are prepended before training.

     Side effects: overwrites the 'onehot' and 'cv' columns of `self.data`;
     sets `self.train_pred` and `self.c`; appends to `self.train_score_lst`,
     `self.test_score_lst`, `self.train_val_score_lst` and `self.c_lst`.

     Args:
         arr: names of `self.data` columns to use; each one must exist.
         d: not used by this method.

     Returns:
         None.  If a column has an unrecognised entry in `config.type_dict`,
         'name error' is printed and the method returns before `self` is
         modified.
     """
     available = set(self.data.columns.tolist())
     for name in arr:
         assert name in available

     def _row_masks():
         # Evaluated on demand because `self.data` is rewritten between uses.
         has_label = self.data[self.label_name] != -1
         return (np.logical_and(has_label, self.data['val_tags'] == 0),
                 np.logical_and(has_label, self.data['val_tags'] == 1),
                 self.data[self.label_name] == -1)

     # Start from zero-column matrices with one row per sample of each split.
     train_csr, train_val_csr, test_csr = [
         sparse.csr_matrix((int(mask.sum()), 0)) for mask in _row_masks()
     ]

     # Sort the requested columns by how they have to be encoded.
     _onehot_feature, _cv_feature, _row_feature = [], [], []
     for name in arr:
         if name not in config.type_dict:
             _row_feature.append(name)
             continue
         kind = config.type_dict[name]
         if kind == 'cv':
             _cv_feature.append(name)
         elif kind == 'onehot':
             _onehot_feature.append(name)
         else:
             print('name error')
             return

     for name in _onehot_feature:
         self.data[name] = LabelEncoder().fit_transform(self.data[name].astype(str))

     # Slice only after label-encoding so the splits carry the encoded values.
     train_mask, val_mask, test_mask = _row_masks()
     _train = self.data.loc[train_mask]
     _train_val = self.data.loc[val_mask]
     _test = self.data.loc[test_mask]
     splits = (_train, _train_val, _test)

     enc = OneHotEncoder()
     for name in _onehot_feature:
         # Fit on every row so all three splits share the same categories.
         enc.fit(self.data[name].values.reshape(-1, 1))
         train_csr, train_val_csr, test_csr = [
             sparse.hstack((mat, enc.transform(part[name].values.reshape(-1, 1))), 'csr', 'bool')
             for mat, part in zip((train_csr, train_val_csr, test_csr), splits)
         ]

     cv = CountVectorizer(min_df=20)
     for name in _cv_feature:
         self.data[name] = self.data[name].astype(str)
         cv.fit(self.data[name])
         # The split frames were sliced before the cast above, so cast them too.
         train_csr, train_val_csr, test_csr = [
             sparse.hstack((mat, cv.transform(part[name].astype(str))), 'csr', 'bool')
             for mat, part in zip((train_csr, train_val_csr, test_csr), splits)
         ]

     # Raw columns go to the left of the encoded block.
     train_csr, train_val_csr, test_csr = [
         sparse.hstack((sparse.csr_matrix(part[_row_feature]), mat), 'csr').astype('float32')
         for mat, part in zip((train_csr, train_val_csr, test_csr), splits)
     ]

     # Prepend one column per score vector stored by earlier calls.
     for ix in range(len(self.train_score_lst)):
         earlier = (self.train_score_lst[ix],
                    self.train_val_score_lst[ix],
                    self.test_score_lst[ix])
         train_csr, train_val_csr, test_csr = [
             sparse.hstack((sparse.csr_matrix(np.array(scores).reshape(-1, 1)), mat), 'csr').astype('float32')
             for mat, scores in zip((train_csr, train_val_csr, test_csr), earlier)
         ]

     # Hand the assembled training matrix to the existing CV helper.
     lgb_params = {
         'boosting_type': 'gbdt',
         'num_leaves': 200,
         'reg_alpha': 1,
         'reg_lambda': 1,
         'n_estimators': 100000,
         'objective': 'binary',
         'subsample': 0.7,
         'colsample_bytree': 0.6,
         'learning_rate': 0.02,
         'min_child_weight': 1,
     }
     c = CV(_df=train_csr,
            y=_train[self.label_name].values,
            random_state=self.random_state,
            is_val=False)
     c.CV(is_print=True, lgb_params=lgb_params, n_splits=5, round_cv=1)

     # Average the 'pred_train' vectors over every entry of c.MS_arr.
     self.train_pred = 0
     for entry in c.MS_arr:
         self.train_pred += np.array(entry['pred_train'])
     self.train_pred /= len(c.MS_arr)

     # Store this round's scores so the next call can use them as features.
     self.train_score_lst.append(self.train_pred)
     self.test_score_lst.append(c.get_result(test_csr))
     self.train_val_score_lst.append(c.get_result(train_val_csr))
     self.c = c
     self.c_lst.append(c)