def run(data):
    """Train a LightGBM CV ensemble on the labelled rows and label the test rows.

    Rows of ``data`` whose ``ret`` equals -1 are treated as the test set; all
    other rows are used for training.  The project ``CV`` helper is fitted on a
    fixed list of feature columns and its predictions for the test rows are
    binarised at 0.01.

    Args:
        data: DataFrame containing the feature columns listed in ``feat_arr``
            plus ``ret`` and ``file_name``.

    Returns:
        DataFrame with columns ``id`` (the test ``file_name``) and ``ret``
        (1 where the prediction is > 0.01, 0 where it is <= 0.01).
    """
    train = data.loc[data.ret != -1].reset_index(drop=True)
    test = data.loc[data.ret == -1].reset_index(drop=True)

    feat_arr = [
        '162', '110', '86', '168', '8', '84', '113', '96', '60', '108',
        '194', '170', '66', '89', '165', '192', '24', '18', '366', '258',
        '354', '360', '11', '276', '120', '158', '270', '246', '372', '6',
        '12', '164', '342', '81', '57', '254', '252', '63', '176', '374',
        '77',
    ]
    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 150,
        'reg_alpha': 0.,
        'reg_lambda': 1,
        'n_estimators': 60,
        'objective': 'binary',
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'min_child_weight': 5,
    }

    s = CV(_df=train[['ret'] + feat_arr], label_name='ret',
           random_state=3, is_val=False)
    s.CV(is_print=False, lgb_params=lgb_params, round_cv=3, n_splits=10)
    pred = s.get_result(test[feat_arr])

    result = test[['file_name']].reset_index(drop=True).copy()
    result['ret'] = pred
    # Assign through a single .loc call on the frame.  The previous
    # ``result['ret'].loc[mask] = ...`` form is chained assignment: it writes
    # to a temporary Series and is a silent no-op under pandas Copy-on-Write.
    result.loc[result['ret'] > 0.01, 'ret'] = 1
    result.loc[result['ret'] <= 0.01, 'ret'] = 0
    result = result.rename(columns={'file_name': 'id'})
    return result
def _between(series, low, high):
    """Return the element-wise mask ``low < series < high`` (both bounds strict)."""
    return (series > low) & (series < high)


def _score_region(train, region, feat_arr, lgb_params):
    """Fit the 5-fold ``CV`` ensemble on ``train`` and score the rows of ``region``.

    Returns a frame with a fresh 0..n-1 index holding ``file_name`` and ``pred``.
    """
    s = CV(_df=train[['ret'] + feat_arr], label_name='ret',
           random_state=3, is_val=False)
    s.CV(is_print=False, lgb_params=lgb_params, n_splits=5, round_cv=1)
    pred = s.get_result(region[feat_arr])
    scored = region[['file_name']].reset_index(drop=True).copy()
    scored['pred'] = pred
    return scored


def _override_labels(result, scored, low, high, value):
    """Set ``ret`` to ``value`` in ``result`` for scored files with ``low < pred < high``.

    ``low`` or ``high`` may be ``None`` for an open bound.  ``result`` is
    modified in place.  A selected file name that is missing from
    ``result['id']`` raises ``KeyError``.
    """
    pred = scored['pred']
    if low is None:
        chosen = pred < high
    elif high is None:
        chosen = pred > low
    else:
        chosen = (pred > low) & (pred < high)

    # id -> row position; for duplicated ids the last occurrence wins.
    row_of = {name: pos for pos, name in enumerate(result['id'])}
    rows = [row_of[name] for name in scored.loc[chosen, 'file_name']]
    # One positional write on the frame itself (no chained assignment, so it
    # also takes effect under pandas Copy-on-Write).
    result.iloc[rows, result.columns.get_loc('ret')] = value


def run(data, result_best):
    """Refine an existing submission by re-scoring hand-picked regions of the test set.

    The anonymous ``*_new`` feature columns are first translated to readable
    names derived from the header of one raw training CSV.  Labelled rows
    (``ret != -1``) whose class folder is not 14 form the training set.  For
    each hard-coded region of the test set (defined on the x/y vibration means
    and the maximum hydraulic brake pressure) a ``CV`` ensemble is fitted and
    the ``ret`` of the matching ids in the submission is overwritten according
    to that region's prediction threshold.

    Args:
        data: DataFrame whose last column is excluded from the name mapping and
            which contains the ``*_new`` feature columns, ``ret`` and
            ``file_name``.
        result_best: DataFrame with columns ``id`` and ``ret``; it is not
            modified.

    Returns:
        A copy of ``result_best`` with the overrides applied.

    Raises:
        KeyError: if a selected test ``file_name`` is absent from
            ``result_best['id']``.
    """
    feat_arr = [
        '185_new', '237_new', '176_new', '243_new', '544_new', '85_new',
        '245_new', '103_new', '249_new', '83_new', '545_new', '555_new',
        '183_new', '187_new', '135_new', '161_new', '89_new', '171_new',
        '242_new', '529_new', '91_new', '146_new', '547_new', '123_new',
        '576_new', '97_new', '447_new', '475_new', '141_new', '143_new',
        '159_new', '452_new', '540_new', '543_new', '239_new', '573_new',
        '145_new', '163_new', '181_new', '355_new',
    ]

    # Column-name translation: rebuild the readable feature names, in order,
    # from the header of the first CSV in the first training folder.
    first_dir = os.path.join(config.TRAIN_PATH, os.listdir(config.TRAIN_PATH)[0])
    sample_file = os.path.join(first_dir, os.listdir(first_dir)[0])
    raw_cols = pd.read_csv(sample_file, nrows=0).columns.tolist()  # header only

    name_lst = [col + '_var' for col in raw_cols]
    for col in raw_cols + ['_功角', '_视在功率', '_变频器出入口温差', '_变频器出入口压力']:
        for suffix in ('_mean', '_min', '_max', '_ptp', '_median', '_sum'):
            name_lst.append(col + suffix)
    for group in [
            ['叶片1角度', '叶片2角度', '叶片3角度'],
            ['变桨电机1电流', '变桨电机2电流', '变桨电机3电流'],
            ['x方向振动值', 'y方向振动值'],
            ['发电机定子温度1', '发电机定子温度2', '发电机定子温度3',
             '发电机定子温度4', '发电机定子温度5', '发电机定子温度6'],
            ['发电机空气温度1', '发电机空气温度2'],
            ['主轴承温度1', '主轴承温度2'],
            ['变桨电机1功率估算', '变桨电机2功率估算', '变桨电机3功率估算'],
            ['叶片1电池箱温度', '叶片2电池箱温度', '叶片3电池箱温度'],
            ['叶片1变桨电机温度', '叶片2变桨电机温度', '叶片3变桨电机温度'],
            ['叶片1变频器箱温度', '叶片2变频器箱温度', '叶片3变频器箱温度'],
            ['叶片1超级电容电压', '叶片2超级电容电压', '叶片3超级电容电压'],
            ['驱动1晶闸管温度', '驱动2晶闸管温度', '驱动3晶闸管温度'],
            ['驱动1输出扭矩', '驱动2输出扭矩', '驱动3输出扭矩']]:
        joined = '_'.join(group)
        for suffix in ('_mean', '_sum', '_var'):
            name_lst.append(joined + suffix)

    # Positional mapping: i-th input column -> i-th generated name.
    col_lst = data.columns.tolist()[:-1]
    dict_name = {col_lst[i]: name for i, name in enumerate(name_lst)}

    brake_col = str(name_lst.index('液压制动压力_max')) + '_new'
    selected = feat_arr + [brake_col]
    frame = data[selected + ['ret', 'file_name']]
    frame.columns = [dict_name[c] for c in selected] + ['ret', 'file_name']

    test = frame.loc[frame.ret == -1].reset_index(drop=True)
    train = frame.loc[frame.ret != -1].reset_index(drop=True)

    # Training folders are named by integer class id; class 14 is excluded.
    file_name_dict = {}
    for f1 in os.listdir(config.TRAIN_PATH):
        for f2 in os.listdir(os.path.join(config.TRAIN_PATH, f1)):
            file_name_dict[f2] = int(f1)
    train['multi_label'] = train.file_name.apply(lambda x: file_name_dict[x])
    train = train.loc[train['multi_label'] != 14].reset_index(drop=True)

    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 8,
        'reg_alpha': 0.,
        'reg_lambda': 1,
        'n_estimators': 50,
        'objective': 'binary',
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'learning_rate': 0.1,
        'min_child_weight': 1,
    }
    feat_arr = [dict_name[c] for c in feat_arr]

    brake = test['液压制动压力_max']
    x = test['x方向振动值_mean']
    y = test['y方向振动值_mean']

    # Each entry: (test-row mask, [(low, high, value), ...]).  A rule sets
    # ``ret = value`` for ids whose prediction satisfies low < pred < high.
    # Only the overrides that actually reached the returned frame in the
    # previous implementation are kept; thresholds tried and discarded there
    # are omitted.
    regions = [
        # original note: 6751 - 6755 test_02.csv 0816
        (_between(brake, 1, 1.32) & (x < -1.5),
         [(None, 0.2, 0)]),
        # original note: 6755 - 6772 submission_3.csv 0816
        (_between(x, 1.2, 3.4) & _between(y, 2, 3),
         [(None, 0.35, 0)]),
        # original note: 6772 - 6773 test.csv 0816
        # NOTE(review): the original also tested brake > 1 and x < 22.05,
        # which are implied by the bounds below - possibly typos; confirm.
        ((brake > 1.32) & (x < -0.3) & _between(y, 0, 0.8),
         [(None, 0.4, 0)]),
        # original note: 6773 - 6816
        # NOTE(review): this region zeroes predictions ABOVE 0.18, unlike the
        # others; kept as found.  The original fitted the model twice here and
        # discarded the first fit; assumes CV fitting has no side effects
        # outside the instance - confirm.
        (_between(x, -0.25, 0.34) & _between(y, 0, 1.8) & (x + 1.2 - y < 0),
         [(0.18, None, 0)]),
        # original note: 6822 - 6834 (only the 0.34-0.35 band is set to 1)
        (_between(x, 1.25, 2) & _between(y, 2, 3.2),
         [(0.34, 0.35, 1)]),
        (_between(y, 0.75, 1.25) & _between(x, 0.25, 0.6) & (x + 0.75 - y > 0),
         [(None, 0.4, 0)]),
        (_between(x, 1.36, 1.7) & _between(y, 1.4, 1.8),
         [(None, 0.27, 0)]),
        (_between(x, 0.8, 1) & _between(y, 0.9, 1.1),
         [(None, 0.39, 0)]),
        ((brake > 1) & _between(x, -0.3, 0.4) & _between(y, -0.3, 0.14)
         & (x + 0.2 - y > 0),
         [(None, 0.3, 0)]),
        (_between(x, 1.8, 3) & _between(y, 1, 2.1),
         [(None, 0.37, 0)]),
        (_between(x, -2, -0.8) & _between(y, -0.9, -0.2),
         [(None, 0.25, 0)]),
        (_between(x, 0, 1) & _between(y, 1.9, 2.5),
         [(None, 0.2, 0), (0.8, None, 1)]),
    ]

    result = result_best.copy()
    for mask, rules in regions:
        scored = _score_region(train, test.loc[mask], feat_arr, lgb_params)
        for low, high, value in rules:
            _override_labels(result, scored, low, high, value)
    return result
def run(data, result_temp):
    """Zero out submission rows that a per-class model confidently rejects.

    Steps:
      1. Split ``data`` into test rows (``ret == -1``) and labelled rows, tag
         the labelled rows with the integer name of the training folder that
         contains their file, and drop class 14.
      2. Fit a multiclass LightGBM classifier on a 70/30 split to assign each
         test row a class and the probability of that class.
      3. For every class, fit a binary ``CV`` ensemble on that class's labelled
         rows and score the test rows assigned to it.
      4. Ids with class probability > 0.999 and binary score < 0.1 get
         ``ret = 0``; all other ids keep the ``ret`` from ``result_temp``.

    Args:
        data: DataFrame with feature columns plus ``ret`` and ``file_name``.
        result_temp: DataFrame with columns ``id`` and ``ret``; not modified.

    Returns:
        DataFrame with columns ``id`` and integer ``ret``.
    """
    test = data.loc[data.ret == -1].reset_index(drop=True)
    data = data.loc[data.ret != -1].reset_index(drop=True)

    # os.path.join works whether or not TRAIN_PATH ends with a separator.
    file_name_dict = {}
    for f1 in os.listdir(config.TRAIN_PATH):
        for f2 in os.listdir(os.path.join(config.TRAIN_PATH, f1)):
            file_name_dict[f2] = int(f1)
    data['multi_label'] = data.file_name.apply(lambda x: file_name_dict[x])
    data = data.loc[data['multi_label'] != 14].reset_index(drop=True)

    clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=10,
                             learning_rate=0.1, n_estimators=100,
                             subsample_for_bin=200000, objective='multiclass',
                             min_child_weight=1, min_child_samples=20,
                             subsample=0.7, subsample_freq=0,
                             colsample_bytree=0.7, reg_alpha=0.0,
                             reg_lambda=0.0, random_state=3)
    train_x, val_x, train_y, val_y = train_test_split(
        data.drop(['file_name', 'ret', 'multi_label'], axis=1),
        data['multi_label'], random_state=3, test_size=0.3)
    # NOTE(review): the ``verbose`` / ``early_stopping_rounds`` fit keywords
    # were removed in LightGBM 4.0; kept as-is for the version this project
    # pins - confirm before upgrading.
    clf.fit(train_x, train_y, verbose=False, early_stopping_rounds=100,
            eval_metric='logloss', eval_set=[(val_x, val_y)])

    feature_cols = train_x.columns.tolist()
    test['multi_label'] = clf.predict(test[feature_cols])
    # Probability of the most likely class for each test row.
    test['prob'] = clf.predict_proba(test[feature_cols]).max(axis=1)

    # Train one binary model per class.
    print('training...')
    result_dict = {}
    result_prob_dict = {}
    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 8,
        'reg_alpha': 0.,
        'reg_lambda': 1,
        'n_estimators': 30,
        'objective': 'binary',
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'learning_rate': 0.1,
        'min_child_weight': 1,
    }
    for class_ in tqdm(list(Counter(data.multi_label).keys())):
        class_train = (data.loc[data.multi_label == class_]
                       .drop(['file_name', 'multi_label'], axis=1)
                       .reset_index(drop=True))
        s = CV(_df=class_train, label_name='ret')
        # A fresh dict per class, as before, in case CV modifies its params.
        s.CV(is_print=False, lgb_params=dict(lgb_params), round_cv=3,
             n_splits=8)

        test_temp = test.loc[test.multi_label == class_].reset_index(drop=True)
        if test_temp.empty:
            # No test row was assigned to this class; avoid handing an empty
            # frame to the model.
            continue
        pred_temp = s.get_result(
            test_temp.drop(['file_name', 'multi_label', 'prob', 'ret'], axis=1))
        for i in range(len(test_temp)):
            name = test_temp['file_name'][i]
            result_dict[name] = pred_temp[i]
            result_prob_dict[name] = test_temp['prob'][i]

    # Both dicts are filled together, so their insertion orders line up.
    df = pd.DataFrame({
        'id': list(result_dict.keys()),
        'ret': list(result_dict.values()),
        'prob': list(result_prob_dict.values()),
    })

    # Ids the class model is almost certain about and the binary model rejects.
    rejected = set(df.loc[np.logical_and(df.prob > 0.999, df.ret < 0.1), 'id'])
    print(len(rejected))

    result = result_temp.copy()
    result['pred_2'] = result['id'].apply(lambda x: 0 if x in rejected else 1)
    result['pred_2'] = result['pred_2'] * result['ret']
    r = result[['id', 'pred_2']].copy()
    r.columns = ['id', 'ret']
    r['ret'] = r['ret'].astype(int)
    return r
def feed(self, arr, d=0.001):
    """Assemble sparse matrices for the columns in ``arr`` and run one CV round.

    Every name in ``arr`` must be a column of ``self.data``.  Columns listed in
    ``config.type_dict`` as ``'onehot'`` are label-encoded in place and one-hot
    encoded; those listed as ``'cv'`` are cast to ``str`` in place and passed
    through a ``CountVectorizer(min_df=20)``; columns absent from
    ``config.type_dict`` are used as raw values.  Rows are split by the label
    column and ``val_tags`` into train (label != -1, tag 0), validation
    (label != -1, tag 1) and test (label == -1).  Scores stored by earlier
    calls are prepended as extra columns.  The averaged ``pred_train`` of the
    fitted ``CV`` object and its predictions for the test and validation
    matrices are appended to the score lists, and the object is kept in
    ``self.c`` / ``self.c_lst``.

    If a ``config.type_dict`` entry is neither ``'cv'`` nor ``'onehot'`` the
    method prints ``'name error'`` and returns ``None`` without fitting.

    Args:
        arr: iterable of column names to use as features.
        d: not used in the body.
    """
    available = set(self.data.columns.tolist())
    for name in arr:
        assert name in available

    def _row_masks():
        # (train, validation, test) boolean masks over self.data.
        has_label = self.data[self.label_name] != -1
        return (
            np.logical_and(has_label, self.data['val_tags'] == 0),
            np.logical_and(has_label, self.data['val_tags'] == 1),
            self.data[self.label_name] == -1,
        )

    def _append_bool(base, extra):
        # Append ``extra`` to the right of ``base`` as a boolean CSR matrix.
        return sparse.hstack((base, extra), format='csr', dtype='bool')

    def _prepend_float(extra, base):
        # Put ``extra`` to the left of ``base`` and cast to float32.
        return sparse.hstack((extra, base), format='csr').astype('float32')

    # Start from zero-column matrices with the right number of rows.
    fit_rows, holdout_rows, unlabelled_rows = _row_masks()
    train_csr = sparse.csr_matrix((int(fit_rows.sum()), 0))
    train_val_csr = sparse.csr_matrix((int(holdout_rows.sum()), 0))
    test_csr = sparse.csr_matrix((int(unlabelled_rows.sum()), 0))

    # Sort the requested columns by how they are to be encoded.
    onehot_cols = []
    count_cols = []
    raw_cols = []
    for name in arr:
        if name not in config.type_dict:
            raw_cols.append(name)
            continue
        kind = config.type_dict[name]
        if kind == 'cv':
            count_cols.append(name)
        elif kind == 'onehot':
            onehot_cols.append(name)
        else:
            print('name error')
            return

    for col in onehot_cols:
        self.data[col] = LabelEncoder().fit_transform(
            self.data[col].astype(str))

    # Masks are rebuilt after the in-place encoding, matching the point at
    # which the three partitions were sliced before.
    fit_rows, holdout_rows, unlabelled_rows = _row_masks()
    fit_part = self.data.loc[fit_rows]
    holdout_part = self.data.loc[holdout_rows]
    test_part = self.data.loc[unlabelled_rows]

    encoder = OneHotEncoder()
    for col in onehot_cols:
        encoder.fit(self.data[col].values.reshape(-1, 1))
        train_csr = _append_bool(
            train_csr, encoder.transform(fit_part[col].values.reshape(-1, 1)))
        train_val_csr = _append_bool(
            train_val_csr,
            encoder.transform(holdout_part[col].values.reshape(-1, 1)))
        test_csr = _append_bool(
            test_csr, encoder.transform(test_part[col].values.reshape(-1, 1)))

    vectorizer = CountVectorizer(min_df=20)
    for col in count_cols:
        self.data[col] = self.data[col].astype(str)
        vectorizer.fit(self.data[col])
        train_csr = _append_bool(
            train_csr, vectorizer.transform(fit_part[col].astype(str)))
        train_val_csr = _append_bool(
            train_val_csr, vectorizer.transform(holdout_part[col].astype(str)))
        test_csr = _append_bool(
            test_csr, vectorizer.transform(test_part[col].astype(str)))

    # Raw-valued columns go in front of the encoded ones.
    train_csr = _prepend_float(sparse.csr_matrix(fit_part[raw_cols]), train_csr)
    train_val_csr = _prepend_float(
        sparse.csr_matrix(holdout_part[raw_cols]), train_val_csr)
    test_csr = _prepend_float(sparse.csr_matrix(test_part[raw_cols]), test_csr)

    # Scores from earlier rounds become additional leading columns.
    for ix, train_scores in enumerate(self.train_score_lst):
        train_csr = _prepend_float(
            sparse.csr_matrix(np.array(train_scores).reshape(-1, 1)),
            train_csr)
        train_val_csr = _prepend_float(
            sparse.csr_matrix(
                np.array(self.train_val_score_lst[ix]).reshape(-1, 1)),
            train_val_csr)
        test_csr = _prepend_float(
            sparse.csr_matrix(
                np.array(self.test_score_lst[ix]).reshape(-1, 1)),
            test_csr)

    # Cross-validation, plugged straight into the existing CV helper.
    lgb_params = {
        'boosting_type': 'gbdt',
        'num_leaves': 200,
        'reg_alpha': 1,
        'reg_lambda': 1,
        'n_estimators': 100000,
        'objective': 'binary',
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'learning_rate': 0.02,
        'min_child_weight': 1,
    }
    c = CV(_df=train_csr, y=fit_part[self.label_name].values,
           random_state=self.random_state, is_val=False)
    c.CV(is_print=True, lgb_params=lgb_params, n_splits=5, round_cv=1)

    # Mean of the per-model training predictions.
    self.train_pred = 0
    for fold in c.MS_arr:
        self.train_pred += np.array(fold['pred_train'])
    self.train_pred /= len(c.MS_arr)

    self.train_score_lst.append(self.train_pred)
    self.test_score_lst.append(c.get_result(test_csr))
    self.train_val_score_lst.append(c.get_result(train_val_csr))
    self.c = c
    self.c_lst.append(c)