def get_combine_data():
    """Load the amended combine dataset and split it into tiny train/test arrays.

    Returns:
        Tuple ``(X_train, y_train, X_test, test_y)`` produced by
        ``common_util.split_df_to_array`` with ``'true'`` as the label column.
    """
    # BUG FIX: the original first assigned df = combine.get_combine_raw(submit=False)
    # and immediately overwrote it with the amended data — the raw load was a
    # dead assignment (leftover alternative) and has been removed.
    df = combine.get_combine_amended(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    # Truncate to very small subsets (10 train rows / 2 test rows) —
    # presumably a quick-experiment setting; confirm before full runs.
    data_util.pd_drop_row_after(train_df, 10)
    data_util.pd_drop_row_after(test_df, 2)
    # (Removed unused local `test_df_with_id = test_df`.)
    label = 'true'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, test_y = common_util.split_df_to_array(test_df, label)
    # accuracy 64.38% (translated from the original Chinese comment)
    return X_train, y_train, X_test, test_y
def _get_train_data(submit=False, data_size=-1, test_size=0.2):
    """Load the happiness competition data as (train_df, test_df).

    Args:
        submit: when True, load the real test file (labels filled with -1)
            for submission; when False, split the training file locally.
        data_size: local mode only — when > 0, truncate the raw training
            data to this many rows before splitting.
        test_size: fraction of rows used as the local test split.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.
    """
    # The training file is read identically in both modes — read it once
    # (the original duplicated this read_csv call in each branch).
    data_dir = r'D:/wks/wks_ml_kaggle/tianchi_happiness/data'
    train_df = pd.read_csv(data_dir + '/happiness_train_complete.csv',
                           na_filter=True, encoding='gbk')
    if submit:
        test_df = pd.read_csv(data_dir + '/happiness_test_complete.csv',
                              na_filter=True, encoding='gbk')
        # The real test set has no labels; -1 is a placeholder.
        test_df['happiness'] = -1
    else:
        # Truncation applies only to the local-experiment path, as before.
        if data_size > 0:
            data_util.pd_drop_row_after(train_df, data_size)
        train_df, test_df = common_util.split_train_test(train_df, test_size)
    return train_df, test_df
def load_train_data(index, size, test_size):
    """Load one of several happiness training files and split it.

    Args:
        index: data source selector — 1 (abbreviated local file),
            3 (full training file), 4 (see NOTE below).
        size: when > 0, truncate the raw data to this many rows.
        test_size: fraction of rows used as the test split.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.

    Raises:
        ValueError: if ``index`` is not one of the supported values.
    """
    if index == 1:
        # Alternative kept from the original source:
        # train_df = pd.read_csv('./data/happiness_train_short.csv', na_filter=True)
        train_df = pd.read_csv('./data/happiness_train_abbr.csv',
                               na_filter=True)
    elif index == 3:
        train_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/data/happiness_train_complete.csv',
            na_filter=True, encoding='gbk')
    elif index == 4:
        # NOTE(review): this loads the *test* file as training data —
        # confirm this is intentional.
        train_df = pd.read_csv('data/happiness_test_complete.csv',
                               na_filter=True, encoding='gbk')
    else:
        # BUG FIX: the original had an empty `""` no-op for index == 2 and no
        # default branch, so train_df stayed unbound and the code crashed
        # later with NameError. Fail fast with a clear error instead.
        raise ValueError('unsupported data index: {}'.format(index))
    if size > 0:
        data_util.pd_drop_row_after(train_df, size)
    train_df, test_df = common_util.split_train_test(train_df, test_size)
    return train_df, test_df
import numpy as np
from tianchi_happiness import deal_data

# --- Script section: build train/test arrays for the happiness task. ---
label = 'happiness'
train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)
train_x, train_y = common_util.split_df_to_array(train_df, label)
test_x, test_y = common_util.split_df_to_array(test_df, label)

# combine all
# Optional experiment path: replace the arrays above with the "combine all"
# dataset (label column 'true'). Disabled by default.
COMBINE_ALL = False
if (COMBINE_ALL):
    from tianchi_happiness import deal_data
    df = deal_data.get_combine_all(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    label = 'true'
    train_x, train_y = common_util.split_df_to_array(train_df, label)
    test_x, test_y = common_util.split_df_to_array(test_df, label)
    # accuracy 60.62% (translated from the original Chinese comment)

# 5-fold CV with a fixed seed for reproducibility.
kfolder = KFold(n_splits=5, shuffle=True, random_state=2019)
# oof_cb = np.zeros(len(train_x))
# predictions_cb = np.zeros(len(test_x))
# kfold = kfolder.split(train_x, train_y)
fold_ = 0
# Gradient-boosting parameters — NOTE: this dict literal continues beyond
# the visible chunk of the file.
param = {
    'boosting_type': 'gbdt',
    'num_leaves': 20,
    'min_data_in_leaf': 20,
# --- Script fragment: assemble first-level model predictions and train a
# second-level (stacking) linear XGBoost model on them. The predictions_*
# arrays and combine_df1 are produced earlier in the file (not shown here).
combine_df1['cb_round'] = predictions_cb_round
combine_df1['cb_num'] = predictions_cb
combine_df1['lgb_round'] = predictions_lgb_round
combine_df1['lgb_num'] = predictions_lgb

combine_df2 = combine_df1.copy()
# Extra features: difference between raw and rounded predictions per model.
combine_df2['diff_cb_round'] = combine_df2['cb_num'] - combine_df2['cb_round']
combine_df2['diff_xgb_num'] = combine_df2['xgb_num'] - combine_df2['xgb']
combine_df2[
    'diff_lgb_round'] = combine_df2['lgb_num'] - combine_df2['lgb_round']

print(
    '-----combine df1 --------------------------------------------------------------------------'
)
# Train/evaluate the stacker on combine_df1 with 'true' as the label column.
train_df, test_df = common_util.split_train_test(combine_df1, test_size=0.2)
train_x_f, train_y_f = common_util.split_df_to_array(train_df, 'true')
test_x_f, test_y_f = common_util.split_df_to_array(test_df, 'true')
linear_xgb_predictions = m_xgboost.run_linear(train_x_f, train_y_f, test_x_f,
                                              test_y_f, params=None)
rate, incorrect_list = common_util.cal_correct_rate(test_y_f,
                                                    linear_xgb_predictions,
                                                    None, True,
                                                    'linear_xgb_predictions')
mse = common_util.cal_mse(test_y_f, linear_xgb_predictions,
                          'linear_xgb_predictions')