Code Example #1
def get_combine_data():
    # note: the raw frame is loaded but immediately overwritten by the amended one
    df = combine.get_combine_raw(submit=False)
    df = combine.get_combine_amended(submit=False)

    # hold out 20% of the rows for evaluation
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)

    # limit both frames to their first rows (10 train, 2 test)
    data_util.pd_drop_row_after(train_df, 10)
    data_util.pd_drop_row_after(test_df, 2)

    test_df_with_id = test_df  # keep a handle to the full test frame (unused below)
    label = 'true'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, y_test = common_util.split_df_to_array(test_df, label)
    # accuracy: 64.38%
    return X_train, y_train, X_test, y_test
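The function above only prepares the arrays; a minimal usage sketch follows, assuming scikit-learn is available and the returned arrays are plain numeric matrices. The classifier choice is an illustration, not part of the original project.

# Hypothetical usage of get_combine_data() (assumption: scikit-learn installed)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, y_train, X_test, y_test = get_combine_data()

clf = LogisticRegression(max_iter=1000)  # any classifier would do here
clf.fit(X_train, y_train)
acc = accuracy_score(y_test, clf.predict(X_test))
print('accuracy: %.2f%%' % (acc * 100))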
Code Example #2
def _get_train_data(submit=False, data_size=-1, test_size=0.2):
    if submit:
        # Submission mode: load the official train and test files;
        # the test set has no 'happiness' label, so mark it with -1.
        train_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/data/happiness_train_complete.csv',
            na_filter=True,
            encoding='gbk')
        test_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/data/happiness_test_complete.csv',
            na_filter=True,
            encoding='gbk')
        test_df['happiness'] = -1
    else:
        # Local evaluation mode: split the training file into train/test parts.
        train_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/data/happiness_train_complete.csv',
            na_filter=True,
            encoding='gbk')
        if data_size > 0:
            data_util.pd_drop_row_after(train_df, data_size)
        train_df, test_df = common_util.split_train_test(train_df, test_size)
    return train_df, test_df
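For reference, a hedged usage sketch of _get_train_data in its two modes; it assumes pandas and the project's data_util / common_util modules are importable and that the CSV paths above exist.

# Hypothetical usage of _get_train_data (paths and project modules assumed available)
# Local evaluation: keep the first 2000 rows, then hold out 20% of them.
train_df, test_df = _get_train_data(submit=False, data_size=2000, test_size=0.2)
print(train_df.shape, test_df.shape)

# Submission mode: official train/test files; the test labels are -1 placeholders.
train_df, test_df = _get_train_data(submit=True)
assert (test_df['happiness'] == -1).all()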
Code Example #3
def load_train_data(index, size, test_size):
    if index == 1:
        # abbreviated training file
        # train_df=pd.read_csv('./data/happiness_train_short.csv',na_filter=True)
        train_df = pd.read_csv('./data/happiness_train_abbr.csv',
                               na_filter=True)
    elif index == 2:
        raise NotImplementedError('data source 2 is not implemented')
    elif index == 3:
        # complete training file (GBK encoded)
        train_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/data/happiness_train_complete.csv',
            na_filter=True,
            encoding='gbk')
    elif index == 4:
        # complete test file (GBK encoded)
        train_df = pd.read_csv('data/happiness_test_complete.csv',
                               na_filter=True,
                               encoding='gbk')
    else:
        raise ValueError('unknown data source index: %d' % index)

    if size > 0:
        data_util.pd_drop_row_after(train_df, size)

    train_df, test_df = common_util.split_train_test(train_df, test_size)

    return train_df, test_df
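The helper data_util.pd_drop_row_after is used throughout these snippets without its definition; judging from the call sites (its return value is ignored and the second argument looks like a row count), it seems to truncate a DataFrame in place. A possible equivalent, offered purely as an assumption about its behaviour:

import pandas as pd

def pd_drop_row_after(df, n):
    """Hypothetical stand-in: drop every row after the first n rows, in place."""
    df.drop(df.index[n:], inplace=True)

# quick check
demo = pd.DataFrame({'a': range(100)})
pd_drop_row_after(demo, 10)
assert len(demo) == 10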
Code Example #4
import numpy as np
from sklearn.model_selection import KFold

from tianchi_happiness import deal_data

label = 'happiness'
train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)

train_x, train_y = common_util.split_df_to_array(train_df, label)
test_x, test_y = common_util.split_df_to_array(test_df, label)

# combine all: optionally train on the combined data set instead
COMBINE_ALL = False
if COMBINE_ALL:
    df = deal_data.get_combine_all(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    label = 'true'
    train_x, train_y = common_util.split_df_to_array(train_df, label)
    test_x, test_y = common_util.split_df_to_array(test_df, label)
    # accuracy: 60.62%

kfolder = KFold(n_splits=5, shuffle=True, random_state=2019)
# oof_cb = np.zeros(len(train_x))
# predictions_cb = np.zeros(len(test_x))
# kfold = kfolder.split(train_x, train_y)
fold_ = 0

param = {
    'boosting_type': 'gbdt',
    'num_leaves': 20,
    'min_data_in_leaf': 20,
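The parameter dict above is cut off in this snippet, and the commented-out oof_cb / predictions_cb arrays hint at an out-of-fold training loop that is not shown. A minimal sketch of that pattern follows; it assumes LightGBM is installed, that param is a complete LightGBM parameter dict, and that train_x / train_y / test_x are numpy arrays from the code above.

# Hypothetical out-of-fold loop over the 5 folds defined by kfolder
import lightgbm as lgb

oof = np.zeros(len(train_x))
predictions = np.zeros(len(test_x))

for fold_, (trn_idx, val_idx) in enumerate(kfolder.split(train_x, train_y)):
    trn_data = lgb.Dataset(train_x[trn_idx], label=train_y[trn_idx])
    val_data = lgb.Dataset(train_x[val_idx], label=train_y[val_idx])
    model = lgb.train(param, trn_data, num_boost_round=1000,
                      valid_sets=[trn_data, val_data])
    # out-of-fold predictions for the validation rows of this fold
    oof[val_idx] = model.predict(train_x[val_idx])
    # average the test predictions over the folds
    predictions += model.predict(test_x) / kfolder.n_splits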
Code Example #5
# collect the first-stage model predictions (raw and rounded) into one frame
combine_df1['cb_round'] = predictions_cb_round
combine_df1['cb_num'] = predictions_cb
combine_df1['lgb_round'] = predictions_lgb_round
combine_df1['lgb_num'] = predictions_lgb

# add the gaps between the raw and rounded predictions as extra features
combine_df2 = combine_df1.copy()
combine_df2['diff_cb_round'] = combine_df2['cb_num'] - combine_df2['cb_round']
combine_df2['diff_xgb_num'] = combine_df2['xgb_num'] - combine_df2['xgb']
combine_df2['diff_lgb_round'] = combine_df2['lgb_num'] - combine_df2['lgb_round']

print(
    '-----combine df1 --------------------------------------------------------------------------'
)

# split the combined-prediction frame and train a second-level linear XGBoost model on it
train_df, test_df = common_util.split_train_test(combine_df1, test_size=0.2)
train_x_f, train_y_f = common_util.split_df_to_array(train_df, 'true')
test_x_f, test_y_f = common_util.split_df_to_array(test_df, 'true')

linear_xgb_predictions = m_xgboost.run_linear(train_x_f,
                                              train_y_f,
                                              test_x_f,
                                              test_y_f,
                                              params=None)

# evaluate the second-level predictions: accuracy and MSE against the true labels
rate, incorrect_list = common_util.cal_correct_rate(test_y_f,
                                                    linear_xgb_predictions,
                                                    None, True,
                                                    'linear_xgb_predictions')
mse = common_util.cal_mse(test_y_f, linear_xgb_predictions,
                          'linear_xgb_predictions')
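common_util.cal_correct_rate and common_util.cal_mse are project helpers whose definitions are not shown here. Under the assumption that the former compares rounded predictions against the true labels and the latter is an ordinary mean squared error, simplified equivalents would look roughly like this (the extra arguments in the calls above, such as the label string, are omitted):

# Hypothetical, simplified equivalents of the metric helpers (assumptions only)
import numpy as np

def cal_correct_rate(y_true, y_pred):
    y_round = np.rint(np.asarray(y_pred)).astype(int)
    incorrect = list(np.flatnonzero(y_round != np.asarray(y_true)))
    rate = 1.0 - len(incorrect) / len(y_true)
    return rate, incorrect

def cal_mse(y_true, y_pred):
    return float(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))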