Code example #1
def get_iris_data(test_size=0.2):
    df_train, df_test = __get_iris_data(test_size=test_size)
    train_X, train_y = common_util.split_df_to_array(df_train,
                                                     label_column='label')
    # Default the test arrays so the return below is always defined,
    # even when the test split is empty.
    test_X, test_y = None, None
    if len(df_test) > 0:
        test_X, test_y = common_util.split_df_to_array(df_test,
                                                       label_column='label')
    return df_train, df_test, train_X, test_X, train_y, test_y
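# --- Hypothetical sketch, not the original helper -------------------------
# __get_iris_data is not shown above; a minimal version might build a
# DataFrame from scikit-learn's bundled iris data with a 'label' column and
# split it with train_test_split. Everything below is an assumption.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


def __get_iris_data(test_size=0.2):
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    if test_size <= 0:
        # No hold-out requested: return an empty test frame.
        return df, df.iloc[0:0]
    df_train, df_test = train_test_split(df, test_size=test_size,
                                         random_state=0)
    return df_train, df_test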
Code example #2
def get_data():
    train_df, test_df, test_df_with_id, features = deal_data.getData(
        submit=False)

    # Truncate each frame to a fixed number of rows.
    data_util.pd_drop_row_after(train_df, 3000)
    data_util.pd_drop_row_after(test_df, 600)
    data_util.pd_drop_row_after(test_df_with_id, 600)

    label = 'happiness'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, y_test = common_util.split_df_to_array(test_df, label)
    return X_train, y_train, X_test, y_test
Code example #3
def get_combine_data():
    # df = combine.get_combine_raw(submit=False)  # raw variant, not used
    df = combine.get_combine_amended(submit=False)

    train_df, test_df = common_util.split_train_test(df, test_size=0.2)

    data_util.pd_drop_row_after(train_df, 10)
    data_util.pd_drop_row_after(test_df, 2)

    test_df_with_id = test_df
    label = 'true'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, y_test = common_util.split_df_to_array(test_df, label)
    # Accuracy 64.38%
    return X_train, y_train, X_test, y_test
Code example #4
import lightgbm as lgb
import pandas as pd
from common_kaggle import common_util
from sklearn.model_selection import KFold, RepeatedKFold
import numpy as np
from tianchi_happiness import deal_data

label = 'happiness'
train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)

train_x, train_y = common_util.split_df_to_array(train_df, label)
test_x, test_y = common_util.split_df_to_array(test_df, label)

# Optionally train/evaluate on the combined data set instead.
COMBINE_ALL = False
if COMBINE_ALL:
    df = deal_data.get_combine_all(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    label = 'true'
    train_x, train_y = common_util.split_df_to_array(train_df, label)
    test_x, test_y = common_util.split_df_to_array(test_df, label)
    # Accuracy 60.62%

kfolder = KFold(n_splits=5, shuffle=True, random_state=2019)
# oof_cb = np.zeros(len(train_x))
# predictions_cb = np.zeros(len(test_x))
# kfold = kfolder.split(train_x, train_y)
fold_ = 0
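# --- Sketch of the k-fold loop this setup appears to lead into ------------
# The snippet stops after creating the KFold splitter; the commented-out
# oof_cb / predictions_cb buffers suggest an out-of-fold LightGBM loop like
# the one below. The params dict and boosting rounds are assumptions, not
# values taken from the original code.
oof = np.zeros(len(train_x))
predictions = np.zeros(len(test_x))
params = {'objective': 'regression', 'metric': 'mse', 'learning_rate': 0.05}

for fold_, (trn_idx, val_idx) in enumerate(kfolder.split(train_x, train_y)):
    trn_data = lgb.Dataset(train_x[trn_idx], label=train_y[trn_idx])
    val_data = lgb.Dataset(train_x[val_idx], label=train_y[val_idx])
    model = lgb.train(params, trn_data, num_boost_round=1000,
                      valid_sets=[trn_data, val_data])
    oof[val_idx] = model.predict(train_x[val_idx])
    # Average the test-set predictions over the folds.
    predictions += model.predict(test_x) / kfolder.n_splits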
Code example #5
import xgboost as xgb
from mSklearn import m_xgboost
import pandas as pd
from common_kaggle import common_util
from sklearn.model_selection import KFold, RepeatedKFold
import numpy as np
from sklearn.metrics import mean_squared_error

from tianchi_happiness import deal_data
from tianchi_happiness.analyze_linear import combine
train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)
label = 'happiness'
X_train, y_train = common_util.split_df_to_array(train_df, label)
X_test, test_y = common_util.split_df_to_array(test_df, label)

# Optionally switch to the combined data set.
COMBINE_ALL = True
if COMBINE_ALL:
    # df = combine.get_combine_raw(submit=False)
    # Accuracy 64.06%, MSE 0.4625

    df = combine.get_combine_amended(submit=False)
    # Accuracy 61.88%, MSE 0.5844; accuracy 61.88%, MSE 0.5188

    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    label = 'true'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, test_y = common_util.split_df_to_array(test_df, label)
    # Accuracy 64.38%
Code example #6
import pandas as pd
import xgboost as xgb
from datetime import datetime
from sklearn.metrics import mean_squared_error

from common_kaggle import common_util
from common_kaggle import mathUtil
from tianchi_happiness import deal_data

# Show all rows when printing DataFrames.
pd.set_option('display.max_rows', None)

train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)
train = train_df
test = test_df
test_sub = pd.DataFrame()
test_sub['id'] = test_df_with_id['id']
test_sub['true'] = test['happiness']

X_train, y_train = common_util.split_df_to_array(train, 'happiness')
X_test, y_test = common_util.split_df_to_array(test, 'happiness')
X_train_ = pd.DataFrame(X_train)
y_train_ = pd.DataFrame(y_train)
X_test_ = pd.DataFrame(X_test)
y_test_ = pd.DataFrame(y_test)


# Custom evaluation function: MSE between predictions and true labels.
def myFeval(preds, xgbtrain):
    label = xgbtrain.get_label()
    score = mean_squared_error(label, preds)
    return 'myFeval', score


##### xgb
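# --- Sketch of the xgb training step this marker leads into ---------------
# The example is cut off at "##### xgb". A minimal way to plug the custom
# myFeval metric into xgb.train is shown below; the parameter values and
# round count are assumptions, not the original settings.
xgb_params = {'objective': 'reg:squarederror', 'eta': 0.05, 'max_depth': 6}

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

model = xgb.train(xgb_params, dtrain, num_boost_round=500,
                  evals=[(dtrain, 'train'), (dtest, 'eval')],
                  feval=myFeval, verbose_eval=100)

# Predictions for the hold-out frame built above.
test_sub['pred'] = model.predict(dtest)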
Code example #7
import pandas as pd
import tensorflow as tf

from common_kaggle import common_util
from tianchi_happiness import deal_data
# mModels, scale_data2 and cal_correct_rate are project helpers defined
# elsewhere in this module's package.


def standard_run():
    label = 'happiness'
    #
    data_size = -1
    # Toggle: True reads the cached CSVs below, False rebuilds them first.
    load = True
    #

    if not load:
        train_df, test_df = deal_data.get_train_data_and_deal_data(
            submit=False, data_size=data_size, keep_id=True)
        train_df.to_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/train_df_minus.csv')
        test_df.to_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/test_df_minus.csv')
    else:
        train_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/train_df_minus.csv')
        test_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/test_df_minus.csv')

    train_df, test_df = common_util.scale_data(train_df=train_df,
                                               max_limit=1,
                                               min_limit=-1,
                                               labels=[label, 'id'],
                                               test_df=test_df)

    # Second scaling pass; drops the columns scale_data2 flags for removal.
    if True:
        train_df, del_col_names = scale_data2(train_df, label)
        test_df, _ = scale_data2(test_df, label)
        train_df.drop(columns=del_col_names, inplace=True)
        test_df.drop(columns=del_col_names, inplace=True)

    train_df_with_id_ = train_df.copy()
    test_df_with_id = test_df.copy()
    #
    train_df.drop(columns=['id'], inplace=True)
    test_df.drop(columns=['id'], inplace=True)

    train_x, train_y = common_util.split_df_to_array(train_df, label)
    test_x, test_y = common_util.split_df_to_array(test_df, label)

    train_x = tf.convert_to_tensor(train_x, dtype=tf.float32)
    train_y = tf.convert_to_tensor(train_y, dtype=tf.int32)
    train_y = tf.one_hot(train_y, depth=5)

    test_x = tf.convert_to_tensor(test_x, dtype=tf.float32)

    # Zip x and y together and split them into batches.
    train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
    batch_num = 32  # batch size
    train_data = train_data.batch(batch_num)
    num_classes = 5
    dims = train_x.shape[1]

    # model = mModels.get_model_11(dims,num_classes)
    model = mModels.get_model_13(dims, num_classes)
    # model = mModels.get_model_10(dims,num_classes)
    # model = mModels.get_model_8(dims,num_classes)
    # model = mModels.get_model_9(dims,num_classes)
    # model = mModels.get_model_4(dims,num_classes)
    # model = mModels.get_model_5(dims,num_classes)
    # model = mModels.get_model_7(dims,num_classes)

    # model = mModels.get_model_1(num_classes, dims, dims)

    rate_result = []
    train_num = 2
    for i in range(train_num):
        # Lower the Adam learning rate for the second half of the runs.
        if i >= train_num / 2:
            model = mModels.amend_adam_rate(model, 0.001)
        for step, (x, y) in enumerate(train_data):
            # model.fit computes and applies gradients internally, so no
            # explicit GradientTape is needed around the reshape.
            x = tf.reshape(x, (-1, dims))
            model.fit(x=x, y=y)
        predict = model.predict(test_x)

        correct_rate, incorrect_list = cal_correct_rate(test_y, predict, None)
        rate_result.append((i, correct_rate))

    print(rate_result)
    print('batch_num:', batch_num)