def get_iris_data(test_size=0.2):
    """Load the iris data and split it into train/test frames and arrays.

    Parameters
    ----------
    test_size : float
        Fraction of rows reserved for the test split (default 0.2).

    Returns
    -------
    tuple
        (df_train, df_test, train_X, test_X, train_y, test_y).
        When the test split is empty, ``test_X`` and ``test_y`` are None.
    """
    df_train, df_test = __get_iris_data(test_size=test_size)
    train_X, train_y = common_util.split_df_to_array(df_train, label_column='label')
    # Bug fix: test_X/test_y were only bound inside the if-branch, so an
    # empty test split raised UnboundLocalError at the return statement.
    test_X, test_y = None, None
    if len(df_test) > 0:
        test_X, test_y = common_util.split_df_to_array(df_test, label_column='label')
    return df_train, df_test, train_X, test_X, train_y, test_y
def get_data():
    """Load the happiness dataset, truncate it, and return train/test arrays.

    Returns (X_train, y_train, X_test, test_y).
    """
    train_df, test_df, test_df_with_id, features = deal_data.getData(
        submit=False)
    # NOTE(review): truncation sizes (3000/600) look like a speed-up for
    # local experiments — confirm before a real submission run.
    for frame, keep_rows in ((train_df, 3000), (test_df, 600),
                             (test_df_with_id, 600)):
        data_util.pd_drop_row_after(frame, keep_rows)
    label = 'happiness'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, test_y = common_util.split_df_to_array(test_df, label)
    return X_train, y_train, X_test, test_y
def get_combine_data():
    """Load the amended combined dataset and return train/test arrays.

    Returns
    -------
    tuple
        (X_train, y_train, X_test, test_y), split 80/20 and truncated to
        10 train / 2 test rows (debug-sized).
    """
    # Bug fix: the original first called combine.get_combine_raw(submit=False)
    # and immediately overwrote the result with the amended variant, wasting
    # an entire data load; only the amended dataset is actually used.
    # (The unused local `test_df_with_id = test_df` was dropped as well.)
    df = combine.get_combine_amended(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    data_util.pd_drop_row_after(train_df, 10)
    data_util.pd_drop_row_after(test_df, 2)
    label = 'true'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, test_y = common_util.split_df_to_array(test_df, label)
    # accuracy 64.38%
    return X_train, y_train, X_test, test_y
import lightgbm as lgb
import pandas as pd
from common_kaggle import common_util
from sklearn.model_selection import KFold, RepeatedKFold
import numpy as np
from tianchi_happiness import deal_data

# Load the happiness train/test split and convert to feature/label arrays.
label = 'happiness'
train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)
train_x, train_y = common_util.split_df_to_array(train_df, label)
test_x, test_y = common_util.split_df_to_array(test_df, label)

# Optionally replace the data with the combined "all" dataset.
COMBINE_ALL = False
if COMBINE_ALL:
    # (the redundant re-import of deal_data here was a no-op and was removed)
    df = deal_data.get_combine_all(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    label = 'true'
    train_x, train_y = common_util.split_df_to_array(train_df, label)
    test_x, test_y = common_util.split_df_to_array(test_df, label)
    # accuracy 60.62%

# 5-fold cross-validation setup.
kfolder = KFold(n_splits=5, shuffle=True, random_state=2019)
# oof_cb = np.zeros(len(train_x))
# predictions_cb = np.zeros(len(test_x))
# kfold = kfolder.split(train_x, train_y)
fold_ = 0
import xgboost as xgb
from mSklearn import m_xgboost
import pandas as pd
from common_kaggle import common_util
from sklearn.model_selection import KFold, RepeatedKFold
import numpy as np
from sklearn.metrics import mean_squared_error
from tianchi_happiness import deal_data
from tianchi_happiness.analyze_linear import combine

# Base happiness dataset -> feature/label arrays.
train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)
label = 'happiness'
X_train, y_train = common_util.split_df_to_array(train_df, label)
X_test, test_y = common_util.split_df_to_array(test_df, label)

# Optionally switch to the combined dataset.
COMBINE_ALL = True
if COMBINE_ALL:
    # df = combine.get_combine_raw(submit=False)  # accuracy 64.06% MSE 0.4625
    # accuracy 61.88% MSE 0.5844 / accuracy 61.88% MSE 0.5188
    df = combine.get_combine_amended(submit=False)
    train_df, test_df = common_util.split_train_test(df, test_size=0.2)
    label = 'true'
    X_train, y_train = common_util.split_df_to_array(train_df, label)
    X_test, test_y = common_util.split_df_to_array(test_df, label)
    # accuracy 64.38%
# Show all rows when printing DataFrames.
# NOTE(review): `pd` (and `mean_squared_error` below) are used without a
# visible import in this chunk — presumably imported earlier in the file;
# confirm.
pd.set_option('display.max_rows', None)

from datetime import datetime
from common_kaggle import common_util
from common_kaggle import mathUtil
from tianchi_happiness import deal_data

train_df, test_df, test_df_with_id, features = deal_data.getData(submit=False)
train = train_df
test = test_df

# Submission frame keyed by id, keeping the true label alongside for scoring.
test_sub = pd.DataFrame()
test_sub['id'] = test_df_with_id['id']
v = test['happiness']  # NOTE(review): unused; kept to preserve behavior
test_sub['true'] = test['happiness']

X_train, y_train = common_util.split_df_to_array(train, 'happiness')
X_test, y_test = common_util.split_df_to_array(test, 'happiness')
X_train_ = pd.DataFrame(X_train)
y_train_ = pd.DataFrame(y_train)
X_test_ = pd.DataFrame(X_test)
y_test_ = pd.DataFrame(y_test)


# Custom evaluation function for xgboost's feval interface.
def myFeval(preds, xgbtrain):
    """Return ('myFeval', mse) comparing preds to the DMatrix labels."""
    labels = xgbtrain.get_label()
    return 'myFeval', mean_squared_error(labels, preds)


##### xgb
def standard_run():
    """Train a neural model on the (cached) happiness data and print accuracy.

    Loads the prepared train/test frames (from a CSV cache when ``load`` is
    True), scales them, converts them to tensors, trains for ``train_num``
    rounds and prints the per-round correct rate on the test set.

    Side effects: reads/writes CSV caches under tianchi_happiness/tmp and
    prints results to stdout.
    """
    label = 'happiness'
    # Bug fix: `data_size = -1` was commented out in the original while still
    # being referenced below, so the load == False branch raised NameError.
    data_size = -1
    load = False
    load = True  # flip to False to rebuild the CSV caches from scratch
    if not load:
        train_df, test_df = deal_data.get_train_data_and_deal_data(
            submit=False, data_size=data_size, keep_id=True)
        train_df.to_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/train_df_minus.csv')
        test_df.to_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/test_df_minus.csv')
    else:
        train_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/train_df_minus.csv')
        test_df = pd.read_csv(
            r'D:/wks/wks_ml_kaggle/tianchi_happiness/tmp/test_df_minus.csv')

    # Scale all feature columns into [-1, 1]; label and id are left alone.
    train_df, test_df = common_util.scale_data(train_df=train_df,
                                               max_limit=1,
                                               min_limit=-1,
                                               labels=[label, 'id'],
                                               test_df=test_df)
    if True:  # secondary scaling pass plus column pruning
        train_df, del_col_names = scale_data2(train_df, label)
        test_df, xx = scale_data2(test_df, label)
        train_df.drop(columns=del_col_names, inplace=True)
        test_df.drop(columns=del_col_names, inplace=True)

    train_df_with_id_ = train_df.copy()
    test_df_with_id = test_df.copy()
    train_df.drop(columns=['id'], inplace=True)
    test_df.drop(columns=['id'], inplace=True)
    train_x, train_y = common_util.split_df_to_array(train_df, label)
    test_x, test_y = common_util.split_df_to_array(test_df, label)

    train_x = tf.convert_to_tensor(train_x, dtype=tf.float32)
    train_y = tf.convert_to_tensor(train_y, dtype=tf.int32)
    train_y = tf.one_hot(train_y, depth=5)  # 5 happiness classes
    test_x = tf.convert_to_tensor(test_x, dtype=tf.float32)

    # Combine x and y into a dataset and split it into batches.
    train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
    batch_num = 32
    train_data = train_data.batch(batch_num)

    num_classes = 5
    dims = train_x.shape[1]
    model = mModels.get_model_13(dims, num_classes)
    # Alternatives kept for experimentation:
    # model = mModels.get_model_11(dims, num_classes)
    # model = mModels.get_model_10(dims, num_classes)
    # model = mModels.get_model_8(dims, num_classes)
    # model = mModels.get_model_9(dims, num_classes)
    # model = mModels.get_model_4(dims, num_classes)
    # model = mModels.get_model_5(dims, num_classes)
    # model = mModels.get_model_7(dims, num_classes)
    # model = mModels.get_model_1(num_classes, dims, dims)

    rate_result = []
    train_num = 2
    for i in range(train_num):
        # Halfway through training, lower the Adam learning rate.
        if i >= train_num / 2:
            model = mModels.amend_adam_rate(model, 0.001)
        for step, (x, y) in enumerate(train_data):
            # Bug fix: the original wrapped this in
            # `with tf.GradientTape() as type:`, shadowing the builtin `type`;
            # the tape was never used (model.fit manages its own gradients),
            # so the context manager was removed.
            x = tf.reshape(x, (-1, dims))
            model.fit(x=x, y=y)
        predict = model.predict(test_x)
        correct_rate, incorrect_list = cal_correct_rate(test_y, predict, None)
        rate_result.append((i, correct_rate))
        print(rate_result)
    print('batch_num:', batch_num)