def __init__(self, version_str, start_date, end_date, mock_flag, test_flag):
    """Store the run settings and build the extract / transform helpers.

    Args:
        version_str: Version label forwarded to ``_set_folder_path``.
        start_date: Start of the target period; also passed to both helper
            factories.
        end_date: End of the target period; also passed to both helper
            factories.
        mock_flag: Kept on the instance and forwarded to
            ``_get_extract_object``.
        test_flag: Forwarded to ``mc.return_base_path`` to resolve the base
            path.
    """
    self.start_date, self.end_date = start_date, end_date
    self.mock_flag = mock_flag

    # The base path is resolved before the folder layout is derived from it.
    base_path = mc.return_base_path(test_flag)
    self.dict_path = base_path
    self._set_folder_path(version_str)

    extractor = self._get_extract_object(start_date, end_date, mock_flag)
    self.ext = extractor
    transformer = self._get_transform_object(start_date, end_date)
    self.tf = transformer
def _set_folder_path(self, version_str, model_name, test_flag):
    """Derive the dict / model / pred folders for a version and create them.

    Args:
        version_str: Version label used as the last path component of each
            folder.
        model_name: Not used by this method; kept for interface
            compatibility.
        test_flag: Forwarded to ``mc.return_base_path`` to resolve the base
            path.
    """
    base_path = mc.return_base_path(test_flag)
    self.dict_path = base_path

    version_tail = version_str + '/'
    self.dict_folder = base_path + 'dict/' + version_tail
    self.model_folder = base_path + 'model/' + version_tail
    self.pred_folder = base_path + 'pred/' + version_tail

    # All three attributes are assigned before any folder is created.
    for folder in (self.dict_folder, self.model_folder, self.pred_folder):
        mu.create_folder(folder)
def __init__(self, model_name, version_str, start_date, end_date, test_flag):
    """Record the model settings, resolve the dict folder and set up columns.

    Args:
        model_name: Name of the model, stored on the instance.
        version_str: Version label; stored and used as the dict sub-folder.
        start_date: Start of the target period, stored on the instance.
        end_date: End of the target period, stored on the instance.
        test_flag: Forwarded to ``mc.return_base_path`` to resolve the base
            path.
    """
    self.model_name, self.version_str = model_name, version_str
    self.start_date, self.end_date = start_date, end_date

    base_path = mc.return_base_path(test_flag)
    self.dict_path = base_path
    self.dict_folder = base_path + 'dict/' + version_str + '/'

    # Column definitions are prepared last, once every attribute is set.
    self._set_columns()
def __init__(self, model_name, version_str, start_date, end_date, mock_flag,
             test_flag, mode):
    """Record the model settings, resolve folders and build the proc object.

    Args:
        model_name: Name of the model; stored and used as the model
            sub-folder.
        version_str: Version label; stored and forwarded to
            ``_get_skproc_object``.
        start_date: Start of the target period.
        end_date: End of the target period.
        mock_flag: Forwarded to ``_get_skproc_object``.
        test_flag: Forwarded to ``mc.return_base_path`` and
            ``_get_skproc_object``.
        mode: Forwarded to ``_set_folder_path``.
    """
    self.model_name, self.version_str = model_name, version_str
    self.start_date, self.end_date = start_date, end_date
    self.dict_path = mc.return_base_path(test_flag)

    # ``model_path`` is read right after this call.
    # NOTE(review): presumably ``_set_folder_path`` is what sets it; confirm.
    self._set_folder_path(mode)
    model_root = self.model_path
    self.model_folder = model_root + model_name + '/'

    self.proc = self._get_skproc_object(
        version_str,
        start_date,
        end_date,
        model_name,
        mock_flag,
        test_flag,
    )
def __init__(self, version_str, start_date, end_date, model_name, mock_flag,
             test_flag, obj_column_list):
    """Record the settings, create the model folder and build the loader.

    Args:
        version_str: Version label forwarded to ``_set_folder_path`` and
            ``_get_load_object``.
        start_date: Start of the target period.
        end_date: End of the target period.
        model_name: Name of the model; stored and used as the model
            sub-folder.
        mock_flag: Stored and forwarded to ``_get_load_object``.
        test_flag: Stored and forwarded to ``mc.return_base_path`` and
            ``_get_load_object``.
        obj_column_list: Stored on the instance as given.
    """
    self.start_date, self.end_date = start_date, end_date
    self.model_name = model_name
    self.dict_path = mc.return_base_path(test_flag)

    # ``model_path`` is read right after this call.
    # NOTE(review): presumably ``_set_folder_path`` is what sets it; confirm.
    self._set_folder_path(version_str)
    model_folder = self.model_path + model_name + '/'
    self.model_folder = model_folder
    mu.create_folder(model_folder)

    self.ld = self._get_load_object(
        version_str, start_date, end_date, mock_flag, test_flag)

    # The remaining settings are stored only after the loader exists.
    self.mock_flag, self.test_flag = mock_flag, test_flag
    self.obj_column_list = obj_column_list
class TestBaseTaskLearning(TestBaseCommon):
    """Check that the learning pipeline can be executed.

    The numbered tests build on each other: later tests read pickles that
    earlier tests wrote into ``intermediate_folder``.
    """
    start_date = '2018/01/01'
    end_date = '2018/01/11'
    mode = 'learning'
    model_name = 'raceuma_ens'
    mock_flag = False
    test_flag = True
    dict_path = mc.return_base_path(test_flag)
    # When True, test_00 deletes the model / dict / intermediate folders.
    clean_flag = False
    # Classification axis and value used by test_20 to pick one pickle.
    cls_val = "競走種別コード"
    val = "12"
    target = "WIN_FLAG"
    obj_column_list = ["WIN_FLAG", "JIKU_FLAG", "ANA_FLAG"]
    obj_column_list_tr = ["WIN_FLAG_tr", "JIKU_FLAG_tr", "ANA_FLAG_tr"]

    def setUp(self):
        """Prepare the model under test and the folders the tests use."""
        model_version = 'base'
        table_name = '地方競馬レース馬'
        self.intermediate_folder = (
            self.dict_path + 'intermediate/' + model_version + '_' +
            self.mode + '/' + self.model_name + '/')
        self.skmodel = BaseSkModel(
            self.model_name, model_version, self.start_date, self.end_date,
            self.mock_flag, self.test_flag, self.mode)
        self.skmodel.set_test_table(table_name + "_test")
        self._proc_check_folder()

    def test_00_preprocess(self):
        """Pre-processing before the real tests (e.g. cleaning folders)."""
        print("-- " + sys._getframe().f_code.co_name + " start --")
        if self.clean_flag:
            shutil.rmtree(self.skmodel.model_folder)
            shutil.rmtree(self.skmodel.dict_folder)
            shutil.rmtree(self.intermediate_folder)

    def test_01_create_learning_data(self):
        """Check that the learning dataframe can be created and saved."""
        print("-- " + sys._getframe().f_code.co_name + " start --")
        all_pickle = self.intermediate_folder + '_learning_all.pkl'
        # Skip when a previous run already produced the data.
        # NOTE(review): the whole remainder is treated as guarded by this
        # check; confirm against the original indentation.
        if os.path.exists(all_pickle):
            return

        self.skmodel.create_learning_data()
        df = self.skmodel.learning_df
        # The dataframe must not be empty.
        self.assertGreater(len(df.index), 0)

        # The classification-axis columns must be present.
        contain_columns_set = set(self.skmodel.class_list)
        self.assertTrue(
            self.proc_test_contain_columns_check(df, contain_columns_set))

        # Merge leftovers such as the data-category column must be absent.
        contain_not_columns_set = {'データ区分_x'}
        self.assertTrue(
            self.proc_test_not_contain_columns_check(
                df, contain_not_columns_set))

        # Save the data for the subsequent tests.
        save_learning_df = self.skmodel.get_all_learning_df_for_save()
        save_learning_df.to_pickle(all_pickle)
        df.to_pickle(self.intermediate_folder + '_learning.pkl')

    def test_02_check_dimension(self):
        """Check that the data can be split per classification axis.

        Uses the ``_learning.pkl`` file written by test_01.
        """
        print("-- " + sys._getframe().f_code.co_name + " start --")
        file_name = self.intermediate_folder + "_learning.pkl"
        contain_columns_set = set(self.obj_column_list)
        contain_not_columns_set = {"データ区分_x"}
        existing_files = os.listdir(self.intermediate_folder)
        with open(file_name, 'rb') as f:
            df = pickle.load(f)

        for cls_val in self.skmodel.class_list:
            # Skip axes that already have a split file from an earlier run.
            prefix = "learning_" + cls_val
            if any(prefix in name for name in existing_files):
                continue

            val_list = self.skmodel.get_val_list(df, cls_val)
            # The list of values must not be empty.
            self.assertGreater(len(val_list), 0)
            val_list.to_pickle(
                self.intermediate_folder + cls_val + "_list.pkl")

            for val in val_list:
                filter_df = self.skmodel.get_filter_df(df, cls_val, val)
                # The filtered dataframe must not be empty.
                self.assertGreater(len(filter_df.index), 0)

                # The objective columns must be present.
                print(contain_columns_set)
                self.assertTrue(
                    self.proc_test_contain_columns_check(
                        filter_df, contain_columns_set))

                # Unwanted columns must be absent. This is checked on the
                # filtered frame because that is the one pickled below.
                self.assertTrue(
                    self.proc_test_not_contain_columns_check(
                        filter_df, contain_not_columns_set))

                filter_df.to_pickle(
                    self.intermediate_folder + "learning_" + cls_val + "_" +
                    val + ".pkl")
                # NOTE(review): stops after the first value of each axis;
                # confirm this is the intended scope.
                break

    def test_11_create_feature_select_data(self):
        """Check that feature-selection data can be created.

        Uses the ``_learning_all.pkl`` file written by test_01. Nothing is
        done when a dict file already exists for every transformed objective
        column.
        """
        print("-- " + sys._getframe().f_code.co_name + " start --")
        dict_list = os.listdir(self.skmodel.dict_folder)
        missing_any = any(
            not any(check in s for s in dict_list)
            for check in self.obj_column_list_tr)
        if missing_any:
            file_name = self.intermediate_folder + "_learning_all.pkl"
            with open(file_name, 'rb') as f:
                learning_df = pickle.load(f)
            self.skmodel.create_featrue_select_data(learning_df)

    def test_20_check_learning_df(self):
        """Inspect the dataframe that is fed to the learner."""
        print("-- " + sys._getframe().f_code.co_name + " start --")
        file_name = (self.intermediate_folder + 'learning_' + self.cls_val +
                     '_' + self.val + '.pkl')
        with open(file_name, 'rb') as f:
            df = pickle.load(f)
        proc = self.skmodel.proc
        proc.set_ensemble_params(
            self.skmodel.clfs, self.skmodel.index_list,
            self.skmodel.ens_folder_path)
        proc.set_target_flag(self.target)
        df = df.fillna(df.median())
        # SMOTE cannot process rows that still contain NaN.
        df = df.dropna()
        proc.set_learning_data(df, self.target)
        proc.divide_learning_data()
        proc.load_learning_target_encoding()
        X_train = self.skmodel.proc.X_train
        mu.check_df(X_train)

    def test_21_proc_learning_sk_model(self):
        """Check that learning models can be built.

        Uses the per-axis pickles written by test_02.
        """
        print("-- " + sys._getframe().f_code.co_name + " start --")
        # TODO: resuming part-way would need the file handling reworked.
        self.create_folder()
        for cls_val in self.skmodel.class_list:
            print(cls_val)
            file_name = self.intermediate_folder + cls_val + "_list.pkl"
            created_model_list = [
                s for s in os.listdir(self.skmodel.model_folder + 'third/')
                if cls_val in s
            ]
            with open(file_name, 'rb') as f:
                val_list = pickle.load(f)
            tr_list = [
                s for s in os.listdir(self.intermediate_folder)
                if cls_val in s
            ]
            for val in val_list:
                print(val)
                created_model_list_val = [
                    s for s in created_model_list if val in s
                ]
                print(created_model_list_val)
                if len(created_model_list_val) == len(
                        self.skmodel.obj_column_list):
                    print(
                        "-----------------------------\r\n --- skip create learning model -- \r\n"
                    )
                else:
                    data_file_name = [s for s in tr_list if val in s]
                    print(data_file_name)
                    # Fail with a readable message instead of an IndexError
                    # when test_02 produced no pickle for this value.
                    self.assertTrue(
                        data_file_name,
                        "no intermediate pickle found for " + cls_val +
                        " / " + val)
                    with open(self.intermediate_folder + data_file_name[0],
                              'rb') as f:
                        df = pickle.load(f)
                    # Run the learning step.
                    self.skmodel.proc_learning_sk_model(df, cls_val, val)
                    # NOTE(review): stops after the first value that needed
                    # training; confirm this is the intended scope.
                    break
return proc # ============================================================================================================ if __name__ == "__main__": args = sys.argv print("------------- start luigi tasks ----------------") print(args) print("mode:" + args[1]) # learning or predict print("mock flag:" + args[2]) # True or False print("test mode:" + args[3]) # True or False mode = args[1] mock_flag = strtobool(args[2]) test_flag = strtobool(args[3]) dict_path = mc.return_base_path(test_flag) INTERMEDIATE_FOLDER = dict_path + 'intermediate/' + MODEL_VERSION + '_' + args[ 1] + '/' + MODEL_NAME + '/' print("intermediate_folder:" + INTERMEDIATE_FOLDER) if mode == "learning": if test_flag: print("Test mode") start_date = '2018/01/01' end_date = '2018/01/31' else: start_date = '2015/01/01' end_date = '2018/12/31' if mock_flag: print("use mock data") print("MODE:learning mock_flag:" + str(args[2]) + " start_date:" +
## 各指数の適切な配分を計算する ## 勝ち指数:単勝回収率・勝率を重視 ## 軸指数:複勝回収率・複勝率を重視 ## 穴指数:1番人気との馬連の回収率・的中率を重視 # データ取得 start_date = '2019/01/01' end_date = '2019/12/31' mock_flag = False ext = LBExtract(start_date, end_date, mock_flag) sim = LBSimulation(start_date, end_date, mock_flag) dict_path = mc.return_base_path(False) intermediate_folder = dict_path + 'intermediate/' def get_type_df_list(df, type): df.rename(columns={"RACE_KEY": "競走コード", "UMABAN": "馬番", "predict_std": "予測値偏差"}, inplace=True) win_df = df[df["target"] == "WIN_FLAG"][["競走コード", "馬番", "予測値偏差"]].rename( columns={"予測値偏差": "偏差" + type}) jiku_df = df[df["target"] == "JIKU_FLAG"][["競走コード", "馬番", "予測値偏差"]].rename( columns={"予測値偏差": "偏差" + type}) ana_df = df[df["target"] == "ANA_FLAG"][["競走コード", "馬番", "予測値偏差"]].rename( columns={"予測値偏差": "偏差" + type}) return [win_df, jiku_df, ana_df] """ print(lb_v1_df.iloc[0])
class TestBaseTaskLearning(TestBaseCommon):
    """Check that the learning pipeline can be executed.

    The numbered tests build on each other: later tests read the pickle that
    test_01 wrote into ``intermediate_folder``.
    """
    start_date = '2018/01/01'
    end_date = '2018/01/11'
    mode = 'learning'
    model_name = 'race_lgm'
    mock_flag = False
    test_flag = True
    dict_path = mc.return_base_path(test_flag)
    # When True, test_00 deletes the model / dict / intermediate folders.
    clean_flag = False
    target = "WIN_FLAG"
    obj_column_list = ["WIN_FLAG", "JIKU_FLAG", "ANA_FLAG"]
    obj_column_list_tr = ["WIN_FLAG_tr", "JIKU_FLAG_tr", "ANA_FLAG_tr"]
    # Single name for the pickle shared between tests, so that the writer
    # (test_01) and the readers (test_11, test_21) cannot drift apart.
    learning_pickle_name = '_learning.pkl'

    def setUp(self):
        """Prepare the model under test and the folders the tests use."""
        model_version = 'base'
        self.intermediate_folder = (
            self.dict_path + 'intermediate/' + model_version + '_' +
            self.mode + '/' + self.model_name + '/')
        self.skmodel = BaseSkModel(
            self.model_name, model_version, self.start_date, self.end_date,
            self.mock_flag, self.test_flag, self.mode)
        self._proc_check_folder()

    def test_00_preprocess(self):
        """Pre-processing before the real tests (e.g. cleaning folders)."""
        print("-- " + sys._getframe().f_code.co_name + " start --")
        if self.clean_flag:
            shutil.rmtree(self.skmodel.model_folder)
            shutil.rmtree(self.skmodel.dict_folder)
            shutil.rmtree(self.intermediate_folder)

    def test_01_create_learning_data(self):
        """Check that the learning dataframe can be created and saved."""
        print("-- " + sys._getframe().f_code.co_name + " start --")
        learning_pickle = self.intermediate_folder + self.learning_pickle_name
        # Skip when a previous run already produced the data. The guard now
        # looks at the file this test writes; it used to look for
        # '_learning_df.pkl', which nothing in this class creates.
        # NOTE(review): the whole remainder is treated as guarded by this
        # check; confirm against the original indentation.
        if os.path.exists(learning_pickle):
            return

        self.skmodel.create_learning_data()
        df = self.skmodel.learning_df
        # The dataframe must not be empty.
        self.assertGreater(len(df.index), 0)

        # The classification-axis columns must be present.
        contain_columns_set = set(self.skmodel.class_list)
        self.assertTrue(
            self.proc_test_contain_columns_check(df, contain_columns_set))

        # Merge leftovers such as the data-category column must be absent.
        contain_not_columns_set = {'データ区分_x'}
        self.assertTrue(
            self.proc_test_not_contain_columns_check(
                df, contain_not_columns_set))

        # Save the data for the subsequent tests.
        df.to_pickle(learning_pickle)

    def test_11_create_feature_select_data(self):
        """Check that feature-selection data can be created.

        Uses the pickle written by test_01. Nothing is done when a dict file
        already exists for every transformed objective column.
        """
        print("-- " + sys._getframe().f_code.co_name + " start --")
        dict_list = os.listdir(self.skmodel.dict_folder)
        missing_any = any(
            not any(check in s for s in dict_list)
            for check in self.obj_column_list_tr)
        if missing_any:
            # Read the file test_01 actually writes.
            file_name = self.intermediate_folder + self.learning_pickle_name
            with open(file_name, 'rb') as f:
                learning_df = pickle.load(f)
            self.skmodel.create_featrue_select_data(learning_df)

    def test_21_proc_learning_sk_model(self):
        """Check that the learning model can be built.

        Uses the pickle written by test_01.
        """
        print("-- " + sys._getframe().f_code.co_name + " start --")
        self.create_folder()
        file_name = self.intermediate_folder + self.learning_pickle_name
        with open(file_name, 'rb') as f:
            df = pickle.load(f)
        # Run the learning step.
        print(self.target)
        self.skmodel.proc.learning_sk_model(df, self.target)