Exemple #1
0
 def __init__(self, version_str, start_date, end_date, mock_flag, test_flag):
     """Store the period and mock flag, then build the extract/transform helpers.

     Args:
         version_str: Forwarded to ``_set_folder_path``.
         start_date: Start of the period; stored and passed to both helper
             factories.
         end_date: End of the period; stored and passed to both helper
             factories.
         mock_flag: Stored and passed to ``_get_extract_object``.
         test_flag: Passed to ``mc.return_base_path`` to obtain ``dict_path``.
     """
     self.start_date, self.end_date = start_date, end_date
     self.mock_flag = mock_flag

     # The base path is assigned before the folder setup runs
     # (presumably ``_set_folder_path`` builds on it -- TODO confirm).
     base_path = mc.return_base_path(test_flag)
     self.dict_path = base_path
     self._set_folder_path(version_str)

     extractor = self._get_extract_object(start_date, end_date, mock_flag)
     self.ext = extractor
     transformer = self._get_transform_object(start_date, end_date)
     self.tf = transformer
Exemple #2
0
 def _set_folder_path(self, version_str, model_name, test_flag):
     """Derive the dict/model/pred folders for a version and create them.

     Args:
         version_str: Sub-folder name appended under each of the three
             category folders.
         model_name: Accepted but not used by this method.
         test_flag: Passed to ``mc.return_base_path`` to obtain the base path.
     """
     base_path = mc.return_base_path(test_flag)
     self.dict_path = base_path

     # All three attributes are assigned before any folder is created.
     version_dir = version_str + '/'
     for category in ('dict', 'model', 'pred'):
         setattr(self, category + '_folder',
                 base_path + category + '/' + version_dir)

     for folder in (self.dict_folder, self.model_folder, self.pred_folder):
         mu.create_folder(folder)
Exemple #3
0
 def __init__(self, model_name, version_str, start_date, end_date, test_flag):
     """Store the model identity and period, then set up the columns.

     Args:
         model_name: Stored as ``self.model_name``.
         version_str: Stored and used as the sub-folder name under ``dict/``.
         start_date: Stored as ``self.start_date``.
         end_date: Stored as ``self.end_date``.
         test_flag: Passed to ``mc.return_base_path`` to obtain ``dict_path``.
     """
     self.model_name, self.version_str = model_name, version_str
     self.start_date, self.end_date = start_date, end_date

     base_path = mc.return_base_path(test_flag)
     self.dict_path = base_path
     self.dict_folder = base_path + 'dict/' + version_str + '/'

     # Runs last, after every attribute above has been assigned.
     self._set_columns()
Exemple #4
0
 def __init__(self, model_name, version_str, start_date, end_date,
              mock_flag, test_flag, mode):
     """Store the model identity and period, then build the processing helper.

     Args:
         model_name: Stored; also the sub-folder name under ``model_path``.
         version_str: Stored and forwarded to ``_get_skproc_object``.
         start_date: Stored and forwarded to ``_get_skproc_object``.
         end_date: Stored and forwarded to ``_get_skproc_object``.
         mock_flag: Forwarded to ``_get_skproc_object``.
         test_flag: Passed to ``mc.return_base_path`` and forwarded to
             ``_get_skproc_object``.
         mode: Passed to ``_set_folder_path``.
     """
     self.model_name, self.version_str = model_name, version_str
     self.start_date, self.end_date = start_date, end_date

     base_path = mc.return_base_path(test_flag)
     self.dict_path = base_path

     # ``self.model_path`` is read right after this call, so it is
     # presumably assigned by ``_set_folder_path`` -- TODO confirm.
     self._set_folder_path(mode)
     self.model_folder = self.model_path + model_name + '/'

     processor = self._get_skproc_object(
         version_str, start_date, end_date, model_name, mock_flag, test_flag)
     self.proc = processor
Exemple #5
0
 def __init__(self, version_str, start_date, end_date, model_name, mock_flag, test_flag, obj_column_list):
     """Store the run settings, create the model folder and build the loader.

     Args:
         version_str: Passed to ``_set_folder_path`` and ``_get_load_object``.
         start_date: Stored and forwarded to ``_get_load_object``.
         end_date: Stored and forwarded to ``_get_load_object``.
         model_name: Stored; also the sub-folder name under ``model_path``.
         mock_flag: Forwarded to ``_get_load_object`` and stored.
         test_flag: Passed to ``mc.return_base_path``, forwarded to
             ``_get_load_object`` and stored.
         obj_column_list: Stored as ``self.obj_column_list``.
     """
     self.start_date, self.end_date = start_date, end_date
     self.model_name = model_name

     base_path = mc.return_base_path(test_flag)
     self.dict_path = base_path

     # ``self.model_path`` is read right after this call, so it is
     # presumably assigned by ``_set_folder_path`` -- TODO confirm.
     self._set_folder_path(version_str)
     model_dir = self.model_path + model_name + '/'
     self.model_folder = model_dir
     mu.create_folder(model_dir)

     loader = self._get_load_object(
         version_str, start_date, end_date, mock_flag, test_flag)
     self.ld = loader

     # These are deliberately still assigned only after the loader exists,
     # matching the established initialisation order.
     self.mock_flag, self.test_flag = mock_flag, test_flag
     self.obj_column_list = obj_column_list
class TestBaseTaskLearning(TestBaseCommon):
    """Tests confirming that the learning process can be executed."""
    # Period handed to BaseSkModel in setUp.
    start_date = '2018/01/01'
    end_date = '2018/01/11'
    # Part of the intermediate folder name and handed to BaseSkModel in setUp.
    mode = 'learning'
    model_name = 'raceuma_ens'
    mock_flag = False
    test_flag = True
    # Resolved once, when the class body is executed.
    dict_path = mc.return_base_path(test_flag)
    # When True, test_00_preprocess removes the model/dict/intermediate folders.
    clean_flag = False
    # Classification column and value that select the pickle read by
    # test_20_check_learning_df.
    cls_val = "競走種別コード"
    val = "12"
    # Target column used by test_20_check_learning_df.
    target = "WIN_FLAG"
    # Columns that test_02_check_dimension requires in every filtered frame.
    obj_column_list = ["WIN_FLAG", "JIKU_FLAG", "ANA_FLAG"]
    # Name fragments that test_11_create_feature_select_data looks for in dict_folder.
    obj_column_list_tr = ["WIN_FLAG_tr", "JIKU_FLAG_tr", "ANA_FLAG_tr"]

    def setUp(self):
        """Prepare the model under test and the folders used by each test."""
        model_version = 'base'
        run_label = model_version + '_' + self.mode
        self.intermediate_folder = (self.dict_path + 'intermediate/' +
                                    run_label + '/' + self.model_name + '/')

        model = BaseSkModel(
            self.model_name,
            model_version,
            self.start_date,
            self.end_date,
            self.mock_flag,
            self.test_flag,
            self.mode,
        )
        self.skmodel = model

        # Point the model at the "_test" variant of the table.
        model.set_test_table('地方競馬レース馬' + "_test")
        self._proc_check_folder()

    def test_00_preprocess(self):
        """Pre-test housekeeping: remove the working folders when requested."""
        print("--  " + sys._getframe().f_code.co_name + " start --")
        # The paths are resolved up front, whether or not cleaning is enabled.
        targets = (
            self.skmodel.model_folder,
            self.skmodel.dict_folder,
            self.intermediate_folder,
        )
        if not self.clean_flag:
            return
        for folder in targets:
            shutil.rmtree(folder)
    def test_01_create_learning_data(self):
        """Check that learning_df can be created without problems.

        Nothing is done when ``_learning_all.pkl`` already exists in the
        intermediate folder.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        all_pickle = self.intermediate_folder + '_learning_all.pkl'
        if os.path.exists(all_pickle):
            return

        self.skmodel.create_learning_data()
        learning_df = self.skmodel.learning_df

        # The frame must not be empty.
        self.assertFalse(len(learning_df.index) == 0)

        # Every classification-axis column has to be present.
        required_columns = set(self.skmodel.class_list)
        self.assertTrue(
            self.proc_test_contain_columns_check(learning_df,
                                                 required_columns))

        # Unneeded columns such as the data-division column must be absent.
        unwanted_columns = {'データ区分_x'}
        self.assertTrue(
            self.proc_test_not_contain_columns_check(learning_df,
                                                     unwanted_columns))

        # Persist both frames for the tests that run afterwards.
        frame_to_save = self.skmodel.get_all_learning_df_for_save()
        frame_to_save.to_pickle(all_pickle)
        learning_df.to_pickle(self.intermediate_folder + '_learning.pkl')

    def test_02_check_dimension(self):
        """Check that the data can be split per classification axis.

        Reads ``_learning.pkl`` (written by test_01) from the intermediate
        folder. For every classification column that has no
        ``learning_<column>...`` file yet, the value list is saved as
        ``<column>_list.pkl`` and the frame filtered by the first value is
        validated and saved as ``learning_<column>_<value>.pkl``.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        file_name = self.intermediate_folder + "_learning.pkl"
        contain_columns_set = set(self.obj_column_list)
        contain_not_columns_set = set(["データ区分_x"])
        df_list = os.listdir(self.intermediate_folder)

        # NOTE: pickle is only safe here because the file is produced by this
        # test suite itself; never load pickles from untrusted sources.
        # The handle is closed as soon as the frame is loaded.
        with open(file_name, 'rb') as f:
            df = pickle.load(f)

        for cls_val in self.skmodel.class_list:
            # Skip axes whose filtered frame was already written.
            if any("learning_" + cls_val in s for s in df_list):
                continue

            val_list = self.skmodel.get_val_list(df, cls_val)
            # The value list must not be empty.
            self.assertFalse(len(val_list) == 0)
            val_list.to_pickle(self.intermediate_folder + cls_val +
                               "_list.pkl")

            for val in val_list:
                filter_df = self.skmodel.get_filter_df(df, cls_val, val)
                # The filtered frame must not be empty.
                self.assertFalse(len(filter_df.index) == 0)

                # All required columns have to be present.
                print(contain_columns_set)
                contain_check = self.proc_test_contain_columns_check(
                    filter_df, contain_columns_set)
                self.assertTrue(contain_check)

                # Unneeded columns must be absent from the frame that is
                # about to be saved (previously the unfiltered frame was
                # checked here, leaving the filtered one unverified).
                not_contain_check = self.proc_test_not_contain_columns_check(
                    filter_df, contain_not_columns_set)
                self.assertTrue(not_contain_check)

                filter_df.to_pickle(self.intermediate_folder + "learning_" +
                                    cls_val + "_" + val + ".pkl")
                # Only the first value of each axis is exercised.
                break

    def test_11_create_feature_select_data(self):
        """Check that feature-selection data can be created.

        Uses ``_learning_all.pkl`` from the intermediate folder. The step is
        skipped when dict_folder already holds a file for every name in
        ``obj_column_list_tr``.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        existing_files = os.listdir(self.skmodel.dict_folder)
        missing = [
            name for name in self.obj_column_list_tr
            if not any(name in file_name for file_name in existing_files)
        ]
        if not missing:
            return

        source_path = self.intermediate_folder + "_learning_all.pkl"
        with open(source_path, 'rb') as source:
            learning_df = pickle.load(source)
        self.skmodel.create_featrue_select_data(learning_df)

    def test_20_check_learning_df(self):
        """Exercise the preparation of the data frame used for learning."""
        print("--  " + sys._getframe().f_code.co_name + " start --")
        pickle_path = (self.intermediate_folder + 'learning_' + self.cls_val +
                       '_' + self.val + '.pkl')
        with open(pickle_path, 'rb') as source:
            frame = pickle.load(source)

        model = self.skmodel
        proc = model.proc
        proc.set_ensemble_params(model.clfs, model.index_list,
                                 model.ens_folder_path)
        proc.set_target_flag(self.target)

        # Fill gaps with the column medians, then drop whatever is still NaN:
        # SMOTE cannot handle NaN values.
        frame = frame.fillna(frame.median()).dropna()

        proc.set_learning_data(frame, self.target)
        proc.divide_learning_data()
        proc.load_learning_target_encoding()
        mu.check_df(proc.X_train)

    def test_21_proc_learning_sk_model(self):
        """Check that learning models can be created.

        Uses the ``<column>_list.pkl`` and per-value pickles found in the
        intermediate folder. Only the first value of each classification
        column is processed.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        # TODO: resuming part-way through would be nice, but the file
        # handling needs more thought first.
        self.create_folder()
        work_dir = self.intermediate_folder
        for cls_val in self.skmodel.class_list:
            print(cls_val)
            list_path = self.intermediate_folder + cls_val + "_list.pkl"
            models_for_cls = [
                name
                for name in os.listdir(self.skmodel.model_folder + 'third/')
                if cls_val in name
            ]
            with open(list_path, 'rb') as list_file:
                val_list = pickle.load(list_file)
            candidates = [name for name in os.listdir(work_dir)
                          if cls_val in name]

            for val in val_list:
                print(val)
                models_for_val = [name for name in models_for_cls
                                  if val in name]
                print(models_for_val)
                if len(models_for_val) == len(self.skmodel.obj_column_list):
                    # One model per objective column already exists.
                    print(
                        "-----------------------------\r\n --- skip create learning model -- \r\n"
                    )
                    break

                data_files = [name for name in candidates if val in name]
                print(data_files)
                with open(self.intermediate_folder + data_files[0],
                          'rb') as data_file:
                    df = pickle.load(data_file)
                # Run the learning step.
                self.skmodel.proc_learning_sk_model(df, cls_val, val)
                break
Exemple #7
0
        return proc


# ============================================================================================================

if __name__ == "__main__":
    args = sys.argv
    print("------------- start luigi tasks ----------------")
    print(args)
    print("mode:" + args[1])  # learning or predict
    print("mock flag:" + args[2])  # True or False
    print("test mode:" + args[3])  # True or False
    mode = args[1]
    mock_flag = strtobool(args[2])
    test_flag = strtobool(args[3])
    dict_path = mc.return_base_path(test_flag)
    INTERMEDIATE_FOLDER = dict_path + 'intermediate/' + MODEL_VERSION + '_' + args[
        1] + '/' + MODEL_NAME + '/'
    print("intermediate_folder:" + INTERMEDIATE_FOLDER)

    if mode == "learning":
        if test_flag:
            print("Test mode")
            start_date = '2018/01/01'
            end_date = '2018/01/31'
        else:
            start_date = '2015/01/01'
            end_date = '2018/12/31'
        if mock_flag:
            print("use mock data")
        print("MODE:learning mock_flag:" + str(args[2]) + "  start_date:" +
Exemple #8
0
## Compute an appropriate weighting for each index
## Win index: emphasizes the win-bet return rate and the win rate
## Axis (jiku) index: emphasizes the place-bet return rate and the place rate
## Longshot (ana) index: emphasizes the return rate and hit rate of quinella bets paired with the favorite


# Data retrieval

# Period and mock setting shared by the extract and simulation objects below.
start_date = '2019/01/01'
end_date = '2019/12/31'
mock_flag = False

ext = LBExtract(start_date, end_date, mock_flag)
sim = LBSimulation(start_date, end_date, mock_flag)

# Base path obtained with the flag set to False; intermediate files live under it.
dict_path = mc.return_base_path(False)
intermediate_folder = dict_path + 'intermediate/'

def get_type_df_list(df, type):
    """Split a prediction frame into win / jiku / ana deviation frames.

    The columns ``RACE_KEY``, ``UMABAN`` and ``predict_std`` of ``df`` are
    renamed in place first, so the caller's frame is modified as well.

    Args:
        df: Frame with a ``target`` column plus the three columns above.
        type: Suffix appended to the name of the deviation column.

    Returns:
        A list ``[win_df, jiku_df, ana_df]``; each frame holds the race code,
        the horse number and the deviation column named with ``type``.
    """
    df.rename(columns={"RACE_KEY": "競走コード", "UMABAN": "馬番", "predict_std": "予測値偏差"}, inplace=True)

    def _rows_for(flag):
        # Rows of one target, reduced to the key columns and the deviation.
        subset = df[df["target"] == flag][["競走コード", "馬番", "予測値偏差"]]
        return subset.rename(columns={"予測値偏差": "偏差" + type})

    return [_rows_for(flag) for flag in ("WIN_FLAG", "JIKU_FLAG", "ANA_FLAG")]

"""
print(lb_v1_df.iloc[0])
Exemple #9
0
class TestBaseTaskLearning(TestBaseCommon):
    """Tests confirming that the learning process can be executed."""
    # Period handed to BaseSkModel in setUp.
    start_date = '2018/01/01'
    end_date = '2018/01/11'
    # Part of the intermediate folder name and handed to BaseSkModel in setUp.
    mode = 'learning'
    model_name = 'race_lgm'
    mock_flag = False
    test_flag = True
    # Resolved once, when the class body is executed.
    dict_path = mc.return_base_path(test_flag)
    # When True, test_00_preprocess removes the model/dict/intermediate folders.
    clean_flag = False
    # Target column handed to learning_sk_model in test_21.
    target = "WIN_FLAG"
    obj_column_list = ["WIN_FLAG", "JIKU_FLAG", "ANA_FLAG"]
    # Name fragments that test_11_create_feature_select_data looks for in dict_folder.
    obj_column_list_tr = ["WIN_FLAG_tr", "JIKU_FLAG_tr", "ANA_FLAG_tr"]

    def setUp(self):
        """Prepare the model under test and the folders used by each test."""
        model_version = 'base'
        run_label = model_version + '_' + self.mode
        self.intermediate_folder = (self.dict_path + 'intermediate/' +
                                    run_label + '/' + self.model_name + '/')
        self.skmodel = BaseSkModel(
            self.model_name,
            model_version,
            self.start_date,
            self.end_date,
            self.mock_flag,
            self.test_flag,
            self.mode,
        )
        self._proc_check_folder()

    def test_00_preprocess(self):
        """Pre-test housekeeping: remove the working folders when requested."""
        print("--  " + sys._getframe().f_code.co_name + " start --")
        # The paths are resolved up front, whether or not cleaning is enabled.
        targets = (
            self.skmodel.model_folder,
            self.skmodel.dict_folder,
            self.intermediate_folder,
        )
        if not self.clean_flag:
            return
        for folder in targets:
            shutil.rmtree(folder)


    def test_01_create_learning_data(self):
        """Check that learning_df can be created without problems.

        The frame is saved as ``_learning.pkl`` in the intermediate folder
        for the later tests. When that file already exists the step is
        skipped. The guard previously looked for ``_learning_df.pkl``, a
        name this method never writes, so it could not detect its own
        earlier output.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        learning_pickle = self.intermediate_folder + '_learning.pkl'
        if os.path.exists(learning_pickle):
            return

        self.skmodel.create_learning_data()
        df = self.skmodel.learning_df

        # The frame must not be empty.
        self.assertFalse(len(df.index) == 0)

        # Every classification-axis column has to be present.
        contain_columns_set = set(self.skmodel.class_list)
        contain_check = self.proc_test_contain_columns_check(df, contain_columns_set)
        self.assertTrue(contain_check)

        # Unneeded columns such as the data-division column must be absent.
        contain_not_columns_set = set(['データ区分_x'])
        not_contain_check = self.proc_test_not_contain_columns_check(df, contain_not_columns_set)
        self.assertTrue(not_contain_check)

        # Persist the frame for the tests that run afterwards.
        df.to_pickle(learning_pickle)


    def test_11_create_feature_select_data(self):
        """Check that feature-selection data can be created.

        Uses the frame saved by test_01 as ``_learning.pkl``. It previously
        opened ``_learning_df.pkl``, which no step in this class writes. The
        step is skipped when dict_folder already holds a file for every name
        in ``obj_column_list_tr``.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        dict_list = os.listdir(self.skmodel.dict_folder)
        # Run only when at least one expected name has no matching file.
        needs_run = any(
            not any(check in s for s in dict_list)
            for check in self.obj_column_list_tr
        )
        if not needs_run:
            return

        file_name = self.intermediate_folder + "_learning.pkl"
        # NOTE: pickle is only safe here because the file is produced by this
        # test suite itself; never load pickles from untrusted sources.
        with open(file_name, 'rb') as f:
            learning_df = pickle.load(f)
        self.skmodel.create_featrue_select_data(learning_df)

    def test_21_proc_learning_sk_model(self):
        """Check that the learning model can be created.

        Uses ``_learning.pkl`` from the intermediate folder.
        """
        print("--  " + sys._getframe().f_code.co_name + " start --")
        self.create_folder()
        learning_pickle = self.intermediate_folder + '_learning.pkl'
        with open(learning_pickle, 'rb') as source:
            learning_df = pickle.load(source)
            # Run the learning step for the configured target.
            print(self.target)
            self.skmodel.proc.learning_sk_model(learning_df, self.target)