def _regression_setup(self, combined_data):
     regression.setup(
         combined_data.sample(frac=1),  # shuffle the data before setup
         target=self.task.target,
         test_data=self.test_data,
         fold_strategy="kfold",  # TODO allow more strategies as hyperparam
         silent=True,
         verbose=False)
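A hedged, standalone sketch of the same shuffled-setup pattern, assuming PyCaret 2.x (the silent keyword was removed in 3.x) and its bundled insurance demo dataset:

from pycaret.datasets import get_data
from pycaret import regression

df = get_data("insurance")                   # small bundled demo dataset
train = df.sample(frac=0.8, random_state=0)  # hold out 20% as test_data
test = df.drop(train.index)

regression.setup(
    train.sample(frac=1, random_state=0),    # shuffle the rows
    target="charges",
    test_data=test,
    fold_strategy="kfold",
    silent=True,
    verbose=False)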
Example #2
def exploratory_experiment(df, target, target_type='R'):
    '''
    Func: exploratory machine-learning experiment
    In:   df          -- DataFrame holding the feature columns
          target      -- dependent variable (column name)
          target_type -- 'R' for a continuous target, 'C' for a categorical one
    '''
    if target_type in ('R', 'r'):
        from pycaret.regression import compare_models, setup
    elif target_type in ('C', 'c'):
        from pycaret.classification import compare_models, setup
    else:
        raise ValueError("target_type must be 'R' (regression) or 'C' (classification)")
    setup(df, target)
    compare_models()
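A hedged usage sketch, driving the helper with PyCaret's bundled insurance data, whose continuous target column is charges:

from pycaret.datasets import get_data

df = get_data('insurance')
exploratory_experiment(df, target='charges', target_type='R')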
def test():
    from pycaret.datasets import get_data

    data = get_data("boston")
    from pycaret.regression import setup, create_model, tune_model

    s = setup(data, target="medv", silent=True, html=False, session_id=123)
    gbr = create_model("gbr")
    tuned_gbr = tune_model(gbr)
    xgboost = create_model("xgboost")
    tuned_xgboost = tune_model(xgboost)
    lightgbm = create_model("lightgbm")
    tuned_lightgbm = tune_model(lightgbm)
    # smoke test: reaching this point without an exception is the pass condition
    assert 1 == 1
Example #4
def run_pycaret(name, df_train, df_test, acc_func, target):
    # Map the user-supplied accuracy function onto a PyCaret metric name
    # (falls back to 'Accuracy' when nothing matches).
    pycaret_acc_func_str = 'Accuracy'
    for pycaret_metric in [
            'Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC',
            'MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE'
    ]:
        if pycaret_metric.lower() in str(acc_func).lower():
            pycaret_acc_func_str = pycaret_metric

    import traceback
    task_type = 'classification'
    if pycaret_acc_func_str in ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']:
        task_type = 'regression'
        from pycaret.regression import setup, compare_models, predict_model, blend_models, stack_models, automl, create_model
    else:
        from pycaret.classification import setup, compare_models, predict_model, blend_models, stack_models, automl, create_model

    setup_return = setup(data=df_train, target=target)

    top_models = compare_models(n_select=3,
                                verbose=False,
                                sort=pycaret_acc_func_str,
                                turbo=True,
                                blacklist=['catboost', 'xgboost'])
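    # note: later PyCaret releases renamed compare_models' 'blacklist' argument to 'exclude'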

    # Ensemble the top models and optimize the resulting model
    blender = blend_models(estimator_list=top_models, verbose=False)
    stacker = stack_models(estimator_list=top_models,
                           meta_model=top_models[0],
                           verbose=False)
    best_model = automl(optimize=pycaret_acc_func_str)

    df_test_dropped = df_test.drop(columns=[target])

    predictions = predict_model(best_model, data=df_test_dropped)

    try:
        accuracy = acc_func(list(predictions['Label']), list(df_test[target]))
    except Exception as e:
        traceback.print_exc()
        print(f'Exception computing accuracy (1): {e}')
        if task_type == 'classification':
            accuracy = acc_func([str(x) for x in list(predictions['Label'])],
                                [str(x) for x in list(df_test[target])])
        elif task_type == 'regression':
            accuracy = acc_func([float(x) for x in list(predictions['Label'])],
                                [float(x) for x in list(df_test[target])])

    return accuracy
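A hedged usage sketch for run_pycaret, assuming PyCaret 2.x (the old blacklist argument) and scikit-learn; the CSV path and the charges column are illustrative. The wrapper is deliberately named mae so the substring match above resolves to the MAE metric:

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def mae(y_pred, y_true):
    # function name contains 'mae', so run_pycaret selects regression + MAE
    return mean_absolute_error(y_true, y_pred)

df = pd.read_csv('insurance.csv')  # hypothetical input file
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
score = run_pycaret('demo', df_train, df_test, mae, target='charges')
print(score)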
Example #5
    def exec(self):

        log.info('[START] {}'.format("exec"))

        # h2o.init()

        try:

            if (platform.system() == 'Windows'):

                # option settings
                sysOpt = {
                    # start/end dates
                    'srtDate': '2019-01-01',
                    'endDate': '2021-12-31',
                    'isOverWrite': True
                    # , 'isOverWrite': False
                }

                globalVar['inpPath'] = 'E:/DATA'
                globalVar['outPath'] = 'E:/DATA'

            else:

                # option settings
                sysOpt = {
                    # start/end dates
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # 'isOverWrite': True,
                    'isOverWrite': False
                }

            isDlModelInit = False

            inpPosFile = '{}/{}'.format(globalVar['cfgPath'],
                                        'stnInfo/GA_STN_INFO.xlsx')
            posData = pd.read_excel(inpPosFile, engine='openpyxl')
            posDataL1 = posData[['id', 'lat', 'lon']]

            modelDirKeyList = ['AI_2Y']
            # modelDirKeyList = ['AI_1Y6M']
            # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

            for k, modelDirKey in enumerate(modelDirKeyList):
                log.info("[CHECK] modelDirKey : {}".format(modelDirKey))

                for i, posInfo in posDataL1.iterrows():

                    posId = int(posInfo['id'])
                    posLat = posInfo['lat']
                    posLon = posInfo['lon']

                    if (not re.search('17', str(posId))): continue
                    # if (re.search('17|50|51|58|60|67|72|81|85|87', str(posId))): continue

                    log.info('[CHECK] posId : {}'.format(posId))

                    # break
                    inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                        globalVar['outPath'], 'FOR', serviceName, posId,
                        'final', 'proc', 'for')
                    fileList = sorted(glob.glob(inpFile))

                    # skip when no input file is found
                    if fileList is None or len(fileList) < 1:
                        log.error('[ERROR] inpFile : {} / {}'.format(
                            inpFile, 'please check the input data.'))
                        continue

                    fileInfo = fileList[0]
                    inpData = pd.read_excel(fileInfo, engine='openpyxl')

                    # replace physically impossible negative readings with NaN
                    inpData.loc[inpData['CA_TOT'] < 0, 'CA_TOT'] = np.nan
                    inpData.loc[inpData['WS'] < 0, 'WS'] = np.nan
                    inpData.loc[inpData['WD'] < 0, 'WD'] = np.nan
                    inpData.loc[inpData['SWR'] < 0, 'SWR'] = np.nan
                    inpData.loc[inpData['pv'] < 0, 'pv'] = np.nan

                    inpDataL1 = inpData.dropna().reset_index(drop=True)
                    inpDataL1 = inpDataL1.sort_values(by=['dtDateKst'], axis=0)

                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d')].index.to_numpy()[0]
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-11-30', format='%Y-%m-%d')].index.to_numpy()
                    idxInfo = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime(
                            '2021-06-01', format='%Y-%m-%d')].index.to_numpy()
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2022-01-01', format='%Y-%m-%d')].index.to_numpy()

                    if (len(idxInfo) < 1): continue
                    idx = idxInfo[0]

                    # 7 days, 15 days, 1 month, 3 months, 6 months, 2 years
                    if (modelDirKey == 'AI_2Y'):
                        # full data set
                        # trainData = inpDataL1

                        # split the data at the 2021 boundary
                        trainData, testData = inpDataL1[0:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    elif (modelDirKey == 'AI_7D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=7)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_15D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=15)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_1M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=1)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_3M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=3)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_6M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=6)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    log.info('[CHECK] len(trainData) : {}'.format(
                        len(trainData)))
                    log.info('[CHECK] len(testData) : {}'.format(
                        len(testData)))
                    log.info('[CHECK] trainData : {} - {}'.format(
                        trainData['dtDateKst'].min(),
                        trainData['dtDateKst'].max()))
                    # log.info('[CHECK] testData : {} - {}'.format(trainData['testData'].min(), trainData['testData'].max()))

                    # trainData['year'] = trainData['dtDateKst'].dt.strftime('%Y').astype('int64')
                    # trainData['month'] = trainData['dtDateKst'].dt.strftime('%m').astype('int64')
                    # trainData['day'] = trainData['dtDateKst'].dt.strftime('%d').astype('int64')
                    # trainData['hour'] = trainData['dtDateKst'].dt.strftime('%H').astype('int64')

                    trainDataL1 = trainData[[
                        'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR',
                        'pv', 'sza', 'aza', 'et'
                    ]]
                    # trainDataL1 = trainData[['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                    # trainDataL1.describe()

                    # trainDataL1 = trainDataL1.loc[(trainDataL1['CA_TOT'] == 0)]

                    # CA_TOT = 0 (total cloud cover)

                    # plt.scatter(trainData['dtDateKst'], trainData['CA_TOT'])
                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])
                    # plt.scatter(trainData['pv'], trainData['SWR'])
                    # plt.scatter(trainDataL1['CA_TOT'], trainDataL1['SWR'])
                    # plt.show()

                    # trainDataL1 = trainData[['dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv']]
                    #     # )[['dtDate', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CLD', 'CF', 'SWR', 'pv']]
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CF', 'CLD', 'SWR', 'pv']].dropna()
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'CF', 'CLD', 'SWR', 'pv']]
                    #

                    # import pandas as pd
                    # from autofeat import AutoFeatClassifier
                    # from sklearn.model_selection import train_test_split
                    # from sklearn.datasets import load_breast_cancer
                    # from sklearn.linear_model import LogisticRegression
                    # from sklearn.metrics import accuracy_score, confusion_matrix
                    # #
                    # # load_breast_cancer = load_breast_cancer(as_frame=True)
                    # # X = load_breast_cancer.data
                    # # y = load_breast_cancer.target
                    # # trainData, testData
                    # model = AutoFeatClassifier(verbose=1)
                    # X_train_feature_creation = model.fit_transform(trainData, testData)
                    #
                    # import pandas as pd  # base library
                    # # from prophet import Prophet  # Prophet
                    # from neuralprophet import NeuralProphet  # NeuralProphet
                    # from sklearn.metrics import mean_absolute_error  # MAE evaluation metric
                    # from statistics import mean  # mean calculation
                    # import matplotlib.pyplot as plt  # plotting
                    #
                    # df1_nprophet_model = NeuralProphet(seasonality_mode='multiplicative')
                    # df1_nprophet_model_result = df1_nprophet_model.fit(trainData, freq="H")
                    # trainData['ds'] = trainData['dtDateKst']

                    # **********************************************************************************************************
                    # TEST
                    # **********************************************************************************************************

                    # trainData

                    plt.scatter(trainData['dtDateKst'], trainData['pv'])
                    plt.show()

                    from pmdarima import auto_arima
                    import statsmodels.tsa.api as tsa
                    import statsmodels.api as sm
                    # arima_model = auto_arima(y_to_train, seasonal=True, m=7)

                    sxmodel = auto_arima(trainData[['pv']],
                                         exogenous=trainData[['SWR']],
                                         start_p=1,
                                         start_q=1,
                                         test='adf',
                                         max_p=3,
                                         max_q=3,
                                         m=12,
                                         start_P=0,
                                         seasonal=True,
                                         d=None,
                                         D=1,
                                         trace=True,
                                         error_action='ignore',
                                         suppress_warnings=True,
                                         stepwise=True)
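                    # Hedged note: auto_arima above runs a stepwise search for a
                    # seasonal ARIMA (SARIMAX) order on 'pv' with 'SWR' as an
                    # exogenous regressor; sxmodel.summary() reports the chosen order.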
                    # Fit model
                    # arima_exog_model = auto_arima(y=trainData['pv'], exogenous=trainData['SWR'], seasonal=True, m=7)
                    # Forecast
                    # y_arima_exog_forecast = arima_exog_model.predict(n_periods=365, exogenous=exog_to_test)

                    trainData.index = trainData['dtDateKst']

                    # import pmdarima as pm
                    # y = pm.datasets.load_wineind()
                    #
                    # from pmdarima.model_selection import train_test_split
                    # import numpy as np
                    # train, test = train_test_split(y, train_size=150)
                    #
                    #
                    # auto_arima = auto_arima(
                    #     y=trainData['pv'].values,
                    #     # X=trainData['dtDatekst', 'law']],
                    #     X=trainData['dtDateKst'].values,
                    #     #                   stepwise=False,
                    #     seasonal=True,
                    #     max_order=5,
                    #     m=12,
                    #     approximation=False,
                    #     information_criterion='aic')
                    #
                    # from darts.models import (
                    #     NaiveSeasonal,
                    #     NaiveDrift,
                    #     Prophet,
                    #     ExponentialSmoothing,
                    #     ARIMA,
                    #     AutoARIMA,
                    #     StandardRegressionModel,
                    #     Theta,
                    #     FFT
                    # )
                    #
                    # from darts import TimeSeries
                    # series = TimeSeries.from_dataframe(trainData, time_col='dtDateKst', value_cols='pv', fill_missing_dates=True, freq='H')
                    #
                    #
                    # # for model in (
                    # #         NaiveSeasonal,
                    # #         NaiveDrift,
                    # #         Prophet,
                    # #         ExponentialSmoothing,
                    # #         ARIMA,
                    # #         AutoARIMA,
                    # #         # StandardRegressionModel, -> needs train_n_points at initialization
                    # #         Theta,
                    # #         FFT
                    # #     )
                    # m = model()
                    # m.fit(trainData)
                    # pred = m.predict(len(val))

                    # **********************************************************************************************************
                    # Machine learning
                    # **********************************************************************************************************
                    # time series
                    # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63
                    #
                    # saveCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, "trainDataL4")
                    # # trainDataL4.to_csv(saveCsvFile, index=False)
                    # log.info('[CHECK] saveCsvFile : {}'.format(saveCsvFile))
                    #
                    # trainDataL4 = pd.read_csv(saveCsvFile)
                    # trainDataL4.describe()

                    saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'pycaret', 'for', '*')
                    saveMlModelList = sorted(glob.glob(saveMlModel),
                                             reverse=True)

                    # only train when no saved model exists
                    # if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                    if (len(saveMlModelList) < 1):
                        pyModel = setup(data=trainDataL1,
                                        session_id=123,
                                        silent=True,
                                        target='pv')

                        # automated ML: compare candidate models
                        modelList = compare_models(sort='RMSE', n_select=3)

                        # ensemble model
                        blendModel = blend_models(estimator_list=modelList,
                                                  fold=10)

                        # tune the ensemble
                        tuneModel = tune_model(blendModel,
                                               fold=10,
                                               choose_better=True)

                        # finalized model
                        fnlModel = finalize_model(tuneModel)

                        # save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'pycaret', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        save_model(fnlModel, saveModel)
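                        # Hedged note: the saved pipeline can be restored later with
                        # pycaret.regression.load_model(saveModel) and scored via predict_model.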

                    # **********************************************************************************************************
                    # Deep learning
                    # **********************************************************************************************************
                    saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'h2o', 'for', '*')
                    saveDlModelList = sorted(glob.glob(saveDlModel),
                                             reverse=True)

                    # only train when no saved model exists
                    if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):

                        if not isDlModelInit:
                            h2o.init()
                            isDlModelInit = True

                        # dnModel = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=123)
                        # 2022-03-29
                        # dnModel = H2OAutoML(max_models=2, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel = H2OAutoML(max_models=20,
                                            max_runtime_secs=99999,
                                            balance_classes=True,
                                            seed=123)

                        # java.lang.OutOfMemoryError: Java heap space
                        # dnModel = H2OAutoML(max_models=None, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=30, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel.train(x=[
                            'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS',
                            'SWR', 'sza', 'aza', 'et'
                        ],
                                      y='pv',
                                      training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                        # trainSet, validSet = np.split(trainDataL1, [int(0.70 * len(trainDataL1))])
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSetL1), validation_frame=h2o.H2OFrame(validSetL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2), validation_frame=h2o.H2OFrame(testData))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSet), validation_frame=h2o.H2OFrame(validSet))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2))

                        # save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)

                        # h2o.save_model(model=dnModel.get_best_model(), path=os.path.dirname(saveModel), filename=os.path.basename(saveModel), force=True)
                        dnModel.get_best_model().save_mojo(
                            path=os.path.dirname(saveModel),
                            filename=os.path.basename(saveModel),
                            force=True)
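                        # Hedged note: a MOJO exported this way can be re-imported for
                        # scoring with h2o.import_mojo() on a running H2O cluster.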

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
    def exec(self):

        log.info('[START] {}'.format("exec"))

        # import pandas as pd
        # import numpy as np

        h2o.init()

        try:
            if (platform.system() == 'Windows'):

                # option settings
                sysOpt = {
                    # start/end dates
                    'srtDate': '2021-10-01',
                    'endDate': '2021-11-01',
                    'isOverWrite': True
                    # , 'isOverWrite': False
                }

                globalVar['inpPath'] = 'E:/DATA'
                globalVar['outPath'] = 'E:/DATA'

            else:

                # option settings
                sysOpt = {
                    # start/end dates
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # 'isOverWrite': True,
                    'isOverWrite': False
                }

            inpPosFile = '{}/{}'.format(globalVar['cfgPath'],
                                        'stnInfo/GA_STN_INFO.xlsx')
            posData = pd.read_excel(inpPosFile, engine='openpyxl')
            posDataL1 = posData[['id', 'lat', 'lon']]

            modelDirKeyList = ['AI_2Y']
            # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']
            for k, modelDirKey in enumerate(modelDirKeyList):
                log.info("[CHECK] modelDirKey : {}".format(modelDirKey))

                for i, posInfo in posDataL1.iterrows():
                    posId = int(posInfo['id'])
                    posLat = posInfo['lat']
                    posLon = posInfo['lon']

                    log.info(
                        "[CHECK] posId (posLon, posLat) : {} ({}, {})".format(
                            posId, posLon, posLat))

                    # break
                    inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                        globalVar['outPath'], 'ACT', serviceName, posId,
                        'final', 'proc', 'act')
                    fileList = sorted(glob.glob(inpFile))

                    # skip when no input file is found
                    if fileList is None or len(fileList) < 1:
                        log.error('[ERROR] inpFile : {} / {}'.format(
                            inpFile, 'please check the input data.'))
                        continue

                    fileInfo = fileList[0]
                    inpData = pd.read_excel(fileInfo, engine='openpyxl')

                    # replace physically impossible negative readings with NaN
                    inpData.loc[inpData['CA_TOT'] < 0, 'CA_TOT'] = np.nan
                    inpData.loc[inpData['WS'] < 0, 'WS'] = np.nan
                    inpData.loc[inpData['WD'] < 0, 'WD'] = np.nan
                    inpData.loc[inpData['SWR'] < 0, 'SWR'] = np.nan
                    inpData.loc[inpData['pv'] < 0, 'pv'] = np.nan

                    inpDataL1 = inpData.dropna().reset_index(drop=True)

                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d')].index.to_numpy()[0]
                    idxInfo = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime(
                            '2021-10-30', format='%Y-%m-%d')].index.to_numpy()
                    if (len(idxInfo) < 1): continue
                    idx = idxInfo[0]

                    # 7 days, 15 days, 1 month, 3 months, 6 months, 2 years
                    if (modelDirKey == 'AI_2Y'):
                        # split at the 2021 boundary
                        # trainData, testData = inpDataL1[0:idx], inpDataL1[idx:len(inpDataL1)]

                        # full data set
                        # trainData = inpDataL1

                        # split the data at the 2021 boundary
                        trainData, testData = inpDataL1[0:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    elif (modelDirKey == 'AI_7D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=7)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_15D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=15)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_1M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=1)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_3M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=3)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_6M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=6)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    # changed to the 2021 baseline
                    trainDataL1 = trainData[[
                        'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR',
                        'pv', 'sza', 'aza', 'et'
                    ]]

                    # trainDataL1.describe()

                    # **********************************************************************************************************
                    # Machine learning
                    # **********************************************************************************************************
                    # time series
                    # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63
                    #
                    # saveCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, "trainDataL4")
                    # # trainDataL4.to_csv(saveCsvFile, index=False)
                    # log.info('[CHECK] saveCsvFile : {}'.format(saveCsvFile))
                    #
                    # trainDataL4 = pd.read_csv(saveCsvFile)
                    # trainDataL4.describe()

                    saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'pycaret', 'act', '*')
                    saveMlModelList = sorted(glob.glob(saveMlModel),
                                             reverse=True)

                    # only train when no saved model exists
                    if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                        pyModel = setup(data=trainDataL1,
                                        session_id=123,
                                        silent=True,
                                        target='pv')

                        # automated ML: compare candidate models
                        modelList = compare_models(sort='RMSE', n_select=10)

                        # ensemble model
                        blendModel = blend_models(estimator_list=modelList,
                                                  fold=10)

                        # tune the ensemble
                        tuneModel = tune_model(blendModel,
                                               fold=2,
                                               choose_better=True)
                        log.info("[CHECK] tuneModel : {}".format(tuneModel))

                        # finalized model
                        fnlModel = finalize_model(tuneModel)
                        log.info("[CHECK] fnlModel : {}".format(fnlModel))

                        # save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'pycaret', 'act',
                            datetime.now().strftime("%Y%m%d"))
                        log.info("[CHECK] saveModel : {}".format(saveModel))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        save_model(fnlModel, saveModel)

                    # **********************************************************************************************************
                    # Deep learning
                    # **********************************************************************************************************
                    saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'h2o', 'act', '*')
                    saveDlModelList = sorted(glob.glob(saveDlModel),
                                             reverse=True)

                    # only train when no saved model exists
                    if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):
                        # model-count limit
                        # 10-second limit
                        # dnModel = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=123)
                        dnModel = H2OAutoML(max_models=40,
                                            max_runtime_secs=20000,
                                            balance_classes=True,
                                            seed=123)

                        # trainSet, validSet = np.split(trainDataL1, [int(0.70 * len(trainDataL1))])

                        dnModel.train(x=[
                            'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS',
                            'SWR', 'sza', 'aza', 'et'
                        ],
                                      y='pv',
                                      training_frame=h2o.H2OFrame(trainDataL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSetL1), validation_frame=h2o.H2OFrame(validSetL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2), validation_frame=h2o.H2OFrame(testData))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSet), validation_frame=h2o.H2OFrame(validSet))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2))

                        # save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'act',
                            datetime.now().strftime("%Y%m%d"))
                        log.info("[CHECK] saveModel : {}".format(saveModel))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        h2o.save_model(model=dnModel.get_best_model(),
                                       path=os.path.dirname(saveModel),
                                       filename=os.path.basename(saveModel),
                                       force=True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e

        finally:
            log.info('[END] {}'.format("exec"))
    def exec(self):

        log.info('[START] {}'.format("exec"))

        # h2o.init()
        import pandas as pd

        try:

            if (platform.system() == 'Windows'):

                # option settings
                sysOpt = {
                    # start/end dates
                    'srtDate': '2019-01-01',
                    'endDate': '2021-12-31',
                    'isOverWrite': True
                    # , 'isOverWrite': False
                }

                globalVar['inpPath'] = 'E:/DATA'
                globalVar['outPath'] = 'E:/DATA'

            else:

                # option settings
                sysOpt = {
                    # start/end dates
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # 'isOverWrite': True,
                    'isOverWrite': False
                }

            isDlModelInit = False

            inpPosFile = '{}/{}'.format(globalVar['cfgPath'],
                                        'stnInfo/GA_STN_INFO.xlsx')
            posData = pd.read_excel(inpPosFile, engine='openpyxl')
            posDataL1 = posData[['id', 'lat', 'lon']]

            modelDirKeyList = ['AI_2Y']
            # modelDirKeyList = ['AI_1Y6M']
            # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

            for k, modelDirKey in enumerate(modelDirKeyList):
                log.info("[CHECK] modelDirKey : {}".format(modelDirKey))

                for i, posInfo in posDataL1.iterrows():

                    posId = int(posInfo['id'])
                    posLat = posInfo['lat']
                    posLon = posInfo['lon']

                    # if (not re.search('51', str(posId))): continue
                    # if (not re.search('17', str(posId))): continue
                    # if (re.search('17|50|51|58|60|67|72|81|85|87', str(posId))): continue

                    log.info('[CHECK] posId : {}'.format(posId))

                    # break
                    inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                        globalVar['outPath'], 'FOR', serviceName, posId,
                        'final', 'proc', 'for')
                    fileList = sorted(glob.glob(inpFile))

                    # skip when no input file is found
                    if fileList is None or len(fileList) < 1:
                        log.error('[ERROR] inpFile : {} / {}'.format(
                            inpFile, 'please check the input data.'))
                        continue

                    fileInfo = fileList[0]
                    inpData = pd.read_excel(fileInfo, engine='openpyxl')

                    # replace physically impossible negative readings with NaN
                    inpData.loc[inpData['CA_TOT'] < 0, 'CA_TOT'] = np.nan
                    inpData.loc[inpData['WS'] < 0, 'WS'] = np.nan
                    inpData.loc[inpData['WD'] < 0, 'WD'] = np.nan
                    inpData.loc[inpData['SWR'] < 0, 'SWR'] = np.nan
                    inpData.loc[inpData['pv'] < 0, 'pv'] = np.nan

                    inpDataL1 = inpData.dropna().reset_index(drop=True)
                    inpDataL1 = inpDataL1.sort_values(by=['dtDateKst'], axis=0)

                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d')].index.to_numpy()[0]
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-11-30', format='%Y-%m-%d')].index.to_numpy()
                    idxInfo = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime(
                            '2021-06-01', format='%Y-%m-%d')].index.to_numpy()
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2022-01-01', format='%Y-%m-%d')].index.to_numpy()

                    if (len(idxInfo) < 1): continue
                    idx = idxInfo[0]

                    # 7 days, 15 days, 1 month, 3 months, 6 months, 2 years
                    if (modelDirKey == 'AI_2Y'):
                        # full data set
                        # trainData = inpDataL1

                        # split the data at the 2021 boundary
                        trainData, testData = inpDataL1[0:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    elif (modelDirKey == 'AI_7D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=7)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_15D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=15)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_1M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=1)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_3M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=3)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_6M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=6)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    log.info('[CHECK] len(trainData) : {}'.format(
                        len(trainData)))
                    log.info('[CHECK] len(testData) : {}'.format(
                        len(testData)))
                    log.info('[CHECK] trainData : {} - {}'.format(
                        trainData['dtDateKst'].min(),
                        trainData['dtDateKst'].max()))
                    # log.info('[CHECK] testData : {} - {}'.format(trainData['testData'].min(), trainData['testData'].max()))

                    # trainData['year'] = trainData['dtDateKst'].dt.strftime('%Y').astype('int64')
                    # trainData['month'] = trainData['dtDateKst'].dt.strftime('%m').astype('int64')
                    # trainData['day'] = trainData['dtDateKst'].dt.strftime('%d').astype('int64')

                    # trainDataL1 = trainData[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                    # trainDataL1 = trainData[['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                    # trainDataL1.describe()

                    # trainDataL1 = trainDataL1.loc[(trainDataL1['CA_TOT'] == 0)]

                    # CA_TOT = 0 (total cloud cover)
                    # trainData.info()

                    # trainData['dtDateKst'] = pd.to_datetime(trainData['dtDateKst'])
                    # plt.scatter(trainData['dtDateKst'][0], trainData['CA_TOT'][0])
                    # plt.scatter(trainData['dtDate'], trainData['CA_TOT'])
                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])
                    # plt.scatter(trainData['pv'], trainData['SWR'])
                    # plt.scatter(trainDataL1['CA_TOT'], trainDataL1['SWR'])

                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])

                    log.info('[CHECK] min-max : {} - {}'.format(
                        int(trainData['pv'].min()),
                        int(trainData['pv'].max())))

                    mainTitle = '[{:05d}] {}'.format(
                        posId,
                        'Input-data (power generation) time series using weather-forecast (numerical model) information')
                    saveImg = '{}/{}/{}/{}.png'.format(globalVar['figPath'],
                                                       serviceName,
                                                       modelDirKey, mainTitle)
                    os.makedirs(os.path.dirname(saveImg), exist_ok=True)
                    plt.scatter(trainData['dtDateKst'], trainData['pv'])
                    plt.title('{:05d}'.format(posId))
                    plt.savefig(saveImg, dpi=600, bbox_inches='tight')
                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])
                    # plt.scatter(trainData['dtDateKst'], trainData['sza'])
                    # plt.scatter(trainData['dtDateKst'], trainData['aza'])
                    plt.show()
                    plt.close()

                    # NOTE: this continue short-circuits the loop, so the
                    # training code below never runs as written
                    continue

                    # trainData.plot()
                    # plt.show()
                    # plt.close()
                    # trainDataL1 = trainData[['dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv']]
                    #     # )[['dtDate', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CLD', 'CF', 'SWR', 'pv']]
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CF', 'CLD', 'SWR', 'pv']].dropna()
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'CF', 'CLD', 'SWR', 'pv']]

                    # import pandas as pd
                    # from autofeat import AutoFeatClassifier
                    # from sklearn.model_selection import train_test_split
                    # from sklearn.datasets import load_breast_cancer
                    # from sklearn.linear_model import LogisticRegression
                    # from sklearn.metrics import accuracy_score, confusion_matrix
                    # #
                    # # load_breast_cancer = load_breast_cancer(as_frame=True)
                    # # X = load_breast_cancer.data
                    # # y = load_breast_cancer.target
                    # # trainData, testData
                    # model = AutoFeatClassifier(verbose=1)
                    # X_train_feature_creation = model.fit_transform(trainData, testData)
                    #
                    # import pandas as pd  # base library
                    # # from prophet import Prophet  # Prophet
                    # from neuralprophet import NeuralProphet  # NeuralProphet
                    # from sklearn.metrics import mean_absolute_error  # MAE evaluation metric
                    # from statistics import mean  # mean calculation
                    # import matplotlib.pyplot as plt  # plotting
                    #
                    # df1_nprophet_model = NeuralProphet(seasonality_mode='multiplicative')
                    # df1_nprophet_model_result = df1_nprophet_model.fit(trainData, freq="H")
                    # trainData['ds'] = trainData['dtDateKst']
                    #
                    # import pandas as pd
                    # from pycaret.datasets import get_data
                    # data = get_data('pycaret_downloads')
                    # data['Date'] = pd.to_datetime(data['Date'])
                    # data = data.groupby('Date').sum()
                    # data = data.asfreq('D')
                    # data.head()
                    #
                    # # plot the data
                    # data.plot()
                    # plt.show()
                    #
                    # trainData.drop_duplicates(subset=['dtDateKst'], inplace=True)
                    # trainDataL2 = trainData[['pv']]
                    # trainDataL2.index = trainData['dtDateKst']

                    # import pycaret.classification
                    # from pycaret.time_series import *
                    # from pycaret.internal.pycaret_experiment import TimeSeriesExperiment
                    # pyModel = setup(trainDataL2, fh=7, fold=3, session_id=123)
                    # pyModel = setup(trainData, target = 'Price', fh=7, fold=3, session_id=123)

                    # automated ML: compare candidate models
                    # modelList = compare_models(sort='RMSE', n_select=3)
                    # modelList = compare_models(sort='RMSE')

                    # tuneModel = stack_models(modelList)

                    # ensemble model
                    # blendModel = blend_models(estimator_list=modelList, fold=5)

                    # tune the ensemble
                    # tuneModel = tune_model(modelList, fold=5, choose_better=True)

                    # # finalized model
                    # fnlModel = finalize_model(tuneModel)
                    #
                    # predict_model(fnlModel, fh=90)
                    #
                    # plot_model(fnlModel, plot='forecast', data_kwargs = { 'fh' : 30 })
                    # # plot_model(modelList[0], plot='forecast', data_kwargs = { 'fh' : 30 })
                    # # plot_model(modelList[0], plot='forecast', data_kwargs = { 'fh' : 30 })
                    # plot_model(fnlModel, plot='insample')

                    # **********************************************************************************************************
                    # Machine learning
                    # **********************************************************************************************************
                    # time series
                    # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63
                    #
                    # saveCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, "trainDataL4")
                    # # trainDataL4.to_csv(saveCsvFile, index=False)
                    # log.info('[CHECK] saveCsvFile : {}'.format(saveCsvFile))
                    #
                    # trainDataL4 = pd.read_csv(saveCsvFile)
                    # trainDataL4.describe()

                    saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'pycaret', 'for', '*')
                    saveMlModelList = sorted(glob.glob(saveMlModel),
                                             reverse=True)

                    # only train when no saved model exists
                    # if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                    if (len(saveMlModelList) < 1):
                        pyModel = setup(data=trainDataL1,
                                        session_id=123,
                                        silent=True,
                                        target='pv')

                        # automated ML: compare candidate models
                        modelList = compare_models(sort='RMSE', n_select=3)

                        # ensemble model
                        blendModel = blend_models(estimator_list=modelList,
                                                  fold=10)

                        # tune the ensemble
                        tuneModel = tune_model(blendModel,
                                               fold=10,
                                               choose_better=True)

                        # finalized model
                        fnlModel = finalize_model(tuneModel)

                        # save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'pycaret', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        save_model(fnlModel, saveModel)

                    # **********************************************************************************************************
                    # Deep learning
                    # **********************************************************************************************************
                    saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'h2o', 'for', '*')
                    saveDlModelList = sorted(glob.glob(saveDlModel),
                                             reverse=True)

                    # only train when no saved model exists
                    if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):

                        if not isDlModelInit:
                            h2o.init()
                            isDlModelInit = True

                        # dnModel = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=123)
                        # 2022-03-29
                        # dnModel = H2OAutoML(max_models=2, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel = H2OAutoML(max_models=20,
                                            max_runtime_secs=99999,
                                            balance_classes=True,
                                            seed=123)

                        # java.lang.OutOfMemoryError: Java heap space
                        # dnModel = H2OAutoML(max_models=None, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=30, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel.train(x=[
                            'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS',
                            'SWR', 'sza', 'aza', 'et'
                        ],
                                      y='pv',
                                      training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                        # trainSet, validSet = np.split(trainDataL1, [int(0.70 * len(trainDataL1))])
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSetL1), validation_frame=h2o.H2OFrame(validSetL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2), validation_frame=h2o.H2OFrame(testData))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSet), validation_frame=h2o.H2OFrame(validSet))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2))

                        # save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)

                        # h2o.save_model(model=dnModel.get_best_model(), path=os.path.dirname(saveModel), filename=os.path.basename(saveModel), force=True)
                        dnModel.get_best_model().save_mojo(
                            path=os.path.dirname(saveModel),
                            filename=os.path.basename(saveModel),
                            force=True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
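
A minimal follow-up sketch (an addition, not part of the original snippet): re-loading the MOJO exported above for scoring. `saveModel` is the path written by save_mojo; `newData` is a hypothetical pandas DataFrame with the same feature columns as the training frame.

import h2o

h2o.init()
# import_mojo loads the MOJO artifact back into the running cluster
mojoModel = h2o.import_mojo(saveModel)
# score and pull the predictions back into pandas
predDf = mojoModel.predict(h2o.H2OFrame(newData)).as_data_frame()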
Beispiel #8
0
from pycaret.regression import setup, create_model, tune_model, save_model
import pandas as pd

data = pd.read_csv('C:/tmp/insurance.csv', delimiter=',')
print(data.head())

r2 = setup(data, target='charges', session_id=123,
           normalize=True,
           polynomial_features=True, trigonometry_features=True,
           feature_interaction=True,
           bin_numeric_features=['age', 'bmi'])

lr = create_model('lr')
tuned_lr = tune_model(lr)
save_model(tuned_lr, model_name='./models/lr_deployment_20210521')
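
A hedged usage sketch (an addition, not from the original example): the pipeline saved above can be reloaded and applied to unseen rows. `new_data` is a hypothetical DataFrame with the insurance columns minus 'charges'.

from pycaret.regression import load_model, predict_model

pipeline = load_model('./models/lr_deployment_20210521')  # reads the .pkl written by save_model
preds = predict_model(pipeline, data=new_data)            # appends a 'Label' column of predictions
print(preds['Label'].head())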
Beispiel #9
0
    # Get task
    df_task = pc_dfs.loc[pc_dfs['Dataset'] == optsel_dataset,
                         'Default Task'].tolist()[0]

    # Describe data
    st.write(
        f'This dataset has {df.shape[0]} samples and {df.shape[1]} features. Target variable is {df_target}.'
    )
    st.dataframe(df.head())

    if df_task in ['NLP / Regression', 'Regression']:

        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = regression.setup(data=df, target=df_target)

        # Compare models; compare_models returns an estimator, so the
        # leaderboard grid is fetched separately with pull()
        regression.compare_models()
        st.dataframe(regression.pull())

        # End
        st.success('End of execution!')

    if df_task in ['Classification (Binary)', 'Classification (Multiclass)']:

        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = classification.setup(data=df, target=df_target)

        # Compare models (same note as the regression branch above)
        classification.compare_models()
        st.dataframe(classification.pull())
Beispiel #10
0
def app_main():
    st.title("Machine learning analysis platform")
    if st.sidebar.checkbox('Define Data Source'):
        filesFolder = st.sidebar.text_input('folder', value="data")
        dataList = list_files(filesFolder, 'csv')
        if len(dataList) == 0:
            st.warning('No data set available')
        else:
            file_selected = st.sidebar.selectbox('Select a document', dataList)
            file_selected_path = concat_file_path(filesFolder, file_selected)
            nrows = st.sidebar.number_input('Number of rows', value=-1)
            n_rows_str = 'All' if nrows == -1 else str(nrows)
            st.info(
                f'Selected file: {file_selected_path}, rows read: {n_rows_str}'
            )
    else:
        file_selected_path = None
        nrows = 100
        st.warning('No file is currently selected, please choose one.')
    if st.sidebar.checkbox('Exploratory Analysis'):
        if file_selected_path is not None:
            if st.sidebar.button('Report Generation'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('No file selected, analysis cannot be performed')
    if st.sidebar.checkbox('Modeling'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('Select Task', ML_LIST)
            if task == 'Regression':
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            elif task == 'Classification':
                # NOTE: reuses the regression model list; likely a bug in the source
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('Select Prediction Object',
                                                  cols)
            except BaseException:
                st.sidebar.warning('The data format cannot be read correctly')
                target_col = None

            if target_col is not None and st.sidebar.button('Training Model'):
                if task == 'Regression':
                    st.success('Data preprocessing...')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model...')
                    pc_rg.create_model(model, verbose=False)
                    st.success('Model training is complete.')
                    #pc_rg.finalize_model(model)
                    st.success('Model has been created')
                elif task == 'Classification':
                    st.success('Data preprocessing...')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model...')
                    pc_cl.create_model(model, verbose=False)
                    st.success('Model training is complete.')
                    #pc_cl.finalize_model(model)
                    st.success('Model has been created')

    if st.sidebar.checkbox('View System Log'):
        n_lines = st.sidebar.slider(label='Number of lines',
                                    min_value=3,
                                    max_value=50)
        if st.sidebar.button("Check View"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('System log')
            st.write(logs)
    try:
        # search_runs expects a list of experiment-id strings
        allOfRuns = mlflow.search_runs(experiment_ids=['0'])
    except Exception:
        allOfRuns = []
    if len(allOfRuns) != 0:
        if st.sidebar.checkbox('Preview model'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  --> open mlflow by running on the command line: mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(allOfRuns)
        if st.sidebar.checkbox('Choose a model'):
            selected_run_id = st.sidebar.selectbox(
                'Choose from saved models',
                allOfRuns[allOfRuns['tags.Source'] ==
                          'create_model']['run_id'].tolist())
            selected_run_info = allOfRuns[(
                allOfRuns['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('Forecast data'):
                model_uri = 'runs:/' + selected_run_id + '/model/'
                model_loaded = mlflow.sklearn.load_model(model_uri)
                df = pd.read_csv(file_selected_path, nrows=nrows)
                #st.success('Model prediction. . .')
                pred = model_loaded.predict(df)
                pred_df = pd.DataFrame(pred, columns=['Predictive Data'])
                st.dataframe(pred_df)
                pred_df.plot()
                st.pyplot()
    else:
        st.sidebar.warning('Did not find a trained model')
Beispiel #11
0
    def exec(self):

        log.info('[START] {}'.format("exec"))

        try:

            import pandas as pd

            globalVar['inpPath'] = 'E:/DATA/OUTPUT'
            globalVar['outPath'] = 'E:/DATA/OUTPUT'

            inpCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'],
                                               serviceName, 'TrainData')
            # Save to CSV file
            # umDataL10.to_csv(saveCsvFile, index=False)

            data = pd.read_csv(inpCsvFile)

            # test = trainDataL7.dropna().reset_index(drop=True)

            data = data.drop(['ML', 'DL'], axis=1)
            data['dtDateKst'] = pd.to_datetime(data['dtDateKst'])

            #
            # testL1 = test[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'SWR', 'sza', 'aza', 'et', 'pv']]
            dataL1 = data[[
                'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza',
                'aza', 'et', 'pv'
            ]]

            pyModel = setup(data=dataL1, target='pv', session_id=123)

            try:

                # Automated ML comparison across candidate models
                modelList = compare_models(sort='RMSE', n_select=3)

                # Ensemble model
                blendModel = blend_models(estimator_list=modelList, fold=2)

                # Tune the ensemble
                tuneModel = tune_model(blendModel, fold=2, choose_better=True)

                # Final trained model
                fnlModel = finalize_model(tuneModel)

            except Exception as e:
                log.error("Exception : {}".format(e))

            # evaluate_model(tuneModel)

            # pred_holdout = predict_model(fnlModel)

            # print(fnlModel)

            # Regression visualization
            # plot_model(fnlModel, plot='error')
            # plt.show()

            # NOTE: fnlModel is undefined here if the AutoML block above raised
            mlModel = fnlModel

            # predData = predict_model(fnlModel, data=dataL1)

            # 24.4427

            # check_metric(dataL1['pv'], dataL1['Label'], metric='RMSE')

            # h2o
            h2o.init()
            aml = H2OAutoML(max_models=20,
                            max_runtime_secs=10000,
                            balance_classes=True,
                            seed=1)
            aml.train(x=[
                'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza',
                'aza', 'et'
            ],
                      y='pv',
                      training_frame=h2o.H2OFrame(dataL1),
                      # NOTE: the source validates on the training frame itself
                      validation_frame=h2o.H2OFrame(dataL1))

            dlModel = aml.get_best_model()

            dataL2 = data
            dataL3 = predict_model(mlModel,
                                   data=dataL2).rename({'Label': 'ML'},
                                                       axis='columns')
            dataL3['DL'] = dlModel.predict(
                h2o.H2OFrame(dataL2)).as_data_frame()

            anaTimeList = dataL3['anaTime'].unique()

            for j, anaTimeInfo in enumerate(anaTimeList):

                dataL4 = dataL3.loc[dataL3['anaTime'] == anaTimeInfo].dropna(
                ).reset_index(drop=True)

                mainTitle = '[{}] {}'.format(
                    anaTimeInfo, '48-hour forecast time series using numerical weather prediction data')
                saveImg = '{}/{}/{}.png'.format(globalVar['figPath'],
                                                serviceName, mainTitle)
                makeUserTimeSeriesPlot(pd.to_datetime(dataL4['dtDateKst']),
                                       dataL4['ML'], dataL4['DL'],
                                       dataL4['pv'], 'Forecast (ML)', 'Forecast (DL)',
                                       'Observed (PV)', 'Time (hour)', 'PV output', mainTitle,
                                       saveImg, True)

            mainTitle = '[{}-{}] {}'.format(
                min(anaTimeList), max(anaTimeList),
                'Scatter plot of ML 48-hour forecasts using numerical weather prediction data')
            saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName,
                                            mainTitle)
            makeUserScatterPlot(dataL3['ML'], dataL3['pv'], 'ML forecast', 'Observed',
                                mainTitle, saveImg, 0, 1000, 20, 60, True)

            mainTitle = '[{}-{}] {}'.format(
                min(anaTimeList), max(anaTimeList),
                'Scatter plot of DL 48-hour forecasts using numerical weather prediction data')
            saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName,
                                            mainTitle)
            makeUserScatterPlot(dataL3['DL'], dataL3['pv'], 'DL forecast', 'Observed',
                                mainTitle, saveImg, 0, 1000, 20, 60, True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
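
A small sketch of the commented-out verification step above (an assumption, using scikit-learn rather than pycaret's check_metric); it scores both prediction columns in dataL3 against the observed output.

import numpy as np
from sklearn.metrics import mean_squared_error

# drop rows where either forecast or the observation is missing
scored = dataL3.dropna(subset=['ML', 'DL', 'pv'])
rmseMl = np.sqrt(mean_squared_error(scored['pv'], scored['ML']))
rmseDl = np.sqrt(mean_squared_error(scored['pv'], scored['DL']))
print('RMSE  ML: {:.4f}  DL: {:.4f}'.format(rmseMl, rmseDl))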
Beispiel #12
0
def app_main():
    st.title("Automated machine learning platform")
    if st.sidebar.checkbox('Define data source'):
        file_folder = st.sidebar.text_input('Folder', value="data")
        data_file_list = list_files(file_folder, 'csv')
        if len(data_file_list) == 0:
            st.warning('No dataset is available under this path')
        else:
            file_selected = st.sidebar.selectbox('Select a file', data_file_list)
            file_selected_path = concat_file_path(file_folder, file_selected)
            nrows = st.sidebar.number_input('Number of rows', value=-1)
            n_rows_str = 'all' if nrows == -1 else str(nrows)
            st.info(f'Selected file: {file_selected_path}, rows read: {n_rows_str}')
    else:
        file_selected_path = None
        nrows = 100
        st.warning('No file is currently selected, please choose one.')
    if st.sidebar.checkbox('Exploratory analysis'):
        if file_selected_path is not None:
            if st.sidebar.button('Generate report'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('No file selected; analysis cannot be performed.')

    if st.sidebar.checkbox('Quick modeling'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('Select task', ML_TASK_LIST)
            if task == 'Regression':
                model = st.sidebar.selectbox('Select model', RG_MODEL_LIST)
            elif task == 'Classification':
                # NOTE: reuses the regression model list; likely a bug in the source
                model = st.sidebar.selectbox('Select model', RG_MODEL_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('Select prediction target', cols)
            except BaseException:
                st.sidebar.warning('The data format cannot be read correctly')
                target_col = None

            if target_col is not None and st.sidebar.button('Train model'):
                if task == 'Regression':
                    st.success('Preprocessing data...')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model...')
                    pc_rg.create_model(model, verbose=False)
                    st.success('Model training is complete.')
                    #pc_rg.finalize_model(model)
                    st.success('Model has been created')
                elif task == 'Classification':
                    st.success('Preprocessing data...')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model...')
                    pc_cl.create_model(model, verbose=False)
                    st.success('Model training is complete.')
                    #pc_cl.finalize_model(model)
                    st.success('Model has been created')
    if st.sidebar.checkbox('View system log'):
        n_lines = st.sidebar.slider(label='Number of lines', min_value=3, max_value=50)
        if st.sidebar.button("View"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('System log')
            st.write(logs)
    try:
        # search_runs expects a list of experiment-id strings
        all_runs = mlflow.search_runs(experiment_ids=['0'])
    except Exception:
        all_runs = []
    if len(all_runs) != 0:
        if st.sidebar.checkbox('Preview models'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  --> open mlflow by running on the command line: mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(all_runs)
        if st.sidebar.checkbox('Choose a model'):
            selected_run_id = st.sidebar.selectbox(
                'Choose from saved models',
                all_runs[all_runs['tags.Source'] ==
                         'create_model']['run_id'].tolist())
            selected_run_info = all_runs[(
                all_runs['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('Predict data'):
                model_uri = 'runs:/' + selected_run_id + '/model/'
                model_loaded = mlflow.sklearn.load_model(model_uri)
                df = pd.read_csv(file_selected_path, nrows=nrows)
                #st.success('Model predicting...')
                pred = model_loaded.predict(df)
                pred_df = pd.DataFrame(pred, columns=['Prediction'])
                st.dataframe(pred_df)
                pred_df.plot()
                st.pyplot()
    else:
        st.sidebar.warning('No trained model was found')
Beispiel #13
0
def regression_model(*, y_col, training_set, normalize, test_size, folds,
                     metric, model_name, testing_set, imbalanced, seed,
                     include_models, normalize_method):
    """
    Build a regression model for prediction.

    Parameters
    ----------
    y_col : str
        the name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        if True the dataset will be normalized before training.
    test_size : float
        fraction of the training set held out for testing, in [0.0, 1.0].
    folds : int
        number of folds for cross validation.
    metric : str
        the metric used for evaluating the best model.
    model_name : str
        the name to save the model.
    testing_set : pd.DataFrame
        the external dataset for evaluating the best model.
    imbalanced : bool
        not referenced in this function; presumably kept for interface parity
        with a classification counterpart.
    seed : int
        random number to initialize the process.
    include_models : List
        a list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final regression model

    """
    if not metric:
        metric = 'RMSE'
    setup = pyreg.setup(target=y_col,
                        data=training_set,
                        normalize=normalize,
                        normalize_method=normalize_method,
                        train_size=1 - test_size,
                        fold=folds,
                        silent=True,
                        session_id=seed)
    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv',
                        sep='\t',
                        index=False)
    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)
    final_model = pyreg.finalize_model(reg_tuned_model)
    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)
    pyreg.save_model(final_model, model_name)
    if len(testing_set.index) != 0:
        unseen_predictions = test_regressor(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t',
                                  index=True)
    return final_model
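
A hedged usage sketch for the function above; the Boston housing frame and the train/test split here are illustrative only, and the external-testing branch additionally requires the test_regressor helper referenced in the body.

import pandas as pd
from pycaret.datasets import get_data

boston = get_data('boston')
train_df = boston.sample(frac=0.8, random_state=0)  # 80% for training
test_df = boston.drop(train_df.index)               # remainder as external test set

final = regression_model(y_col='medv',
                         training_set=train_df,
                         normalize=True,
                         test_size=0.2,
                         folds=5,
                         metric='RMSE',
                         model_name='boston_reg',
                         testing_set=test_df,
                         imbalanced=False,
                         seed=123,
                         include_models=['lr', 'ridge', 'rf'],
                         normalize_method='zscore')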
Beispiel #14
0
import pandas as pd
from pyarrow import csv

# Data

train = pd.read_csv('./data/dacon/comp3/train_features.csv',
                    header=0,
                    index_col=0)
target = pd.read_csv('./data/dacon/comp3/train_target.csv',
                     header=0,
                     index_col=0)
test = pd.read_csv('./data/dacon/comp3/test_features.csv',
                   header=0,
                   index_col=0)

from pycaret import regression

exp = regression.setup(data=train, target='S1')
Beispiel #15
0
    X_df = pd.read_csv(os.path.join(PROJ_DIR, 'data/FCC/X.csv'))
    Y_df = pd.read_csv(os.path.join(PROJ_DIR, 'data/FCC/Y.csv'))

    print('Y columns: {}'.format(Y_df.columns))

    # ---- Set up the task --------------------------------------------------------------------------

    x_cols = X_df.columns.tolist()
    y_col = 'gasoline'
    data = pd.concat([X_df, Y_df[y_col]], axis=1).astype(float)  # np.float was removed in NumPy 1.24

    task = setup(
        data,
        target=y_col,
        numeric_features=x_cols,
        verbose=False,
        remove_multicollinearity=False,
        # multicollinearity_threshold = 0.6,
        ignore_low_variance=False,
        silent=True,
        n_jobs=2)

    # ---- Model selection --------------------------------------------------------------------------

    best_model = compare_models(
        include=[
            'rf', 'lightgbm', 'lasso', 'ridge', 'xgboost', 'en', 'knn', 'mlp',
            'lr', 'dt'
        ],
        sort='R2',
        verbose=True,
        fold=3)
Beispiel #16
0
def run_model():
    clf = py.setup(data, target='rent', silent=True)
    rf_model = py.create_model('rf', fold=5, verbose=False)
    model = py.finalize_model(rf_model)
    return model
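
A hedged usage sketch (assumes, as the snippet implies, that py aliases pycaret.regression and data is the loaded training frame; new_listings is a hypothetical DataFrame of unseen rows):

model = run_model()
preds = py.predict_model(model, data=new_listings)  # 'Label' holds the predicted rent
print(preds['Label'].describe())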