def exploratory_experiment(df, target, target_type='R'):
    """Run a quick exploratory AutoML experiment with PyCaret.

    Args:
        df: DataFrame holding the feature and target columns.
        target: name of the dependent (target) variable.
        target_type: 'R' for a continuous target (regression),
            'C' for a categorical target (classification).
    """
    if target_type.upper() == 'R':
        from pycaret.regression import compare_models, setup
    elif target_type.upper() == 'C':
        from pycaret.classification import compare_models, setup
    else:
        raise ValueError("target_type must be 'R' or 'C'")

    setup(df, target)
    compare_models()
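# A minimal usage sketch for exploratory_experiment(), assuming a synthetic
# DataFrame and a stock PyCaret install; the column names are illustrative.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({'x1': rng.random(200), 'x2': rng.random(200)})
demo['y'] = 2.0 * demo['x1'] - demo['x2'] + rng.normal(0, 0.1, 200)

exploratory_experiment(demo, target='y', target_type='R')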
def run_pycaret(name, df_train, df_test, acc_func, target):
    import traceback

    # Map the accuracy function's name onto a PyCaret metric string.
    pycaret_acc_func_str = 'Accuracy'
    for pycaret_metric in ['Accuracy', 'AUC', 'Recall', 'Precision', 'F1',
                           'Kappa', 'MCC', 'MAE', 'MSE', 'RMSE', 'R2',
                           'RMSLE', 'MAPE']:
        if pycaret_metric.lower() in str(acc_func).lower():
            pycaret_acc_func_str = pycaret_metric

    # Regression metrics imply a regression task; everything else is treated
    # as classification.
    task_type = 'classification'
    if pycaret_acc_func_str in ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']:
        task_type = 'regression'
        from pycaret.regression import (setup, compare_models, predict_model,
                                        blend_models, stack_models, automl)
    else:
        from pycaret.classification import (setup, compare_models,
                                            predict_model, blend_models,
                                            stack_models, automl)

    setup(data=df_train, target=target)
    top_models = compare_models(n_select=3, verbose=False,
                                sort=pycaret_acc_func_str, turbo=True,
                                blacklist=['catboost', 'xgboost'])

    # Ensemble the top models; automl() then picks the best candidate
    # (including the blender and stacker) for the chosen metric.
    blender = blend_models(estimator_list=top_models, verbose=False)
    stacker = stack_models(estimator_list=top_models,
                           meta_model=top_models[0], verbose=False)
    best_model = automl(optimize=pycaret_acc_func_str)

    df_test_dropped = df_test.drop(columns=[target])
    predictions = predict_model(best_model, data=df_test_dropped)
    try:
        accuracy = acc_func(list(predictions['Label']), list(df_test[target]))
    except Exception as e:
        traceback.print_exc()
        print(f'Exception computing accuracy (1): {e}')
        # Retry with explicit casts in case of dtype mismatches.
        if task_type == 'classification':
            accuracy = acc_func([str(x) for x in predictions['Label']],
                                [str(x) for x in df_test[target]])
        else:
            accuracy = acc_func([float(x) for x in predictions['Label']],
                                [float(x) for x in df_test[target]])
    return accuracy
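# Hedged usage sketch for run_pycaret(): the frames are synthetic and the
# metric is sklearn's r2_score, whose function name contains 'r2', so the
# metric string resolves to 'R2' and the regression branch is taken.
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

rng = np.random.default_rng(42)
df = pd.DataFrame({'x1': rng.random(300), 'x2': rng.random(300)})
df['y'] = 3.0 * df['x1'] + rng.normal(0, 0.05, 300)

score = run_pycaret('demo', df.iloc[:240], df.iloc[240:],
                    acc_func=r2_score, target='y')
print('R2 on hold-out:', score)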
def exec(self):
    log.info('[START] {}'.format("exec"))
    # h2o.init()

    try:
        if platform.system() == 'Windows':
            # Option settings
            sysOpt = {
                # Start/end date
                'srtDate': '2019-01-01',
                'endDate': '2021-12-31',
                'isOverWrite': True
                # , 'isOverWrite': False
            }
            globalVar['inpPath'] = 'E:/DATA'
            globalVar['outPath'] = 'E:/DATA'
        else:
            # Option settings
            sysOpt = {
                # Start/end date
                'srtDate': globalVar['srtDate'],
                'endDate': globalVar['endDate']
                # , 'isOverWrite': True
                , 'isOverWrite': False
            }

        isDlModelInit = False

        inpPosFile = '{}/{}'.format(globalVar['cfgPath'], 'stnInfo/GA_STN_INFO.xlsx')
        posData = pd.read_excel(inpPosFile, engine='openpyxl')
        posDataL1 = posData[['id', 'lat', 'lon']]

        modelDirKeyList = ['AI_2Y']
        # modelDirKeyList = ['AI_1Y6M']
        # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

        # Offsets for the sliding training windows, relative to 2021-01-01.
        winOffset = {
            'AI_7D': timedelta(days=7),
            'AI_15D': timedelta(days=15),
            'AI_1M': relativedelta(months=1),
            'AI_3M': relativedelta(months=3),
            'AI_6M': relativedelta(months=6),
        }

        for k, modelDirKey in enumerate(modelDirKeyList):
            log.info('[CHECK] modelDirKey : {}'.format(modelDirKey))

            for i, posInfo in posDataL1.iterrows():
                posId = int(posInfo['id'])
                posLat = posInfo['lat']
                posLon = posInfo['lon']

                # Process only station ids containing '17'.
                if not re.search('17', str(posId)): continue
                # if re.search('17|50|51|58|60|67|72|81|85|87', str(posId)): continue
                log.info('[CHECK] posId : {}'.format(posId))

                inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                    globalVar['outPath'], 'FOR', serviceName, posId, 'final', 'proc', 'for')
                fileList = sorted(glob.glob(inpFile))

                # Skip when the input file is missing.
                if fileList is None or len(fileList) < 1:
                    log.error('[ERROR] inpFile : {} / {}'.format(inpFile, 'Please check the input data.'))
                    continue

                fileInfo = fileList[0]
                inpData = pd.read_excel(fileInfo, engine='openpyxl')

                # Mask physically impossible negative values; .loc avoids the
                # chained-assignment pitfall of inpData[col][mask] = ...
                for colName in ['CA_TOT', 'WS', 'WD', 'SWR', 'pv']:
                    inpData.loc[inpData[colName] < 0, colName] = np.nan

                inpDataL1 = inpData.dropna().reset_index(drop=True)
                inpDataL1 = inpDataL1.sort_values(by=['dtDateKst'], axis=0)

                # Split point (earlier runs used 2021-01-01, 2021-11-30, 2022-01-01).
                idxInfo = inpDataL1.loc[
                    inpDataL1['dtDateKst'] >= pd.to_datetime('2021-06-01', format='%Y-%m-%d')
                ].index.to_numpy()
                if len(idxInfo) < 1: continue
                idx = idxInfo[0]

                # Training window: 7 days, 15 days, 1/3/6 months, or 2 years (full history).
                if modelDirKey == 'AI_2Y':
                    trainData, testData = inpDataL1[0:idx], inpDataL1[idx:len(inpDataL1)]
                else:
                    srtIdx = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d') - winOffset[modelDirKey]
                    ].index.to_numpy()[0]
                    trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[idx:len(inpDataL1)]

                log.info('[CHECK] len(trainData) : {}'.format(len(trainData)))
                log.info('[CHECK] len(testData) : {}'.format(len(testData)))
                log.info('[CHECK] trainData : {} - {}'.format(trainData['dtDateKst'].min(), trainData['dtDateKst'].max()))

                trainDataL1 = trainData[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]

                # Exploratory variants kept in the original as commented-out code:
                # calendar features (year/month/day/hour), filtering to
                # CA_TOT == 0 (total cloud cover), scatter checks, autofeat
                # feature generation, and NeuralProphet fitting.

                # ******************************************************************
                # TEST
                # ******************************************************************
                plt.scatter(trainData['dtDateKst'], trainData['pv'])
                plt.show()

                from pmdarima import auto_arima
                import statsmodels.tsa.api as tsa
                import statsmodels.api as sm

                # Seasonal ARIMA search with SWR as an exogenous regressor.
                sxmodel = auto_arima(trainData[['pv']], exogenous=trainData[['SWR']],
                                     start_p=1, start_q=1, test='adf',
                                     max_p=3, max_q=3, m=12, start_P=0,
                                     seasonal=True, d=None, D=1, trace=True,
                                     error_action='ignore',
                                     suppress_warnings=True, stepwise=True)

                trainData.index = trainData['dtDateKst']

                # Also tried and commented out in the original: a pmdarima
                # wineind train/test split demo and darts baselines
                # (NaiveSeasonal, NaiveDrift, Prophet, ExponentialSmoothing,
                # ARIMA, AutoARIMA, Theta, FFT; StandardRegressionModel needs
                # train_n_points at initialization).

                # ******************************************************************
                # Machine learning
                # ******************************************************************
                # Time-series reference:
                # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63

                saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                    globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'for', '*')
                saveMlModelList = sorted(glob.glob(saveMlModel), reverse=True)

                # Train only when no saved model exists.
                # if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                if len(saveMlModelList) < 1:
                    pyModel = setup(data=trainDataL1, session_id=123, silent=True, target='pv')

                    # AutoML across the candidate models.
                    modelList = compare_models(sort='RMSE', n_select=3)

                    # Ensemble model.
                    blendModel = blend_models(estimator_list=modelList, fold=10)

                    # Tune the ensemble.
                    tuneModel = tune_model(blendModel, fold=10, choose_better=True)

                    # Final model.
                    fnlModel = finalize_model(tuneModel)

                    # Save the trained model.
                    saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'for',
                        datetime.now().strftime('%Y%m%d'))
                    os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                    save_model(fnlModel, saveModel)

                # ******************************************************************
                # Deep learning
                # ******************************************************************
                saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                    globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'for', '*')
                saveDlModelList = sorted(glob.glob(saveDlModel), reverse=True)

                # Train when overwriting is enabled or no saved model exists.
                if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):
                    if not isDlModelInit:
                        h2o.init()
                        isDlModelInit = True

                    # Earlier runs tried max_models between 2 and 40 and various
                    # runtime caps; max_models=None ran into
                    # java.lang.OutOfMemoryError: Java heap space.
                    dnModel = H2OAutoML(max_models=20, max_runtime_secs=99999,
                                        balance_classes=True, seed=123)

                    dnModel.train(
                        x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'],
                        y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                    # Save the trained model as a MOJO artifact.
                    saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'for',
                        datetime.now().strftime('%Y%m%d'))
                    os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                    # h2o.save_model(model=dnModel.get_best_model(), path=os.path.dirname(saveModel), filename=os.path.basename(saveModel), force=True)
                    dnModel.get_best_model().save_mojo(
                        path=os.path.dirname(saveModel),
                        filename=os.path.basename(saveModel), force=True)

    except Exception as e:
        log.error('Exception : {}'.format(e))
        raise e

    finally:
        log.info('[END] {}'.format("exec"))
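# Hedged sketch (not part of the pipeline above): reloading a saved MOJO
# artifact for offline scoring. The file path and the feature values are
# placeholders; h2o.import_mojo() returns a generic model supporting predict().
import h2o
import pandas as pd

h2o.init()
mojoPath = 'E:/DATA/AI_2Y/SRV00017-final-h2o-for-20220601.model'  # hypothetical path
mojoModel = h2o.import_mojo(mojoPath)

newObs = pd.DataFrame([{'CA_TOT': 0, 'HM': 55.0, 'PA': 1012.0, 'TA': 21.0,
                        'TD': 12.0, 'WD': 180.0, 'WS': 2.5, 'SWR': 650.0,
                        'sza': 35.0, 'aza': 160.0, 'et': 900.0}])
print(mojoModel.predict(h2o.H2OFrame(newObs)).as_data_frame())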
def exec(self):
    log.info('[START] {}'.format("exec"))

    h2o.init()

    try:
        if platform.system() == 'Windows':
            # Option settings
            sysOpt = {
                # Start/end date
                'srtDate': '2021-10-01',
                'endDate': '2021-11-01',
                'isOverWrite': True
                # , 'isOverWrite': False
            }
            globalVar['inpPath'] = 'E:/DATA'
            globalVar['outPath'] = 'E:/DATA'
        else:
            # Option settings
            sysOpt = {
                # Start/end date
                'srtDate': globalVar['srtDate'],
                'endDate': globalVar['endDate']
                # , 'isOverWrite': True
                , 'isOverWrite': False
            }

        inpPosFile = '{}/{}'.format(globalVar['cfgPath'], 'stnInfo/GA_STN_INFO.xlsx')
        posData = pd.read_excel(inpPosFile, engine='openpyxl')
        posDataL1 = posData[['id', 'lat', 'lon']]

        modelDirKeyList = ['AI_2Y']
        # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

        # Offsets for the sliding training windows, relative to 2021-01-01.
        winOffset = {
            'AI_7D': timedelta(days=7),
            'AI_15D': timedelta(days=15),
            'AI_1M': relativedelta(months=1),
            'AI_3M': relativedelta(months=3),
            'AI_6M': relativedelta(months=6),
        }

        for k, modelDirKey in enumerate(modelDirKeyList):
            log.info('[CHECK] modelDirKey : {}'.format(modelDirKey))

            for i, posInfo in posDataL1.iterrows():
                posId = int(posInfo['id'])
                posLat = posInfo['lat']
                posLon = posInfo['lon']
                log.info('[CHECK] posId (posLon, posLat) : {} ({}, {})'.format(posId, posLon, posLat))

                inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                    globalVar['outPath'], 'ACT', serviceName, posId, 'final', 'proc', 'act')
                fileList = sorted(glob.glob(inpFile))

                # Skip when the input file is missing.
                if fileList is None or len(fileList) < 1:
                    log.error('[ERROR] inpFile : {} / {}'.format(inpFile, 'Please check the input data.'))
                    continue

                fileInfo = fileList[0]
                inpData = pd.read_excel(fileInfo, engine='openpyxl')

                # Mask physically impossible negative values (use .loc, not
                # chained indexing).
                for colName in ['CA_TOT', 'WS', 'WD', 'SWR', 'pv']:
                    inpData.loc[inpData[colName] < 0, colName] = np.nan

                inpDataL1 = inpData.dropna().reset_index(drop=True)

                # Split point (an earlier run used 2021-01-01).
                idxInfo = inpDataL1.loc[
                    inpDataL1['dtDateKst'] >= pd.to_datetime('2021-10-30', format='%Y-%m-%d')
                ].index.to_numpy()
                if len(idxInfo) < 1: continue
                idx = idxInfo[0]

                # Training window: 7 days, 15 days, 1/3/6 months, or 2 years (full history).
                if modelDirKey == 'AI_2Y':
                    trainData, testData = inpDataL1[0:idx], inpDataL1[idx:len(inpDataL1)]
                else:
                    srtIdx = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d') - winOffset[modelDirKey]
                    ].index.to_numpy()[0]
                    trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[idx:len(inpDataL1)]

                trainDataL1 = trainData[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                # trainDataL1.describe()

                # ******************************************************************
                # Machine learning
                # ******************************************************************
                # Time-series reference:
                # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63

                saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                    globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'act', '*')
                saveMlModelList = sorted(glob.glob(saveMlModel), reverse=True)

                # Train when overwriting is enabled or no saved model exists.
                if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                    pyModel = setup(data=trainDataL1, session_id=123, silent=True, target='pv')

                    # AutoML across the candidate models.
                    modelList = compare_models(sort='RMSE', n_select=10)

                    # Ensemble model.
                    blendModel = blend_models(estimator_list=modelList, fold=10)

                    # Tune the ensemble.
                    tuneModel = tune_model(blendModel, fold=2, choose_better=True)
                    log.info('[CHECK] tuneModel : {}'.format(tuneModel))

                    # Final model.
                    fnlModel = finalize_model(tuneModel)
                    log.info('[CHECK] fnlModel : {}'.format(fnlModel))

                    # Save the trained model.
                    saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'act',
                        datetime.now().strftime('%Y%m%d'))
                    log.info('[CHECK] saveModel : {}'.format(saveModel))
                    os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                    save_model(fnlModel, saveModel)

                # ******************************************************************
                # Deep learning
                # ******************************************************************
                saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                    globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'act', '*')
                saveDlModelList = sorted(glob.glob(saveDlModel), reverse=True)

                # Train when overwriting is enabled or no saved model exists.
                if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):
                    # An earlier run capped at max_models=20 / 10000 s.
                    dnModel = H2OAutoML(max_models=40, max_runtime_secs=20000,
                                        balance_classes=True, seed=123)

                    dnModel.train(
                        x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'],
                        y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                    # Save the trained model.
                    saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'act',
                        datetime.now().strftime('%Y%m%d'))
                    log.info('[CHECK] saveModel : {}'.format(saveModel))
                    os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                    h2o.save_model(model=dnModel.get_best_model(),
                                   path=os.path.dirname(saveModel),
                                   filename=os.path.basename(saveModel), force=True)

    except Exception as e:
        log.error('Exception : {}'.format(e))
        raise e

    finally:
        log.info('[END] {}'.format("exec"))
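# Hedged sketch: reloading the saved PyCaret pipeline for later scoring.
# load_model() takes the save path without the '.pkl' suffix; the path and
# the input frame below are placeholders.
from pycaret.regression import load_model, predict_model
import pandas as pd

mlModel = load_model('E:/DATA/AI_2Y/SRV00017-final-pycaret-act-20211101.model')  # hypothetical path
newObs = pd.DataFrame([{'CA_TOT': 0, 'HM': 55.0, 'PA': 1012.0, 'TA': 21.0,
                        'TD': 12.0, 'WD': 180.0, 'WS': 2.5, 'SWR': 650.0,
                        'sza': 35.0, 'aza': 160.0, 'et': 900.0}])
print(predict_model(mlModel, data=newObs)['Label'])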
def exec(self):
    log.info('[START] {}'.format("exec"))
    # h2o.init()

    import pandas as pd

    try:
        if platform.system() == 'Windows':
            # Option settings
            sysOpt = {
                # Start/end date
                'srtDate': '2019-01-01',
                'endDate': '2021-12-31',
                'isOverWrite': True
                # , 'isOverWrite': False
            }
            globalVar['inpPath'] = 'E:/DATA'
            globalVar['outPath'] = 'E:/DATA'
        else:
            # Option settings
            sysOpt = {
                # Start/end date
                'srtDate': globalVar['srtDate'],
                'endDate': globalVar['endDate']
                # , 'isOverWrite': True
                , 'isOverWrite': False
            }

        isDlModelInit = False

        inpPosFile = '{}/{}'.format(globalVar['cfgPath'], 'stnInfo/GA_STN_INFO.xlsx')
        posData = pd.read_excel(inpPosFile, engine='openpyxl')
        posDataL1 = posData[['id', 'lat', 'lon']]

        modelDirKeyList = ['AI_2Y']
        # modelDirKeyList = ['AI_1Y6M']
        # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

        # Offsets for the sliding training windows, relative to 2021-01-01.
        winOffset = {
            'AI_7D': timedelta(days=7),
            'AI_15D': timedelta(days=15),
            'AI_1M': relativedelta(months=1),
            'AI_3M': relativedelta(months=3),
            'AI_6M': relativedelta(months=6),
        }

        for k, modelDirKey in enumerate(modelDirKeyList):
            log.info('[CHECK] modelDirKey : {}'.format(modelDirKey))

            for i, posInfo in posDataL1.iterrows():
                posId = int(posInfo['id'])
                posLat = posInfo['lat']
                posLon = posInfo['lon']

                # if not re.search('51', str(posId)): continue
                # if not re.search('17', str(posId)): continue
                # if re.search('17|50|51|58|60|67|72|81|85|87', str(posId)): continue
                log.info('[CHECK] posId : {}'.format(posId))

                inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                    globalVar['outPath'], 'FOR', serviceName, posId, 'final', 'proc', 'for')
                fileList = sorted(glob.glob(inpFile))

                # Skip when the input file is missing.
                if fileList is None or len(fileList) < 1:
                    log.error('[ERROR] inpFile : {} / {}'.format(inpFile, 'Please check the input data.'))
                    continue

                fileInfo = fileList[0]
                inpData = pd.read_excel(fileInfo, engine='openpyxl')

                # Mask physically impossible negative values (use .loc, not
                # chained indexing).
                for colName in ['CA_TOT', 'WS', 'WD', 'SWR', 'pv']:
                    inpData.loc[inpData[colName] < 0, colName] = np.nan

                inpDataL1 = inpData.dropna().reset_index(drop=True)
                inpDataL1 = inpDataL1.sort_values(by=['dtDateKst'], axis=0)

                # Split point (earlier runs used 2021-01-01, 2021-11-30, 2022-01-01).
                idxInfo = inpDataL1.loc[
                    inpDataL1['dtDateKst'] >= pd.to_datetime('2021-06-01', format='%Y-%m-%d')
                ].index.to_numpy()
                if len(idxInfo) < 1: continue
                idx = idxInfo[0]

                # Training window: 7 days, 15 days, 1/3/6 months, or 2 years (full history).
                if modelDirKey == 'AI_2Y':
                    trainData, testData = inpDataL1[0:idx], inpDataL1[idx:len(inpDataL1)]
                else:
                    srtIdx = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d') - winOffset[modelDirKey]
                    ].index.to_numpy()[0]
                    trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[idx:len(inpDataL1)]

                log.info('[CHECK] len(trainData) : {}'.format(len(trainData)))
                log.info('[CHECK] len(testData) : {}'.format(len(testData)))
                log.info('[CHECK] trainData : {} - {}'.format(trainData['dtDateKst'].min(), trainData['dtDateKst'].max()))

                # Kept commented out in the original: calendar features
                # (year/month/day/hour), feature-column selection, filtering to
                # CA_TOT == 0 (total cloud cover), and various scatter checks.
                # trainDataL1 = trainData[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]

                log.info('[CHECK] min-max : {} - {}'.format(int(trainData['pv'].min()), int(trainData['pv'].max())))

                mainTitle = '[{:05d}] {}'.format(posId, 'Time series of input PV generation based on NWP forecast data')
                saveImg = '{}/{}/{}/{}.png'.format(globalVar['figPath'], serviceName, modelDirKey, mainTitle)
                os.makedirs(os.path.dirname(saveImg), exist_ok=True)

                plt.scatter(trainData['dtDateKst'], trainData['pv'])
                plt.title('{:05d}'.format(posId))
                plt.savefig(saveImg, dpi=600, bbox_inches='tight')
                plt.show()
                plt.close()

                # NOTE: this variant only produces the plot; the continue below
                # short-circuits the loop, so the ML/DL training blocks that
                # follow are unreachable here (and trainDataL1 stays undefined).
                continue

                # Further exploratory alternatives kept commented out in the
                # original: autofeat feature generation, NeuralProphet fitting,
                # and a PyCaret time-series experiment (setup / compare_models /
                # blend_models / tune_model / finalize_model, forecast plots).

                # ******************************************************************
                # Machine learning
                # ******************************************************************
                # Time-series reference:
                # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63

                saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                    globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'for', '*')
                saveMlModelList = sorted(glob.glob(saveMlModel), reverse=True)

                # Train only when no saved model exists.
                # if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                if len(saveMlModelList) < 1:
                    pyModel = setup(data=trainDataL1, session_id=123, silent=True, target='pv')

                    # AutoML across the candidate models.
                    modelList = compare_models(sort='RMSE', n_select=3)

                    # Ensemble model.
                    blendModel = blend_models(estimator_list=modelList, fold=10)

                    # Tune the ensemble.
                    tuneModel = tune_model(blendModel, fold=10, choose_better=True)

                    # Final model.
                    fnlModel = finalize_model(tuneModel)

                    # Save the trained model.
                    saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'for',
                        datetime.now().strftime('%Y%m%d'))
                    os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                    save_model(fnlModel, saveModel)

                # ******************************************************************
                # Deep learning
                # ******************************************************************
                saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                    globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'for', '*')
                saveDlModelList = sorted(glob.glob(saveDlModel), reverse=True)

                # Train when overwriting is enabled or no saved model exists.
                if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):
                    if not isDlModelInit:
                        h2o.init()
                        isDlModelInit = True

                    # Earlier runs tried max_models between 2 and 40 and various
                    # runtime caps; max_models=None ran into
                    # java.lang.OutOfMemoryError: Java heap space.
                    dnModel = H2OAutoML(max_models=20, max_runtime_secs=99999,
                                        balance_classes=True, seed=123)

                    dnModel.train(
                        x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'],
                        y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                    # Save the trained model as a MOJO artifact.
                    saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'for',
                        datetime.now().strftime('%Y%m%d'))
                    os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                    # h2o.save_model(model=dnModel.get_best_model(), path=os.path.dirname(saveModel), filename=os.path.basename(saveModel), force=True)
                    dnModel.get_best_model().save_mojo(
                        path=os.path.dirname(saveModel),
                        filename=os.path.basename(saveModel), force=True)

    except Exception as e:
        log.error('Exception : {}'.format(e))
        raise e

    finally:
        log.info('[END] {}'.format("exec"))
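# The three exec() variants above duplicate the same cleaning and window-split
# logic. A hedged refactoring sketch that could replace those blocks; the
# helper name, defaults, and column list mirror the originals but are
# illustrative only.
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import numpy as np
import pandas as pd

def clean_and_split(df, split_date, window=None, base_date='2021-01-01'):
    """Mask negative sensor values, drop NA rows, and split train/test at split_date.

    window: optional timedelta/relativedelta limiting the training window,
    measured backwards from base_date (as in the AI_7D ... AI_6M keys).
    """
    df = df.copy()
    for col in ['CA_TOT', 'WS', 'WD', 'SWR', 'pv']:
        df.loc[df[col] < 0, col] = np.nan
    df = df.dropna().sort_values('dtDateKst').reset_index(drop=True)

    split = pd.to_datetime(split_date)
    srt = pd.to_datetime(base_date) - window if window is not None else df['dtDateKst'].min()
    train = df[(df['dtDateKst'] >= srt) & (df['dtDateKst'] < split)]
    test = df[df['dtDateKst'] >= split]
    return train, test

# e.g. trainData, testData = clean_and_split(inpData, '2021-06-01', window=relativedelta(months=3))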
                     'Default Task'].tolist()[0]

# Describe the data
st.write(
    f'This dataset has {df.shape[0]} samples and {df.shape[1]} features. '
    f'The target variable is {df_target}.')
st.dataframe(df.head())

if df_task in ['NLP / Regression', 'Regression']:
    # Set up PyCaret
    with st.spinner('PyCaret setup is running...'):
        pycset = regression.setup(data=df, target=df_target)
    # Compare models; compare_models() returns the best estimator, so the
    # leaderboard grid is fetched via pull() for display.
    regression.compare_models()
    st.dataframe(regression.pull())
    # End
    st.success('End of execution!')

if df_task in ['Classification (Binary)', 'Classification (Multiclass)']:
    # Set up PyCaret
    with st.spinner('PyCaret setup is running...'):
        pycset = classification.setup(data=df, target=df_target)
    # Compare models
    classification.compare_models()
    st.dataframe(classification.pull())
    # End
    st.success('End of execution!')
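# Hedged sketch of the kind of lookup that likely precedes the truncated first
# line above; the 'datasets' index frame, 'choice' variable, and column names
# are hypothetical reconstructions, shown only to make the fragment readable.
# df_target = datasets.loc[datasets['Dataset'] == choice, 'Default Target'].tolist()[0]
# df_task = datasets.loc[datasets['Dataset'] == choice, 'Default Task'].tolist()[0]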
def exec(self):
    log.info('[START] {}'.format("exec"))

    try:
        import pandas as pd

        globalVar['inpPath'] = 'E:/DATA/OUTPUT'
        globalVar['outPath'] = 'E:/DATA/OUTPUT'

        inpCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, 'TrainData')
        data = pd.read_csv(inpCsvFile)

        data = data.drop(['ML', 'DL'], axis=1)
        data['dtDateKst'] = pd.to_datetime(data['dtDateKst'])

        dataL1 = data[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et', 'pv']]

        pyModel = setup(data=dataL1, target='pv', session_id=123)

        try:
            # AutoML across the candidate models.
            modelList = compare_models(sort='RMSE', n_select=3)

            # Ensemble model.
            blendModel = blend_models(estimator_list=modelList, fold=2)

            # Tune the ensemble.
            tuneModel = tune_model(blendModel, fold=2, choose_better=True)

            # Final model.
            fnlModel = finalize_model(tuneModel)
        except Exception as e:
            log.error('Exception : {}'.format(e))

        # evaluate_model(tuneModel)
        # pred_holdout = predict_model(fnlModel)
        # Regression diagnostics:
        # plot_model(fnlModel, plot='error')
        # plt.show()

        # NOTE: if the try block above failed, fnlModel is undefined here.
        mlModel = fnlModel

        # H2O AutoML
        h2o.init()
        aml = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=1)
        aml.train(
            x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'],
            y='pv',
            training_frame=h2o.H2OFrame(dataL1),
            validation_frame=h2o.H2OFrame(dataL1))
        dlModel = aml.get_best_model()

        dataL2 = data
        dataL3 = predict_model(mlModel, data=dataL2).rename({'Label': 'ML'}, axis='columns')
        dataL3['DL'] = dlModel.predict(h2o.H2OFrame(dataL2)).as_data_frame()

        anaTimeList = dataL3['anaTime'].unique()
        for j, anaTimeInfo in enumerate(anaTimeList):
            dataL4 = dataL3.loc[dataL3['anaTime'] == anaTimeInfo].dropna().reset_index(drop=True)

            mainTitle = '[{}] {}'.format(anaTimeInfo, '48-hour forecast time series based on NWP forecast data')
            saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
            makeUserTimeSeriesPlot(pd.to_datetime(dataL4['dtDateKst']),
                                   dataL4['ML'], dataL4['DL'], dataL4['pv'],
                                   'Forecast (ML)', 'Forecast (DL)', 'Observed (PV)',
                                   'Time (hour)', 'PV generation', mainTitle, saveImg, True)

        mainTitle = '[{}-{}] {}'.format(min(anaTimeList), max(anaTimeList),
                                        'Machine-learning (48-hour forecast) scatter plot based on NWP forecast data')
        saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
        makeUserScatterPlot(dataL3['ML'], dataL3['pv'], 'ML', 'Observed',
                            mainTitle, saveImg, 0, 1000, 20, 60, True)

        mainTitle = '[{}-{}] {}'.format(min(anaTimeList), max(anaTimeList),
                                        'Deep-learning (48-hour forecast) scatter plot based on NWP forecast data')
        saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
        makeUserScatterPlot(dataL3['DL'], dataL3['pv'], 'DL', 'Observed',
                            mainTitle, saveImg, 0, 1000, 20, 60, True)

    except Exception as e:
        log.error('Exception : {}'.format(e))
        raise e

    finally:
        log.info('[END] {}'.format("exec"))
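# Hedged follow-up sketch: the exec() above only plots the ML/DL columns; a
# small helper (names assumed, and the module-level `log` is presumed to
# exist) that would also quantify them against the observed pv with RMSE.
import numpy as np
from sklearn.metrics import mean_squared_error

def report_rmse(df, predCols=('ML', 'DL'), obsCol='pv'):
    """Log the RMSE of each prediction column against the observed column."""
    for colName in predCols:
        rmse = np.sqrt(mean_squared_error(df[obsCol], df[colName]))
        log.info('[CHECK] RMSE ({}) : {:.2f}'.format(colName, rmse))

# e.g. report_rmse(dataL3.dropna(subset=['ML', 'DL', 'pv']))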
def regression_model(*, y_col, training_set, normalize, test_size, folds,
                     metric, model_name, testing_set, imbalanced, seed,
                     include_models, normalize_method):
    """
    Build a regression model for prediction.

    Parameters
    ----------
    y_col : str
        Name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        If True, the dataset is normalized before training.
    test_size : float
        Fraction in [0.0, 1.0] held out as the test split within the
        training set.
    folds : int
        Number of folds for cross-validation.
    metric : str
        Metric used for selecting the best model (defaults to 'RMSE').
    model_name : str
        Name under which the model and report files are saved.
    testing_set : pd.DataFrame
        External dataset for evaluating the best model; skipped if empty.
    imbalanced : bool
        Accepted for API symmetry; not used by the regression flow.
    seed : int
        Random seed to initialize the process.
    include_models : list
        Models to include in the comparison.
    normalize_method : str
        Method used for normalizing the data.

    Returns
    -------
    Final regression model.
    """
    if not metric:
        metric = 'RMSE'

    pyreg.setup(target=y_col, data=training_set, normalize=normalize,
                normalize_method=normalize_method, train_size=1 - test_size,
                fold=folds, silent=True, session_id=seed)

    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv', sep='\t', index=False)

    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)

    final_model = pyreg.finalize_model(reg_tuned_model)
    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)
    pyreg.save_model(final_model, model_name)

    if len(testing_set.index) != 0:
        unseen_predictions = test_regressor(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t', index=True)

    return final_model
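# Hedged usage sketch for regression_model(); the diabetes frame and the
# parameter choices are illustrative. An empty testing_set skips the external
# evaluation branch.
import pandas as pd
from sklearn.datasets import load_diabetes

frame = load_diabetes(as_frame=True).frame  # features plus a 'target' column
model = regression_model(
    y_col='target', training_set=frame, normalize=True, test_size=0.2,
    folds=5, metric='RMSE', model_name='diabetes_reg',
    testing_set=pd.DataFrame(), imbalanced=False, seed=123,
    include_models=['lr', 'ridge', 'rf'], normalize_method='zscore')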
    target=y_col,
    numeric_features=x_cols,
    verbose=False,
    remove_multicollinearity=False,
    # multicollinearity_threshold=0.6,
    ignore_low_variance=False,
    silent=True,
    n_jobs=2)

# ---- Model selection ----------------------------------------------------------------------------
best_model = compare_models(
    include=['rf', 'lightgbm', 'lasso', 'ridge', 'xgboost',
             'en', 'knn', 'mlp', 'lr', 'dt'],
    sort='R2',
    verbose=True,
    fold=3,
    round=5,
)

# ---- Hyperparameter tuning ----------------------------------------------------------------------
# Initialize the model with fixed parameters.
params = {'max_features': 'auto'}
rgsr = create_model('rf', verbose=False, **params)

# Tune the model.
params4tuning = {
    "n_estimators": np.arange(30, 250, 30),
    "min_samples_leaf": [10, 15, 20, 30, 40, 50],
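# Hedged continuation sketch: the grid above is truncated, but params4tuning
# would typically feed tune_model() as a custom grid; the remaining grid
# entries and the call below are assumptions, not the original code.
#     "max_depth": [3, 5, 10, None],
# }
# tuned_rgsr = tune_model(rgsr, custom_grid=params4tuning,
#                         optimize='R2', fold=3, choose_better=True)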