import datetime

import pandas as pd
import pyperclip
import sklearn as sk
import sklearn.model_selection  # exposes sk.model_selection used below
from flask_restful import reqparse
from joblib import dump, load

import settings
import data_loader as dl  # module path assumed; provides split_x_y / sliding_window_transform / model pickers


def make_model_elec(house_no):
    # last 28 days of daily usage for this household
    today = datetime.datetime.now().strftime('%Y%m%d')
    sql = f"""
        SELECT *
        FROM AH_USAGE_DAILY_PREDICT
        WHERE 1=1
        AND HOUSE_NO = {house_no}
        AND USE_DATE >= DATE_FORMAT(
                DATE_ADD(STR_TO_DATE('{today}', '%Y%m%d'), INTERVAL -28 DAY),
                '%Y%m%d')"""
    df = pd.read_sql(sql, con=settings.conn)

    # treat missing daily usage as zero
    df.loc[df.use_energy_daily.isnull(), 'use_energy_daily'] = 0

    # the series is both feature and target: 7-day windows predict the next day
    x, y = dl.split_x_y(df, x_col='use_energy_daily', y_col='use_energy_daily')
    x, y = dl.sliding_window_transform(x, y, step_size=7, lag=0)
    x = x[6:-1]  # first complete 7-day window ends at index 6
    y = y[7:]    # targets start on day 8

    model, param = dl.select_regression_model('linear regression')
    gs = sk.model_selection.GridSearchCV(estimator=model, param_grid=param,
                                         cv=5, n_jobs=-1)
    gs.fit(x, y)
    dump(gs, f'./sample_data/joblib/usage_daily/{house_no}.joblib')
    return gs.best_score_
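# --- illustrative sketch (not part of the original module) -------------------
# The window/target alignment above (`x = x[6:-1]`, `y = y[7:]`) is easiest to
# see on a toy series. This assumes `sliding_window_transform(step_size=7,
# lag=0)` yields one row per 7-value window; the `dl` helpers are not shown,
# so the idea is redone here with plain NumPy.
def _demo_window_alignment():
    import numpy as np
    series = np.arange(10, dtype=float)  # toy daily-usage series: day 0..9
    step_size = 7
    # one row per consecutive 7-day window: window i covers days i..i+6
    windows = np.stack([series[i:i + step_size]
                        for i in range(len(series) - step_size + 1)])
    x = windows[:-1]        # the window ending on day i+6 ...
    y = series[step_size:]  # ... predicts day i+7
    assert len(x) == len(y)
    print(x[0], '->', y[0])  # [0. 1. 2. 3. 4. 5. 6.] -> 7.0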
def make_model_status(device_id, lag=10):
    # minute-level log rows for this device, restricted to days that carry
    # at least one labeled APPLIANCE_STATUS value
    sql = f"""
        SELECT GATEWAY_ID
             , DEVICE_ID
             , COLLECT_DATE
             , COLLECT_TIME
             , QUALITY
             , ONOFF
             , ENERGY
             , ENERGY_DIFF
             , CASE WHEN APPLIANCE_STATUS IS NULL THEN 0
                    ELSE APPLIANCE_STATUS END APPLIANCE_STATUS
             , CREATE_DATE
        FROM AH_USE_LOG_BYMINUTE
        WHERE 1 = 1
        AND DEVICE_ID = '{device_id}'
        AND COLLECT_DATE IN (
            SELECT t1.COLLECT_DATE
            FROM (SELECT COLLECT_DATE
                       , SUM(APPLIANCE_STATUS) APPLIANCE_STATUS_SUM
                  FROM AH_USE_LOG_BYMINUTE
                  GROUP BY COLLECT_DATE) t1
            WHERE 1=1
            AND t1.APPLIANCE_STATUS_SUM IS NOT NULL)"""
    pyperclip.copy(sql)  # copy the query to the clipboard for manual debugging

    with settings.open_db_connection() as conn:
        df = pd.read_sql(sql, con=conn)
    print(df.head())

    # classify appliance status from the minute-to-minute energy delta
    x, y = dl.split_x_y(df, x_col='ENERGY_DIFF', y_col='APPLIANCE_STATUS')
    x, y = dl.sliding_window_transform(x, y, lag=lag, step_size=30)

    model, params = dl.select_classification_model('random forest')
    gs = sk.model_selection.GridSearchCV(estimator=model, param_grid=params,
                                         cv=5, scoring='accuracy', n_jobs=-1)
    gs.fit(x, y)

    # trim so the frame lines up with the windowed predictions
    # (the transform consumes the last `lag` rows)
    df = df.iloc[:-lag]
    df.loc[:, 'appliance_status_predicted'] = gs.predict(x)

    dump_path = f'./joblib/status/{device_id}_labeling.joblib'
    dump(gs, dump_path)  # save the fitted search object
    print(df)
    return dump_path, gs.best_score_
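# --- illustrative usage (not part of the original module) --------------------
# Hypothetical smoke test of make_model_status; the device id is made up and
# the call assumes the DB connection and the `dl` helpers are wired up.
def _demo_make_model_status():
    path, score = make_model_status('00:11:22:33:44:55', lag=10)
    print(f'model saved to {path}, CV accuracy {score:.3f}')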
def post(self):
    try:
        parser = reqparse.RequestParser()
        parser.add_argument('house_no', type=str)
        args = parser.parse_args()
        house_no = args['house_no']

        today = datetime.datetime.now().strftime('%Y%m%d')
        sql = f"""
            SELECT *
            FROM AH_USAGE_DAILY_PREDICT
            WHERE 1=1
            AND HOUSE_NO = {house_no}
            AND USE_DATE >= DATE_FORMAT(
                    DATE_ADD(STR_TO_DATE('{today}', '%Y%m%d'), INTERVAL -28 DAY),
                    '%Y%m%d')"""
        df = pd.read_sql(sql, con=settings.conn)
        df.loc[df.use_energy_daily.isnull(), 'use_energy_daily'] = 0

        x, y = dl.split_x_y(df, x_col='use_energy_daily', y_col='use_energy_daily')
        x, y = dl.sliding_window_transform(x, y, step_size=7, lag=0)
        x = x[6:-1]
        y = y[7:]

        # available model types:
        #   random forest / linear regression / ridge regression / lasso regression
        model, param = dl.select_regression_model('linear regression')
        gs = sk.model_selection.GridSearchCV(estimator=model, param_grid=param,
                                             cv=5, n_jobs=-1)
        gs.fit(x, y)
        print(gs.best_score_)
        dump(gs, f'./sample_data/joblib/usage_daily/{house_no}.joblib')
        return {'flag_success': True, 'best_score': str(gs.best_score_)}
    except Exception as e:
        return {'flag_success': False, 'error': str(e)}
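# --- illustrative client call (not part of the original module) --------------
# The route registration is not shown, so the URL below is made up; only the
# 'house_no' form field comes from the request parser above.
def _demo_post_house_model():
    import requests
    resp = requests.post('http://localhost:5000/make-model-elec',  # URL assumed
                         data={'house_no': '1001'})                # id illustrative
    print(resp.json())  # {'flag_success': True, 'best_score': '...'} on success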
def post(self):
    try:
        parser = reqparse.RequestParser()
        parser.add_argument('device_id', type=str)
        parser.add_argument('gateway_id', type=str)
        parser.add_argument('collect_date', type=str)
        args = parser.parse_args()
        device_id = args['device_id']
        gateway_id = args['gateway_id']
        collect_date = args['collect_date']

        # pull the whole day, padded by 20 minutes before and 10 minutes
        # after, so every minute of the day gets a full window
        start = collect_date + '0000'
        end = collect_date + '2359'
        sql = f"""
            SELECT *
            FROM AH_USE_LOG_BYMINUTE
            WHERE 1=1
            AND GATEWAY_ID = '{gateway_id}'
            AND DEVICE_ID = '{device_id}'
            AND CONCAT(COLLECT_DATE, COLLECT_TIME) >= DATE_FORMAT(
                    DATE_ADD(STR_TO_DATE('{start}', '%Y%m%d%H%i'), INTERVAL -20 MINUTE),
                    '%Y%m%d%H%i')
            AND CONCAT(COLLECT_DATE, COLLECT_TIME) <= DATE_FORMAT(
                    DATE_ADD(STR_TO_DATE('{end}', '%Y%m%d%H%i'), INTERVAL 10 MINUTE),
                    '%Y%m%d%H%i')
            ORDER BY COLLECT_DATE, COLLECT_TIME"""
        # the original referenced an undefined `conn`; open one explicitly
        with settings.open_db_connection() as conn:
            df = pd.read_sql(sql, con=conn)
        print(df.head())
        print('df:', len(df))

        x, y = dl.split_x_y(df, x_col='energy_diff')

        # each sample is a 30-minute window: 20 minutes before the target
        # minute and 10 minutes after it
        pre = 20
        post = 10
        length = pre + post
        x = [x[i:i + length] for i in range(len(x) - (pre + post))]

        model = load(f'./sample_data/joblib/by_device/{device_id}_labeling.joblib')
        y = model.predict(x)
        y = [int(v) for v in y]
        print(len(y))
        return {'flag_success': True, 'predicted_status': y}
    except Exception as e:
        return {'flag_success': False, 'error': str(e)}
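# --- illustrative sketch (not part of the original module) -------------------
# A toy rerun of the list-comprehension windowing above, with stand-in values,
# to show how the 20-before / 10-after padding turns into 30-minute samples.
def _demo_pre_post_windows():
    pre, post = 20, 10
    length = pre + post
    x = list(range(100))  # stand-in for per-minute energy_diff values
    windows = [x[i:i + length] for i in range(len(x) - (pre + post))]
    print(len(windows))     # 70 windows for 100 minutes of data
    print(len(windows[0]))  # each window covers 30 consecutive minutes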
def make_model_elec(house_no):
    today = datetime.datetime.now().strftime('%Y%m%d')
    sql = f"""
        SELECT *
        FROM AH_USAGE_DAILY_PREDICT
        WHERE 1=1
        AND HOUSE_NO = {house_no}
        -- AND USE_DATE >= DATE_FORMAT(
        --         DATE_ADD(STR_TO_DATE('{today}', '%Y%m%d'), INTERVAL -28 DAY),
        --         '%Y%m%d')"""  # optional filter: only the most recent 28 days
    pyperclip.copy(sql)  # copy the query to the clipboard for manual debugging

    with settings.open_db_connection() as conn:
        df = pd.read_sql(sql, con=conn)

    df.loc[df.USE_ENERGY_DAILY.isnull(), 'USE_ENERGY_DAILY'] = 0
    print(df.head())

    x, y = dl.split_x_y(df, x_col='USE_ENERGY_DAILY', y_col='USE_ENERGY_DAILY')
    x, y = dl.sliding_window_transform(x, y, step_size=7, lag=0)
    x = x[6:-1]
    y = y[7:]

    model, param = dl.select_regression_model('linear regression')
    gs = sk.model_selection.GridSearchCV(estimator=model, param_grid=param,
                                         cv=5, n_jobs=-1)
    gs.fit(x, y)
    dump(gs, f'./joblib/usage_daily/{house_no}.joblib')
    print("complete")
    return gs.best_score_
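# --- illustrative usage (not part of the original module) --------------------
# The dumped grid-search object can be reloaded for prediction elsewhere; the
# path mirrors the dump() call above, while the house number and the usage
# values are made up.
def _demo_reload_and_predict():
    from joblib import load
    gs = load('./joblib/usage_daily/1001.joblib')      # house_no 1001 illustrative
    last_week = [[1.2, 0.9, 1.1, 1.4, 1.3, 1.0, 0.8]]  # 7 days of daily usage
    print(gs.predict(last_week))                       # predicted next-day usage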
def post(self):
    try:
        parser = reqparse.RequestParser()
        parser.add_argument('device_id', type=str)
        args = parser.parse_args()
        lag = 10
        device_id = args['device_id']

        sql = f"""
            SELECT GATEWAY_ID
                 , DEVICE_ID
                 , COLLECT_DATE
                 , COLLECT_TIME
                 , QUALITY
                 , ONOFF
                 , ENERGY
                 , ENERGY_DIFF
                 , CASE WHEN APPLIANCE_STATUS IS NULL THEN 0
                        ELSE APPLIANCE_STATUS END APPLIANCE_STATUS
                 , CREATE_DATE
            FROM AH_USE_LOG_BYMINUTE_LABELED_sbj
            WHERE 1 = 1
            AND DEVICE_ID = '{device_id}'
            AND COLLECT_DATE IN (
                SELECT t1.COLLECT_DATE
                FROM (SELECT COLLECT_DATE
                           , SUM(APPLIANCE_STATUS) APPLIANCE_STATUS_SUM
                      FROM AH_USE_LOG_BYMINUTE_LABELED_sbj
                      GROUP BY COLLECT_DATE) t1
                WHERE 1=1
                AND t1.APPLIANCE_STATUS_SUM IS NOT NULL)"""
        df = pd.read_sql(sql, con=settings.conn)

        x, y = dl.split_x_y(df, x_col='energy_diff', y_col='appliance_status')
        x, y = dl.sliding_window_transform(x, y, lag=lag, step_size=30)

        model, params = dl.select_classification_model('random forest')
        gs = sk.model_selection.GridSearchCV(estimator=model, param_grid=params,
                                             cv=5, scoring='accuracy', n_jobs=-1)
        gs.fit(x, y)
        # print(round(gs.best_score_ * 100, 2), '%', sep='')

        df = df.iloc[:-lag]
        df.loc[:, 'appliance_status_predicted'] = gs.predict(x)

        dump_path = f'./sample_data/joblib/{device_id}_labeling.joblib'
        dump(gs, dump_path)  # save the fitted search object
        return {
            'flag_success': True,
            'dump_path': str(dump_path),
            'score': str(gs.best_score_),
        }
    except Exception as e:
        return {'flag_success': False, 'error': str(e)}
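# --- illustrative usage (not part of the original module) --------------------
# Besides the score returned above, the dumped artifact records which
# hyper-parameters the 5-fold grid search settled on; the device id is made up.
def _demo_inspect_search():
    from joblib import load
    gs = load('./sample_data/joblib/device-001_labeling.joblib')  # id illustrative
    print(gs.best_params_)  # hyper-parameters chosen by the search
    print(gs.best_score_)   # mean CV accuracy of that configuration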
import joblib
import pandas as pd
from sklearn import model_selection

import classifications  # module path assumed; returns (estimator, param_grid) pairs
import settings
import data_loader as dl  # module path assumed


def make_model_elec(house_no, model_type="linear regression", is_csv=False):
    """Train the household electricity-usage prediction model.

    Learns to predict the next month's electricity usage from the
    household's past month of daily usage. The prediction model uses
    linear regression: each 7 days of data predict the value on the
    8th day.

    Args:
        house_no (str): household identifier
        model_type (str, optional): model type. Defaults to 'linear regression'.
        is_csv (bool, optional): whether to load the data from a CSV file.
            Defaults to False.

    Returns:
        float: cross-validation score
    """
    # changed to query the log data directly;
    # daily energy usage and day-of-week are used together as features
    if is_csv:
        condition = f"house_no == '{house_no}'"
        df = settings.load_datas_from_csv(
            csv_file="training/houses.csv", condition=condition
        )
    else:
        df = settings.load_datas(
            query_file="mt_select_house.sql", params={"house_no": house_no}
        )

    # predict each day from the 7 days that precede it
    step_size = 7
    columns = [
        "use_energy_daily",
        "dayname_Sunday",
        "dayname_Monday",
        "dayname_Tuesday",
        "dayname_Wednesday",
        "dayname_Thursday",
        "dayname_Friday",
        "dayname_Saturday",
    ]
    df = pd.get_dummies(df)
    df = df.loc[:, columns]

    x, y = dl.split_x_y(df, x_col=columns, y_col="use_energy_daily")
    x, y = dl.sliding_window_transform(x, y, step_size=step_size, lag=0)

    c = pd.merge(
        pd.DataFrame(x[(step_size - 1):-1]),
        pd.DataFrame(y[step_size:]),
        left_index=True,
        right_index=True,
        how="left",
    )
    c.to_csv(
        "./datas/house_trainig.csv",
        header=True,
        index=False,
        encoding="utf-8",
    )

    # linear-regression sanity check ##########################################
    # if model_type == "linear regression":
    #     reg = linear_model.LinearRegression()
    # else:
    #     reg = XGBRegressor()
    # reg.fit(x[(step_size - 1):-1], y[step_size:])
    # y_pred = reg.predict(x)
    # print('Mean squared error: %.2f' % metrics.mean_squared_error(y, y_pred))
    # # The coefficient of determination: 1 is perfect prediction
    # print('Coefficient of determination: %.2f' % metrics.r2_score(y, y_pred))
    ###########################################################################

    model, params = classifications.select_classification_model(model_type)
    gs = model_selection.GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        n_jobs=-1,
    )
    gs.fit(x[(step_size - 1):-1], y[step_size:])

    path = f"./pickles/houses/{house_no}.pkl"
    joblib.dump(gs, path)
    return gs.best_score_
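# --- illustrative sketch (not part of the original module) -------------------
# The feature frame above relies on pd.get_dummies expanding a categorical
# day-name column into the seven dayname_* indicator columns; the source
# column name 'dayname' is assumed from the selected column names.
def _demo_dayname_dummies():
    import pandas as pd
    df = pd.DataFrame({
        'use_energy_daily': [1.2, 0.9, 1.4],
        'dayname': ['Sunday', 'Monday', 'Tuesday'],  # source column assumed
    })
    dummies = pd.get_dummies(df)
    # -> ['use_energy_daily', 'dayname_Monday', 'dayname_Sunday', 'dayname_Tuesday']
    print(dummies.columns.tolist())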
def make_model_status(device_id, model_type="random forest", lag=10, is_csv=False):
    """Train the standby/in-use labeling model for a device.

    The model uses random forest. By default it is trained once at least
    a month of usage data has accumulated.

    x: energy delta (energy_diff), instantaneous power (power), on/off (onoff), etc.
    y: in-use/standby status

    Args:
        device_id (str): device ID
        model_type (str, optional): model type. Defaults to 'random forest'.
            One of: random forest / xgboost classifier.
        lag (int, optional): shift of the data start point. Defaults to 10.
        is_csv (bool, optional): whether to load the data from a CSV file.
            Defaults to False.

    Returns:
        str: path where the trained model was saved
        float: cross-validation score
    """
    if is_csv:
        condition = f"device_id == '{device_id}'"
        df = settings.load_datas_from_csv(
            csv_file="training/devices.csv", condition=condition
        )
    else:
        df = settings.load_datas(
            query_file="mt_select_device.sql", params={"device_id": device_id}
        )

    x, y = dl.split_x_y(df, x_col=["energy_diff", "power"], y_col="appliance_status")
    # slide in 30-minute windows
    x, y = dl.sliding_window_transform(x, y, step_size=30, lag=lag)

    # hold-out sanity check ###################################################
    # x_train, x_test, y_train, y_test = model_selection.train_test_split(
    #     x, y, test_size=0.25, random_state=10)
    # if model_type == 'random forest':
    #     rf = ensemble.RandomForestClassifier(random_state=0)
    # else:
    #     rf = XGBClassifier(n_estimators=50,
    #                        max_depth=8,
    #                        objective='binary:logistic',
    #                        verbosity=0)
    # rf.fit(x_train, y_train)
    # pred = rf.predict(x_test)
    # print(metrics.accuracy_score(y_test, pred))
    ###########################################################################

    model, params = classifications.select_classification_model(model_type)
    gs = model_selection.GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    gs.fit(x, y)

    path = f"./pickles/devices/{device_id}.pkl"
    joblib.dump(gs, path)
    return path, gs.best_score_
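# --- illustrative sketch (not part of the original module) -------------------
# classifications.select_classification_model is not shown; presumably it
# returns an (estimator, param_grid) pair for GridSearchCV. A minimal stand-in
# with made-up grid values (the XGBClassifier settings echo the commented-out
# hold-out check above):
def _sketch_select_classification_model(model_type):
    from sklearn import ensemble
    from xgboost import XGBClassifier
    if model_type == 'random forest':
        return ensemble.RandomForestClassifier(random_state=0), {
            'n_estimators': [50, 100],  # grid values illustrative
            'max_depth': [4, 8],
        }
    # 'xgboost classifier'
    return XGBClassifier(objective='binary:logistic', verbosity=0), {
        'n_estimators': [50, 100],
        'max_depth': [4, 8],
    }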