from datetime import timedelta

import numpy as np
# to_categorical is used below; the original import path is not shown,
# so the tf.keras location is assumed here
from tensorflow.keras.utils import to_categorical

import a_utils  # project-local helpers referenced throughout


def data_gen_process_env(*args, **kwargs):
    # logger
    log = kwargs['logger']
    try:
        # read the data from the database
        df = kwargs['df'].copy()

        # smooth the data
        # df = a_utils.dfsmoothing(df=df, column_names=list(df.columns))
        df.clip(lower=0, inplace=True)  # Remove <0 values for all columns as a result of smoothing

        # aggregate data
        rolling_sum_target, rolling_mean_target = [], []
        for col_name in df.columns:
            if kwargs['agg'][col_name] == 'sum':
                rolling_sum_target.append(col_name)
            else:
                rolling_mean_target.append(col_name)
        df[rolling_sum_target] = a_utils.window_sum(
            df, window_size=6, column_names=rolling_sum_target)
        df[rolling_mean_target] = a_utils.window_mean(
            df, window_size=6, column_names=rolling_mean_target)
        df = a_utils.dropNaNrows(df)

        # Sample the data at period intervals
        df = a_utils.sample_timeseries_df(df, period=6)

        # scale the columns: here we will use min-max
        df[df.columns] = kwargs['scaler'].minmax_scale(df, df.columns, df.columns)

        # create sat-oat for the data
        df['sat-oat'] = df['sat'] - df['oat']

        # create avg_stpt column
        stpt_cols = [ele for ele in df.columns if 'vrf' in ele]
        df['avg_stpt'] = df[stpt_cols].mean(axis=1)
        # drop individual set point cols
        df.drop(columns=stpt_cols, inplace=True)

        # select retrain range of the data
        time_start_of_train = df.index[-1] - timedelta(
            weeks=kwargs['retrain_range_rl_weeks'])
        df = df.loc[time_start_of_train:, :]

        # save the data frame
        df.to_pickle(kwargs['save_path'] + 'env_data/env_data.pkl')
    except Exception as e:
        log.error('ENV Data Generator Module: %s', str(e))
        log.debug(e, exc_info=True)
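# ---------------------------------------------------------------------------
# Usage sketch for data_gen_process_env (illustrative only; not part of the
# original module). Everything below is a hypothetical harness: the scaler
# stand-in, the column names, and the parameter values are assumptions. The
# requirements implied by the function body are a DatetimeIndex-ed frame
# with 'sat', 'oat', and one or more 'vrf*' set point columns, an `agg` map
# with a 'sum'/'mean' entry per column, a scaler exposing
# minmax_scale(df, input_cols, output_cols), and an existing
# <save_path>/env_data/ directory.
# ---------------------------------------------------------------------------
import logging

import pandas as pd


class _DemoMinMaxScaler:
    """Hypothetical stand-in for the project's scaler; rescales to [0, 1]."""

    def minmax_scale(self, df, input_cols, output_cols):
        sub = df[input_cols]
        return (sub - sub.min()) / (sub.max() - sub.min())


def _make_demo_df(n_periods=40_000):
    """Hypothetical 5-minute telemetry with the columns the pipeline reads."""
    idx = pd.date_range('2021-01-01', periods=n_periods, freq='5min')
    rng = np.random.default_rng(0)
    return pd.DataFrame({
        'sat': rng.uniform(10, 20, idx.size),
        'oat': rng.uniform(-5, 30, idx.size),
        'oah': rng.uniform(10, 90, idx.size),
        'wbt': rng.uniform(-10, 25, idx.size),
        'hwe': rng.uniform(0, 5, idx.size),
        'vrf_stpt_1': rng.uniform(20, 24, idx.size),
        'vrf_stpt_2': rng.uniform(20, 24, idx.size),
    }, index=idx)


def _demo_env():
    log = logging.getLogger('datagen-env')
    df = _make_demo_df()
    # treat energy as a rolling sum, everything else as a rolling mean
    agg = {c: 'sum' if c == 'hwe' else 'mean' for c in df.columns}
    data_gen_process_env(
        logger=log,
        df=df,
        agg=agg,
        scaler=_DemoMinMaxScaler(),
        retrain_range_rl_weeks=4,   # illustrative retraining window
        save_path='output/',        # must already contain env_data/
    )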
def data_gen_process_vlv(*args, **kwargs):
    # logger
    log = kwargs['logger']
    try:
        # read the data from the database
        df = kwargs['df'].copy()

        # smooth the data
        # df = a_utils.dfsmoothing(df=df, column_names=list(df.columns))
        df.clip(lower=0, inplace=True)  # Remove <0 values for all columns as a result of smoothing

        # aggregate data
        rolling_sum_target, rolling_mean_target = [], []
        for col_name in df.columns:
            if kwargs['agg'][col_name] == 'sum':
                rolling_sum_target.append(col_name)
            else:
                rolling_mean_target.append(col_name)
        df[rolling_sum_target] = a_utils.window_sum(
            df, window_size=6, column_names=rolling_sum_target)
        df[rolling_mean_target] = a_utils.window_mean(
            df, window_size=6, column_names=rolling_mean_target)
        df = a_utils.dropNaNrows(df)

        # Sample the data at period intervals
        df = a_utils.sample_timeseries_df(df, period=6)

        # scale the columns: here we will use min-max
        df[df.columns] = kwargs['scaler'].minmax_scale(df, df.columns, df.columns)

        # create sat-oat for the data
        df['sat-oat'] = df['sat'] - df['oat']

        # add binary classification column: valve is open unless hot water
        # energy is effectively zero
        df['vlv'] = 1.0
        df.loc[df['hwe'] <= 0.001, ['vlv']] = 0

        # determine split point: hold out the last 10 weeks as test data
        t_train_end = df.index[-1] - timedelta(weeks=10)
        test_df = df.loc[t_train_end:, :]
        splitvalue = test_df.shape[0]

        # create train and test/validate data
        X_test, X_train, y_test, y_train = a_utils.df_2_arrays(
            df=df,
            predictorcols=['oat', 'oah', 'wbt', 'sat-oat'],
            outputcols=['vlv'],
            lag=0,
            scaling=False,
            scaler=None,
            scaleX=True,
            scaleY=True,
            split=splitvalue,
            shuffle=False,
            reshaping=True,
            input_timesteps=1,
            output_timesteps=1,
        )
        y_train = to_categorical(y_train)
        y_test = to_categorical(y_test)

        # save test ids for later plots
        # idx_end = -max(X_test.shape[1], y_test.shape[1])
        # idx_start = idx_end - X_test.shape[0] + 1
        # test_idx = df.index[[i for i in range(idx_start, idx_end + 1, 1)]]
        # test_info = {'test_idx': [str(i) for i in test_idx],
        #              'year_num': kwargs['year_num'],
        #              'week_num': kwargs['week_num']}
        # with open(kwargs['save_path'] + 'vlv_data/vlv_test_info.txt', 'a') as ifile:
        #     ifile.write(json.dumps(test_info) + '\n')

        np.save(kwargs['save_path'] + 'vlv_data/vlv_X_train.npy', X_train)
        np.save(kwargs['save_path'] + 'vlv_data/vlv_X_val.npy', X_test)
        np.save(kwargs['save_path'] + 'vlv_data/vlv_y_train.npy', y_train)
        np.save(kwargs['save_path'] + 'vlv_data/vlv_y_val.npy', y_test)
    except Exception as e:
        log.error('VLV Data Generator Module: %s', str(e))
        log.debug(e, exc_info=True)
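# ---------------------------------------------------------------------------
# Usage sketch for data_gen_process_vlv (illustrative only; not part of the
# original module). It reuses the hypothetical _make_demo_df and
# _DemoMinMaxScaler harness defined after data_gen_process_env above. Since
# the last 10 weeks are held out as the test/validation split, the demo
# frame spans roughly 20 weeks, and <save_path>/vlv_data/ must already
# exist for the np.save calls to succeed.
# ---------------------------------------------------------------------------
def _demo_vlv():
    log = logging.getLogger('datagen-vlv')
    df = _make_demo_df()            # ~20 weeks of synthetic 5-minute data
    agg = {c: 'sum' if c == 'hwe' else 'mean' for c in df.columns}
    data_gen_process_vlv(
        logger=log,
        df=df,
        agg=agg,
        scaler=_DemoMinMaxScaler(),
        save_path='output/',        # must already contain vlv_data/
    )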