def stats_ou(station_name="종로구"):
    print("Data loading start...")
    _df_h = data.load_imputed(HOURLY_DATA_PATH)
    df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"')
    if station_name == '종로구' and \
            not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
        # cache the imputed station subset for reuse
        df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")
    print("Data loading complete")

    targets = ["PM10", "PM25"]
    # time scales passed to sim_OU (presumably integral/autocorrelation times in hours)
    intT = {"PM10": 19.01883611948326, "PM25": 20.4090132600871}

    sample_size = 48
    output_size = 24
    train_fdate = dt.datetime(2008, 1, 5, 0).astimezone(seoultz)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(seoultz)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(seoultz)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(seoultz)

    # train and test ranges must be consecutive
    assert train_tdate + dt.timedelta(hours=1) == test_fdate

    for target in targets:
        output_dir = Path('/mnt/data/OU/' + station_name + "/" + target + "/")
        png_dir = output_dir / Path('png/')
        svg_dir = output_dir / Path('svg/')
        data_dir = output_dir / Path('csv/')
        Path.mkdir(data_dir, parents=True, exist_ok=True)
        Path.mkdir(png_dir, parents=True, exist_ok=True)
        Path.mkdir(svg_dir, parents=True, exist_ok=True)

        # numeric_pipeline_X = Pipeline(
        #     [('seasonalitydecompositor',
        #       data.SeasonalityDecompositor_AWH(smoothing=True, smoothingFrac=0.05)),
        #      ('standardtransformer', data.StandardScalerWrapper(scaler=StandardScaler()))])
        # scaler = ColumnTransformer(transformers=[('num', numeric_pipeline_X, [target])])

        # prepare dataset
        train_set = data.UnivariateRNNMeanSeasonalityDataset(
            station_name=station_name, target=target, filepath=HOURLY_DATA_PATH,
            features=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                      "temp", "u", "v", "pres", "humid", "prep", "snow"],
            features_1=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                        "temp", "v", "pres", "humid", "prep", "snow"],
            features_2=['u'],
            fdate=train_fdate, tdate=train_tdate,
            sample_size=sample_size, output_size=output_size,
            train_valid_ratio=0.8)
        train_set.preprocess()

        # test set reuses the scalers fitted on the training set
        test_set = data.UnivariateRNNMeanSeasonalityDataset(
            station_name=station_name, target=target, filepath=HOURLY_DATA_PATH,
            features=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                      "temp", "u", "v", "pres", "humid", "prep", "snow"],
            features_1=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                        "temp", "v", "pres", "humid", "prep", "snow"],
            features_2=['u'],
            fdate=test_fdate, tdate=test_tdate,
            sample_size=sample_size, output_size=output_size,
            scaler_X=train_set.scaler_X, scaler_Y=train_set.scaler_Y)
        test_set.transform()
        test_set.plot_seasonality(data_dir, png_dir, svg_dir)

        df_train = train_set.ys.copy()
        df_test = test_set.ys.loc[test_fdate:test_tdate, :].copy()
        df_test_org = test_set.ys_raw.loc[test_fdate:test_tdate, :].copy()

        print("Simulate by Ornstein–Uhlenbeck process for " + target + "...")

        def run_OU(_intT):
            df_obs = mw_df(df_test_org, target, output_size, test_fdate, test_tdate)
            dates = df_obs.index
            df_sim = sim_OU(df_test, dates, target,
                            np.mean(df_test.to_numpy()), np.std(df_test.to_numpy()),
                            _intT[target], test_set.scaler_Y, output_size)
            assert df_obs.shape == df_sim.shape

            # plot observations vs. simulation
            plot_OU(df_sim, df_obs, target, data_dir, png_dir, svg_dir,
                    test_fdate, test_tdate, station_name, output_size)

            # save to csv
            df_obs.to_csv(data_dir / "df_test_obs.csv")
            df_sim.to_csv(data_dir / "df_test_sim.csv")

        run_OU(intT)

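
# `sim_OU` is defined elsewhere in this module; the `intT` values above are presumably
# integral (autocorrelation) time scales in hours. As a hedged illustration only, the
# sketch below shows how an Ornstein–Uhlenbeck process with mean `mu`, stationary standard
# deviation `sigma`, and time scale `tau` can be stepped exactly on an hourly grid; it is
# not the project's actual implementation, and `_ou_sample_path` is a hypothetical helper.
def _ou_sample_path(mu, sigma, tau, x0, n_steps, dt_hours=1.0, rng=None):
    """Exact-discretization sample path of an OU process (illustrative sketch)."""
    rng = np.random.default_rng() if rng is None else rng
    phi = np.exp(-dt_hours / tau)                # mean-reversion factor per step
    noise_scale = sigma * np.sqrt(1.0 - phi**2)  # keeps the stationary variance at sigma**2
    xs = np.empty(n_steps)
    x = x0
    for i in range(n_steps):
        x = mu + phi * (x - mu) + noise_scale * rng.standard_normal()
        xs[i] = x
    return xs
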
def ml_mlp_mul_ms(station_name="종로구"):
    print("Start Multivariate MLP Mean Seasonality Decomposition (MSE) Model")
    targets = ["PM10", "PM25"]
    # targets = ["SO2", "CO", "O3", "NO2", "PM10", "PM25",
    #            "temp", "u", "v", "pres", "humid", "prep", "snow"]
    # 24*14 = 336
    # sample_size = 336
    sample_size = 48
    output_size = 24
    # For debugging, set fast_dev_run = True and n_trials to a small number
    fast_dev_run = False
    n_trials = 128
    # fast_dev_run = True
    # n_trials = 1

    # Hyper parameters
    epoch_size = 500
    batch_size = 64
    learning_rate = 1e-3

    # Blocked Cross Validation
    # neglect small overlap between train_dates and valid_dates
    # 11y = ((2y, 0.5y), (2y, 0.5y), (2y, 0.5y), (2.5y, 1y))
    train_dates = [
        (dt.datetime(2008, 1, 4, 1).astimezone(SEOULTZ), dt.datetime(2009, 12, 31, 23).astimezone(SEOULTZ)),
        (dt.datetime(2010, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2012, 6, 30, 23).astimezone(SEOULTZ)),
        (dt.datetime(2013, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2014, 12, 31, 23).astimezone(SEOULTZ)),
        (dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))]
    valid_dates = [
        (dt.datetime(2010, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2010, 6, 30, 23).astimezone(SEOULTZ)),
        (dt.datetime(2012, 7, 1, 0).astimezone(SEOULTZ), dt.datetime(2012, 12, 31, 23).astimezone(SEOULTZ)),
        (dt.datetime(2015, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2015, 6, 30, 23).astimezone(SEOULTZ)),
        (dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ), dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))]
    train_valid_fdate = dt.datetime(2008, 1, 3, 1).astimezone(SEOULTZ)
    train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)

    # Debug
    if fast_dev_run:
        train_dates = [(dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ),
                        dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))]
        valid_dates = [(dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ),
                        dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))]
        train_valid_fdate = dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ)
        train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)

    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # check date range assumptions
    assert len(train_dates) == len(valid_dates)
    for i, (td, vd) in enumerate(zip(train_dates, valid_dates)):
        assert vd[0] > td[1]
    assert test_fdate > train_dates[-1][1]
    assert test_fdate > valid_dates[-1][1]

    train_features = ["SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd",
                      "wind_cdir", "wind_sdir", "pres", "humid", "prep"]
    train_features_periodic = ["SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd",
                               "wind_cdir", "wind_sdir", "pres", "humid"]
    train_features_nonperiodic = ["prep"]

    for target in targets:
        print("Training " + target + "...")
        output_dir = Path(f"/mnt/data/MLPMSMultivariate/{station_name}/{target}/")
        Path.mkdir(output_dir, parents=True, exist_ok=True)
        model_dir = output_dir / "models"
        Path.mkdir(model_dir, parents=True, exist_ok=True)
        log_dir = output_dir / "log"
        Path.mkdir(log_dir, parents=True, exist_ok=True)

        _df_h = data.load_imputed(HOURLY_DATA_PATH)
        df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"')
        if station_name == '종로구' and \
                not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
            # cache the imputed station subset for reuse
            df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")

        # construct dataset for seasonality
        print("Construct Train/Validation Sets...", flush=True)
        train_valid_dataset = construct_dataset(
            train_valid_fdate, train_valid_tdate,
            filepath=HOURLY_DATA_PATH, station_name=station_name, target=target,
            sample_size=sample_size, output_size=output_size, transform=False)
        # compute seasonality
        train_valid_dataset.preprocess()

        # For Blocked Cross Validation:
        # load datasets for the given date ranges and transform them with the scalers
        # fitted on train_valid_dataset; all datasets are kept in tuples
        print("Construct Training Sets...", flush=True)
        train_datasets = tuple(
            construct_dataset(td[0], td[1],
                              scaler_X=train_valid_dataset.scaler_X,
                              scaler_Y=train_valid_dataset.scaler_Y,
                              filepath=HOURLY_DATA_PATH, station_name=station_name,
                              target=target, sample_size=sample_size, output_size=output_size,
                              features=train_features,
                              features_periodic=train_features_periodic,
                              features_nonperiodic=train_features_nonperiodic,
                              transform=True)
            for td in train_dates)

        print("Construct Validation Sets...", flush=True)
        valid_datasets = tuple(
            construct_dataset(vd[0], vd[1],
                              scaler_X=train_valid_dataset.scaler_X,
                              scaler_Y=train_valid_dataset.scaler_Y,
                              filepath=HOURLY_DATA_PATH, station_name=station_name,
                              target=target, sample_size=sample_size, output_size=output_size,
                              features=train_features,
                              features_periodic=train_features_periodic,
                              features_nonperiodic=train_features_nonperiodic,
                              transform=True)
            for vd in valid_dates)

        # just a single test set
        print("Construct Test Sets...", flush=True)
        test_dataset = construct_dataset(test_fdate, test_tdate,
                                         scaler_X=train_valid_dataset.scaler_X,
                                         scaler_Y=train_valid_dataset.scaler_Y,
                                         filepath=HOURLY_DATA_PATH, station_name=station_name,
                                         target=target, sample_size=sample_size,
                                         output_size=output_size,
                                         features=train_features,
                                         features_periodic=train_features_periodic,
                                         features_nonperiodic=train_features_nonperiodic,
                                         transform=True)

        # convert tuples of datasets to ConcatDataset
        train_dataset = ConcatDataset(train_datasets)
        val_dataset = ConcatDataset(valid_datasets)

        # num_layers == number of hidden layers
        hparams = Namespace(num_layers=1, layer_size=128,
                            learning_rate=learning_rate, batch_size=batch_size)

        def objective(trial):
            model = BaseMLPModel(trial=trial, hparams=hparams,
                                 input_size=sample_size * len(train_features),
                                 sample_size=sample_size, output_size=output_size,
                                 station_name=station_name, target=target,
                                 features=train_features,
                                 features_periodic=train_features_periodic,
                                 features_nonperiodic=train_features_nonperiodic,
                                 train_dataset=train_dataset, val_dataset=val_dataset,
                                 test_dataset=test_dataset,
                                 scaler_X=train_valid_dataset.scaler_X,
                                 scaler_Y=train_valid_dataset.scaler_Y,
                                 output_dir=output_dir)

            # most basic trainer, uses good defaults
            trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                              precision=32, min_epochs=1, max_epochs=20,
                              default_root_dir=output_dir, fast_dev_run=fast_dev_run,
                              logger=True, checkpoint_callback=False,
                              callbacks=[PyTorchLightningPruningCallback(trial, monitor="valid/MSE")])
            trainer.fit(model)

            # Don't log hyperparameters here
            # hyperparameters = model.hparams
            # trainer.logger.log_hyperparams(hyperparameters)

            return trainer.callback_metrics.get("valid/MSE")

        if n_trials > 1:
            study = optuna.create_study(direction="minimize")
            study.enqueue_trial({'sigma': 1.3, 'num_layers': 4, 'layer_size': 8,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 1.3, 'num_layers': 4, 'layer_size': 32,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 1.3, 'num_layers': 4, 'layer_size': 64,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 1.3, 'num_layers': 4, 'layer_size': 32,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 1.3, 'num_layers': 8, 'layer_size': 32,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 1.3, 'num_layers': 12, 'layer_size': 32,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 0.7, 'num_layers': 4, 'layer_size': 32,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            study.enqueue_trial({'sigma': 2.0, 'num_layers': 4, 'layer_size': 32,
                                 'learning_rate': learning_rate, 'batch_size': batch_size})
            # timeout = 3600*36 = 36h
            study.optimize(objective, n_trials=n_trials, timeout=3600 * 36)

            trial = study.best_trial

            print("  Value: ", trial.value)
            print("  Params: ")
            for key, value in trial.params.items():
                print("    {}: {}".format(key, value))
            print("sample_size : ", sample_size)
            print("output_size : ", output_size)

            # plot optimization results
            fig_cont1 = optv.plot_contour(study, params=['num_layers', 'layer_size'])
            fig_cont1.write_image(str(output_dir / "contour_num_layers_layer_size.png"))
            fig_cont1.write_image(str(output_dir / "contour_num_layers_layer_size.svg"))

            fig_edf = optv.plot_edf(study)
            fig_edf.write_image(str(output_dir / "edf.png"))
            fig_edf.write_image(str(output_dir / "edf.svg"))

            fig_iv = optv.plot_intermediate_values(study)
            fig_iv.write_image(str(output_dir / "intermediate_values.png"))
            fig_iv.write_image(str(output_dir / "intermediate_values.svg"))

            fig_his = optv.plot_optimization_history(study)
            fig_his.write_image(str(output_dir / "opt_history.png"))
            fig_his.write_image(str(output_dir / "opt_history.svg"))

            fig_pcoord = optv.plot_parallel_coordinate(study, params=['num_layers', 'layer_size'])
            fig_pcoord.write_image(str(output_dir / "parallel_coord.png"))
            fig_pcoord.write_image(str(output_dir / "parallel_coord.svg"))

            fig_slice = optv.plot_slice(study, params=['num_layers', 'layer_size'])
            fig_slice.write_image(str(output_dir / "slice.png"))
            fig_slice.write_image(str(output_dir / "slice.svg"))

            # set hparams to the optimized values
            hparams.num_layers = trial.params['num_layers']
            hparams.layer_size = trial.params['layer_size']

        dict_hparams = copy.copy(vars(hparams))
        dict_hparams["sample_size"] = sample_size
        dict_hparams["output_size"] = output_size
        with open(output_dir / 'hparams.json', 'w') as f:
            print(dict_hparams, file=f)
        with open(output_dir / 'hparams.csv', 'w') as f:
            print(pd.DataFrame.from_dict(dict_hparams, orient='index'), file=f)

        model = BaseMLPModel(hparams=hparams,
                             input_size=sample_size * len(train_features),
                             sample_size=sample_size, output_size=output_size,
                             station_name=station_name, target=target,
                             features=train_features,
                             features_periodic=train_features_periodic,
                             features_nonperiodic=train_features_nonperiodic,
                             train_dataset=train_dataset, val_dataset=val_dataset,
                             test_dataset=test_dataset,
                             scaler_X=train_valid_dataset.scaler_X,
                             scaler_Y=train_valid_dataset.scaler_Y,
                             output_dir=output_dir)

        # record inputs
        for i, _train_set in enumerate(train_datasets):
            _train_set.to_csv(model.data_dir /
                              ("df_trainset_{0}_".format(str(i).zfill(2)) + target + ".csv"))
        for i, _valid_set in enumerate(valid_datasets):
            _valid_set.to_csv(model.data_dir /
                              ("df_validset_{0}_".format(str(i).zfill(2)) + target + ".csv"))
        train_valid_dataset.to_csv(model.data_dir / ("df_trainvalidset_" + target + ".csv"))
        test_dataset.to_csv(model.data_dir / ("df_testset_" + target + ".csv"))

        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            os.path.join(model_dir, "train_{epoch}_{valid/MSE:.2f}"),
            monitor="valid/MSE", period=10)

        early_stop_callback = EarlyStopping(monitor='valid/MSE', min_delta=0.001,
                                            patience=30, verbose=True, mode='min')

        log_version = dt.date.today().strftime("%y%m%d-%H-%M")
        loggers = [TensorBoardLogger(log_dir, version=log_version),
                   CSVLogger(log_dir, version=log_version)]

        # most basic trainer, uses good defaults
        trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                          precision=32, min_epochs=1, max_epochs=epoch_size,
                          default_root_dir=output_dir, fast_dev_run=fast_dev_run,
                          logger=loggers, log_every_n_steps=5, flush_logs_every_n_steps=10,
                          callbacks=[early_stop_callback],
                          checkpoint_callback=checkpoint_callback)

        trainer.fit(model)

        # run test set
        trainer.test(ckpt_path=None)

        shutil.rmtree(model_dir)

def stats_msea_acf(station_name="종로구"):
    print("Data loading start...")
    if Path("/input/python/input_jongro_imputed_hourly_pandas.csv").is_file():
        df_d = data.load_imputed("/input/python/input_jongro_imputed_daily_pandas.csv")
        df_h = data.load_imputed("/input/python/input_jongro_imputed_hourly_pandas.csv")
    else:
        # load imputed results and cache the station subset for reuse
        _df_d = data.load_imputed(DAILY_DATA_PATH)
        _df_h = data.load_imputed(HOURLY_DATA_PATH)
        df_d = _df_d.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"')
        df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"')
        df_d.to_csv("/input/python/input_jongro_imputed_daily_pandas.csv")
        df_h.to_csv("/input/python/input_jongro_imputed_hourly_pandas.csv")
    print("Data loading complete")

    targets = ["PM10", "PM25"]
    sample_size = 48
    output_size = 24
    epoch_size = 300
    batch_size = 32

    for target in targets:
        dir_prefix = Path("/mnt/data/msea_acf/" + station_name + "/" + target + "/")
        Path.mkdir(dir_prefix, parents=True, exist_ok=True)

        target_sea_h_path = Path("/input/msea/weekly/" + station_name + "/" +
                                 target + "/df_" + target + ".csv")
        df_sea_h = pd.read_csv(target_sea_h_path, index_col=[0], parse_dates=[0])

        nlag = 24 * 10

        # ACF of the raw series, each seasonal component (ys/ds/ws), and each residual (yr/dr/wr)
        raw_acf = sm.tsa.acf(df_sea_h[[target + "_raw"]], nlags=nlag)
        ys_acf = sm.tsa.acf(df_sea_h[[target + "_ys"]], nlags=nlag)
        yr_acf = sm.tsa.acf(df_sea_h[[target + "_yr"]], nlags=nlag)
        ds_acf = sm.tsa.acf(df_sea_h[[target + "_ds"]], nlags=nlag)
        dr_acf = sm.tsa.acf(df_sea_h[[target + "_dr"]], nlags=nlag)
        ws_acf = sm.tsa.acf(df_sea_h[[target + "_ws"]], nlags=nlag)
        wr_acf = sm.tsa.acf(df_sea_h[[target + "_wr"]], nlags=nlag)

        raw_acf_sr = pd.Series(raw_acf, name="raw")
        ys_acf_sr = pd.Series(ys_acf, name="ys")
        yr_acf_sr = pd.Series(yr_acf, name="yr")
        ds_acf_sr = pd.Series(ds_acf, name="ds")
        dr_acf_sr = pd.Series(dr_acf, name="dr")
        ws_acf_sr = pd.Series(ws_acf, name="ws")
        wr_acf_sr = pd.Series(wr_acf, name="wr")

        acf_df = merge_acfs([raw_acf_sr, ys_acf_sr, yr_acf_sr, ds_acf_sr,
                             dr_acf_sr, ws_acf_sr, wr_acf_sr],
                            ["raw", "ys", "yr", "ds", "dr", "ws", "wr"])

        plot_acf(ys_acf_sr, target, nlag, "ys", "Annual Seasonality", dir_prefix)
        plot_acf(yr_acf_sr, target, nlag, "yr", "Annual Residual", dir_prefix)
        plot_acf(ds_acf_sr, target, nlag, "ds", "Daily Seasonality", dir_prefix)
        plot_acf(dr_acf_sr, target, nlag, "dr", "Daily Residual", dir_prefix)
        plot_acf(ws_acf_sr, target, nlag, "ws", "Weekly Seasonality", dir_prefix)
        plot_acf(wr_acf_sr, target, nlag, "wr", "Weekly Residual", dir_prefix)
        plot_acfs(acf_df, ["raw", "yr", "dr", "wr"], target, nlag, "Autocorrelation", dir_prefix)
        plot_acfs(acf_df, ["raw", "yr"], target, nlag, "Autocorrelation", dir_prefix)
        plot_acfs(acf_df, ["raw", "dr"], target, nlag, "Autocorrelation", dir_prefix)
        plot_acfs(acf_df, ["raw", "wr"], target, nlag, "Autocorrelation", dir_prefix)

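
# Note on the column suffixes read above: df_<target>.csv comes from the mean seasonality
# decomposition produced elsewhere in this project. "_raw" is the original hourly series;
# "_ys"/"_yr" are presumably the annual seasonal component and its residual, "_ds"/"_dr"
# the daily component and residual, and "_ws"/"_wr" the weekly component and the final
# residual, so the ACF plots compare the raw series against each residual stage.
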
def stats_arima(station_name="종로구"):
    print("Data loading start...", flush=True)
    _df_h = data.load_imputed(HOURLY_DATA_PATH)
    df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"')
    if station_name == '종로구' and \
            not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
        # cache the imputed station subset for reuse
        df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")
    print("Data loading complete", flush=True)

    targets = ["PM10", "PM25"]

    # previously explored candidate (p, d, q) orders:
    # p (1, 0, 0) ~ (3, 0, 0), (4, 0, 0) ~ (6, 0, 0), (7, 0, 0) ~ (9, 0, 0),
    # p (1, 0, 1) ~ (3, 0, 1), (4, 0, 1) ~ (6, 0, 1), (7, 0, 1) ~ (9, 0, 1),
    # p (1, 0, 2) ~ (3, 0, 2), (4, 0, 2) ~ (6, 0, 2), (7, 0, 2) ~ (9, 0, 2),
    # orders1 = [(_p, 0, _q) for _q, _p in itertools.product(range(3), range(10)) if not (_p == 0 and _q == 0)]
    # orders2 = [(_p, 1, _q) for _q, _p in itertools.product(range(3), range(10)) if not (_p == 0 and _q == 0)]
    # orders = orders1 + orders2
    # orders = [(_p, 0, _q) for _q, _p in itertools.product([0], range(1, 4)) if not (_p == 0 and _q == 0)]
    # orders = [(_p, 0, _q) for _q, _p in itertools.product([2], range(4, 7)) if not (_p == 0 and _q == 0)]
    # orders = [(_p, 0, _q) for _q, _p in itertools.product(range(0, 48, 6), range(0, 48, 6)) if not (_p == 0 and _q == 0)]
    # orders = [(_p, 0, _q) for _q, _p in itertools.product([0], [24, 48]) if not (_p == 0 and _q == 0)]
    # orders = [(8, 0, 0), (9, 0, 0)]
    orders = [(2, 0, 0), (3, 0, 0)]

    sample_size = 48
    output_size = 24
    train_fdate = dt.datetime(2008, 1, 4, 0).astimezone(SEOULTZ)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # train and test ranges must be consecutive
    assert train_tdate + dt.timedelta(hours=1) == test_fdate

    for target in targets:
        for order in orders:
            output_dir = Path('/mnt/data/ARIMA_' + str(order) + '/' +
                              station_name + "/" + target + "/")
            png_dir = output_dir / Path('png/')
            svg_dir = output_dir / Path('svg/')
            data_dir = output_dir / Path('csv/')
            Path.mkdir(data_dir, parents=True, exist_ok=True)
            Path.mkdir(png_dir, parents=True, exist_ok=True)
            Path.mkdir(svg_dir, parents=True, exist_ok=True)

            norm_values, norm_maxlog = boxcox(df_h[target])
            norm_target = "norm_" + target

            train_set = data.UnivariateRNNMeanSeasonalityDataset(
                station_name=station_name, target=target, filepath=HOURLY_DATA_PATH,
                features=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                          "temp", "u", "v", "pres", "humid", "prep", "snow"],
                features_1=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                            "temp", "v", "pres", "humid", "prep", "snow"],
                features_2=['u'],
                fdate=train_fdate, tdate=train_tdate,
                sample_size=sample_size, output_size=output_size,
                train_valid_ratio=0.8)
            train_set.preprocess()

            # test set starts at test_fdate and reuses the scalers fitted on the train set
            test_set = data.UnivariateRNNMeanSeasonalityDataset(
                station_name=station_name, target=target, filepath=HOURLY_DATA_PATH,
                features=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                          "temp", "u", "v", "pres", "humid", "prep", "snow"],
                features_1=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                            "temp", "v", "pres", "humid", "prep", "snow"],
                features_2=['u'],
                fdate=test_fdate, tdate=test_tdate,
                sample_size=sample_size, output_size=output_size,
                scaler_X=train_set.scaler_X, scaler_Y=train_set.scaler_Y)
            test_set.transform()

            df_train = train_set.ys.loc[train_fdate:train_tdate, :].copy()
            df_test = test_set.ys.loc[test_fdate:test_tdate, :].copy()
            df_test_org = test_set.ys_raw.loc[test_fdate:test_tdate, :].copy()

            print("ARIMA " + str(order) + " of " + target + "...", flush=True)

            def run_arima(order):
                df_obs = mw_df(df_test_org, target, output_size, test_fdate, test_tdate)
                dates = df_obs.index
                df_sim = sim_arima(df_train, df_test, dates, target, order,
                                   test_set.scaler_Y, sample_size, output_size, data_dir)
                assert df_obs.shape == df_sim.shape

                # plot observations vs. simulation
                plot_arima(df_sim, df_obs, target, order, data_dir, png_dir, svg_dir,
                           test_fdate, test_tdate, station_name, output_size)

                # save to csv
                df_obs.to_csv(data_dir / "df_test_obs.csv")
                df_sim.to_csv(data_dir / "df_test_sim.csv")

            print("ARIMA " + str(order) + " ...")
            run_arima(order)

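
# `sim_arima` above is defined elsewhere in this module. As a hedged illustration of the
# kind of rolling multi-step forecast it is expected to produce, the sketch below fits a
# statsmodels ARIMA(p, d, q) on the (seasonality-removed) training series and then walks
# through the test series, forecasting `horizon` hours ahead and appending each newly
# observed window without refitting. `_rolling_arima_forecast` is illustrative only and
# not the project's implementation.
def _rolling_arima_forecast(train_series, test_series, order=(2, 0, 0), horizon=24):
    """Rolling ARIMA forecasts over a test series (illustrative sketch)."""
    from statsmodels.tsa.arima.model import ARIMA

    res = ARIMA(train_series, order=order).fit()
    forecasts = []
    for t in range(0, len(test_series) - horizon + 1, horizon):
        # forecast the next `horizon` steps from the current model state
        forecasts.append(res.forecast(steps=horizon))
        # extend the fitted model with the newly observed window (no refit)
        res = res.append(test_series.iloc[t:t + horizon], refit=False)
    return forecasts
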
def ml_xgboost(station_name="종로구"):
    print("Start Multivariate XGBoost", flush=True)
    _df_h = data.load_imputed(HOURLY_DATA_PATH)
    df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) + '"')
    if station_name == '종로구' and \
            not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
        # cache the imputed station subset for reuse
        df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")
    print("Data loading complete", flush=True)

    targets = ["PM10", "PM25"]
    features = ["SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "wind_spd",
                "wind_cdir", "wind_sdir", "pres", "humid", "prep"]
    features_periodic = ["SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp",
                         "wind_spd", "wind_cdir", "wind_sdir", "pres", "humid"]
    features_nonperiodic = ["prep"]

    # use one-step input
    sample_size = 1
    output_size = 24

    train_fdate = dt.datetime(2008, 1, 3, 0).astimezone(SEOULTZ)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # train and test ranges must be consecutive
    assert train_tdate + dt.timedelta(hours=1) == test_fdate

    # check date range assumptions
    assert test_tdate > train_fdate
    assert test_fdate > train_tdate

    for target in targets:
        train_set = data.MultivariateMeanSeasonalityDataset(
            station_name=station_name, target=target, filepath=HOURLY_DATA_PATH,
            features=features,
            features_1=features_nonperiodic,
            features_2=features_periodic,
            fdate=train_fdate, tdate=train_tdate,
            sample_size=sample_size, output_size=output_size,
            train_valid_ratio=0.8)
        train_set.preprocess()

        # test set starts at test_fdate and reuses the scalers fitted on the train set
        test_set = data.MultivariateMeanSeasonalityDataset(
            station_name=station_name, target=target, filepath=HOURLY_DATA_PATH,
            features=features,
            features_1=features_nonperiodic,
            features_2=features_periodic,
            fdate=test_fdate, tdate=test_tdate,
            sample_size=sample_size, output_size=output_size,
            scaler_X=train_set.scaler_X, scaler_Y=train_set.scaler_Y)
        test_set.transform()

        df_train = train_set.ys.loc[train_fdate:train_tdate, :].copy()
        df_test = test_set.ys.loc[test_fdate:test_tdate, :].copy()
        df_test_org = test_set.ys_raw.loc[test_fdate:test_tdate, :].copy()

        # for lag in range(23, 24):
        input_lag = 0

        output_dir = Path('/mnt/data/XGBoost/' + station_name + "/" + target + "/")
        png_dir = output_dir / Path('png/')
        svg_dir = output_dir / Path('svg/')
        data_dir = output_dir / Path('csv/')
        Path.mkdir(data_dir, parents=True, exist_ok=True)
        Path.mkdir(png_dir, parents=True, exist_ok=True)
        Path.mkdir(svg_dir, parents=True, exist_ok=True)

        # prepare dataset
        print("Dataset conversion start..", flush=True)
        X_train, Y_train, train_dates = dataset2svinput(train_set, lag=input_lag)
        X_test, Y_test, test_dates = dataset2svinput(test_set, lag=input_lag)
        print("Dataset conversion complete..", flush=True)

        print("XGBoost " + target + "...", flush=True)
        df_obs = mw_df(df_test_org, target, output_size, input_lag, test_fdate, test_tdate)
        dates = df_obs.index

        # prediction
        df_sim = sim_xgboost(X_train.copy(), Y_train, X_test.copy(), Y_test,
                             dates, copy.deepcopy(features), target,
                             sample_size, output_size, test_set.scaler_Y,
                             data_dir, png_dir, svg_dir)
        assert df_obs.shape == df_sim.shape

        # plot observations vs. simulation
        plot_xgboost(df_sim, df_obs, target, data_dir, png_dir, svg_dir,
                     test_fdate, test_tdate, station_name, output_size)

        # save to csv
        df_obs.to_csv(data_dir / "df_test_obs.csv")
        df_sim.to_csv(data_dir / "df_test_sim.csv")

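
# `sim_xgboost` above is likewise defined elsewhere. A hedged sketch of one common way to
# produce 24-hour-ahead predictions with XGBoost is direct multi-horizon regression: one
# gradient-boosted regressor per lead time, wrapped with scikit-learn's MultiOutputRegressor.
# This is only an assumption about the approach; `_fit_direct_xgboost` and its parameters
# are illustrative, not the project's actual settings.
def _fit_direct_xgboost(X_train, Y_train):
    """Fit one XGBRegressor per forecast horizon (illustrative sketch)."""
    from sklearn.multioutput import MultiOutputRegressor
    from xgboost import XGBRegressor

    base = XGBRegressor(objective="reg:squarederror", n_estimators=200, max_depth=6)
    model = MultiOutputRegressor(base)  # Y_train has one column per lead hour
    model.fit(X_train, Y_train)
    return model
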
def stats_analysis(_station_name="종로구"):
    """Statistical analysis of the input data.

    References
    ----------
    * Ghil, M., et al. "Extreme events: dynamics, statistics and prediction."
      Nonlinear Processes in Geophysics 18.3 (2011): 295-350.
    """
    print("Start Analysis of input")

    if not Path(HOURLY_DATA_PATH).is_file():
        query_str = 'stationCode in ' + str(SEOUL_CODES)
        print(query_str, flush=True)

        _df_h = load_imputed()
        _df_h.to_csv("/input/python/input_imputed_hourly_pandas.csv")
        # filter by Seoul station codes
        df_h = _df_h.query(query_str)
        df_h.to_csv(HOURLY_DATA_PATH)
        print("Imputed!", flush=True)

    targets = ["PM10", "PM25"]
    sea_targets = ["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                   "temp", "u", "v", "pres", "humid", "prep", "snow"]
    # sea_targets = ["prep", "snow"]
    # 24*14 = 336
    sample_size = 24 * 2
    output_size = 24

    # Hyper parameters
    epoch_size = 500
    batch_size = 256
    learning_rate = 1e-3

    # train_fdate = dt.datetime(2015, 1, 5, 0).astimezone(SEOULTZ)
    train_fdate = dt.datetime(2008, 1, 3, 0).astimezone(SEOULTZ)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    # test_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # check date range assumptions
    assert test_tdate > train_fdate
    assert test_fdate > train_tdate

    train_features = ["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                      "temp", "u", "v", "pres", "humid", "prep"]
    train_features_periodic = ["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                               "temp", "u", "v", "pres", "humid"]
    train_features_nonperiodic = ["prep"]

    # station_names = ['종로구']
    # station_names = SEOUL_STATIONS
    station_names = ["종로구", "강서구", "서초구", "광진구"]

    def plot_sea(station_name='종로구'):
        for target in sea_targets:
            print("Analyze " + target + "...")

            _df_seoul = pd.read_csv(HOURLY_DATA_PATH, index_col=[0, 1], parse_dates=[0])
            # filter by station_name
            _df_station = _df_seoul.query('stationCode == "' +
                                          str(SEOUL_STATIONS[station_name]) + '"')
            _df_station.reset_index(level='stationCode', drop=True, inplace=True)
            df_sea_h = _df_station

            output_dir = Path("/mnt/data/STATS_ANALYSIS_" + str(sample_size) + "/" +
                              station_name + "/" + target + "/")
            Path.mkdir(output_dir, parents=True, exist_ok=True)
            data_dir = output_dir / Path('csv/')
            png_dir = output_dir / Path('png/')
            svg_dir = output_dir / Path('svg/')
            Path.mkdir(data_dir, parents=True, exist_ok=True)
            Path.mkdir(png_dir, parents=True, exist_ok=True)
            Path.mkdir(svg_dir, parents=True, exist_ok=True)

            hparams = Namespace(nhead=8, head_dim=128, d_feedforward=256, num_layers=3,
                                learning_rate=learning_rate, batch_size=batch_size)

            # prepare dataset
            train_valid_set = MultivariateRNNMeanSeasonalityDataset(
                station_name=station_name, target=target,
                filepath="/input/python/input_seoul_imputed_hourly_pandas.csv",
                features=train_features,
                features_1=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
                            "temp", "v", "pres", "humid", "prep", "snow"],
                features_2=['u'],
                fdate=train_fdate, tdate=train_tdate,
                sample_size=sample_size, output_size=output_size,
                train_valid_ratio=0.8)

            # first mkdir for seasonality outputs
            Path.mkdir(png_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(svg_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(data_dir / "seasonality", parents=True, exist_ok=True)

            # fit & transform (seasonality)
            # without seasonality
            # train_valid_set.preprocess()
            # with seasonality
            train_valid_set.preprocess()
            # save seasonality index-wise
            # train_valid_set.broadcast_seasonality()

            train_valid_set.plot_fused_seasonality(data_dir / "seasonality_fused",
                                                   png_dir / "seasonality_fused",
                                                   svg_dir / "seasonality_fused")

    # for target in targets:
    for station_name in station_names:
        for target in train_features_periodic:
            print("Analyze " + target + "...")

            # if not Path("/input/python/input_jongro_imputed_hourly_pandas.csv").is_file():
            #     # load imputed result
            #     _df_h = load_imputed(HOURLY_DATA_PATH)
            #     df_h = _df_h.query('stationCode == "' +
            #                        str(SEOUL_STATIONS[station_name]) + '"')
            #     df_h.to_csv("/input/python/input_jongro_imputed_hourly_pandas.csv")

            _df_seoul = pd.read_csv(HOURLY_DATA_PATH, index_col=[0, 1], parse_dates=[0])
            # filter by station_name
            _df_station = _df_seoul.query('stationCode == "' +
                                          str(SEOUL_STATIONS[station_name]) + '"')
            _df_station.reset_index(level='stationCode', drop=True, inplace=True)
            df_sea_h = _df_station

            output_dir = Path("/mnt/data/STATS_ANALYSIS_" + str(sample_size) + "/" +
                              station_name + "/" + target + "/")
            Path.mkdir(output_dir, parents=True, exist_ok=True)
            data_dir = output_dir / Path('csv/')
            png_dir = output_dir / Path('png/')
            svg_dir = output_dir / Path('svg/')
            Path.mkdir(data_dir, parents=True, exist_ok=True)
            Path.mkdir(png_dir, parents=True, exist_ok=True)
            Path.mkdir(svg_dir, parents=True, exist_ok=True)

            hparams = Namespace(nhead=8, head_dim=128, d_feedforward=256, num_layers=3,
                                learning_rate=learning_rate, batch_size=batch_size)

            # prepare dataset
            train_valid_set = MultivariateRNNMeanSeasonalityDataset(
                station_name=station_name, target=target,
                filepath="/input/python/input_seoul_imputed_hourly_pandas.csv",
                features=train_features,
                features_1=train_features_nonperiodic,
                features_2=train_features_periodic,
                fdate=train_fdate, tdate=train_tdate,
                sample_size=sample_size, output_size=output_size,
                train_valid_ratio=0.8)

            # first mkdir for seasonality outputs
            Path.mkdir(png_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(svg_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(data_dir / "seasonality", parents=True, exist_ok=True)

            # fit & transform (seasonality)
            # without seasonality
            # train_valid_set.preprocess()
            # with seasonality
            train_valid_set.preprocess()
            # save seasonality index-wise
            train_valid_set.broadcast_seasonality()

            test_set = MultivariateRNNMeanSeasonalityDataset(
                station_name=station_name, target=target,
                filepath="/input/python/input_seoul_imputed_hourly_pandas.csv",
                features=train_features,
                features_1=train_features_nonperiodic,
                features_2=train_features_periodic,
                fdate=test_fdate, tdate=test_tdate,
                sample_size=sample_size, output_size=output_size,
                scaler_X=train_valid_set.scaler_X,
                scaler_Y=train_valid_set.scaler_Y)
            test_set.transform()
            # save seasonality index-wise
            test_set.broadcast_seasonality()

            def run_01_CLT():
                """1. Is the data sufficient?

                * Central Limit Theorem: compare the distribution of sample means
                  and sample standard deviations with the population statistics
                * Is it normal or log-normal?
                """
                _data_dir = data_dir / "01-CLT"
                _png_dir = png_dir / "01-CLT"
                _svg_dir = svg_dir / "01-CLT"
                Path.mkdir(_data_dir, parents=True, exist_ok=True)
                Path.mkdir(_png_dir, parents=True, exist_ok=True)
                Path.mkdir(_svg_dir, parents=True, exist_ok=True)

                # per-sample means of decomposed (d) and raw (r) targets
                # len(train_valid_set) == 34895
                means_d = np.zeros(len(train_valid_set))
                means_r = np.zeros(len(train_valid_set))

                for i, s in enumerate(train_valid_set):
                    x, x_1d, x_sa, x_sw, x_sh, \
                        y, y_raw, y_sa, y_sw, y_sh, y_date = s
                    if len(y) != output_size:
                        break
                    # it's not random sampling
                    means_d[i] = np.mean(y)
                    means_r[i] = np.mean(y_raw)

                # means of random subsets drawn from the per-sample means
                n_subsample = 100
                nchoice = 64
                sample_means_d = np.zeros(n_subsample)
                sample_means_r = np.zeros(n_subsample)
                for i in range(n_subsample):
                    dr = np.random.choice(means_d, size=nchoice)
                    sample_means_d[i] = np.mean(dr)
                    rr = np.random.choice(means_r, size=nchoice)
                    sample_means_r[i] = np.mean(rr)

                print("Sample & Pop. Mean (decomposed) : ",
                      np.mean(sample_means_d), np.mean(means_d))
                print("Sample & Pop. STD (decomposed) : ",
                      np.std(sample_means_d) / sqrt(nchoice), np.std(means_d))
                print("Sample & Pop. Mean (raw) : ",
                      np.mean(sample_means_r), np.mean(means_r))
                print("Sample & Pop. STD (raw) : ",
                      np.std(sample_means_r) / sqrt(nchoice), np.std(means_r))

            def run_02_MFDFA():
                print("MF-DFA..")
                _data_dir = data_dir / "02-LRD-MFDFA"
                _png_dir = png_dir / "02-LRD-MFDFA"
                _svg_dir = svg_dir / "02-LRD-MFDFA"
                Path.mkdir(_data_dir, parents=True, exist_ok=True)
                Path.mkdir(_png_dir, parents=True, exist_ok=True)
                Path.mkdir(_svg_dir, parents=True, exist_ok=True)

                # Define unbounded process
                Xs = train_valid_set.ys
                Xs_raw = train_valid_set.ys_raw

                n_lag = 100
                large_s = int(n_lag * 0.3)
                org_lag = np.unique(np.logspace(0.5, 3, n_lag).astype(int))

                # list of powers q; q == 2 is the standard square-root based average
                q_list = [-6, -2, -3, 2, 3, 6]

                def fluctuation_frames(series, order):
                    """Run MF-DFA and return (lag, F_q(s) frame, F_q(s)/sqrt(s) frame)."""
                    lag, dfa = MFDFA.MFDFA(series, lag=org_lag, q=q_list, order=order)
                    norm_dfa = dfa / np.sqrt(lag)[:, None]
                    df = pd.DataFrame({str(q_list[i]): dfa[:, i] for i in range(dfa.shape[1])})
                    df['s'] = lag
                    df_norm = pd.DataFrame({str(q_list[i]): norm_dfa[:, i]
                                            for i in range(dfa.shape[1])})
                    df_norm['s'] = lag
                    return lag, df, df_norm

                def plot_fluctuations(lag, df_plot, df_fit, ylabel, csv_path, fig_stem):
                    """Plot F_q(s) on log-log axes with power-law fits for the extreme q."""
                    plt.figure()
                    plt.clf()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s', y='value', hue='q',
                                 data=pd.melt(df_plot, id_vars=['s'], var_name='q'))

                    # fit log10 F_q(s) ~ h(q) log10 s over large scales for the first and
                    # last q; convert() maps the fit back to the unscaled domain so that
                    # coef[1] is the slope h(q)
                    for q in (q_list[0], q_list[-1]):
                        qfit = np.polynomial.Polynomial.fit(
                            np.log10(lag)[large_s:],
                            np.log10(df_fit[str(q)].to_numpy())[large_s:], 1).convert()
                        qfit_vals = np.polynomial.polynomial.polyval(np.log10(lag), qfit.coef)
                        plt.plot(lag, np.power(10, qfit_vals),
                                 label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(q, qfit.coef[1]),
                                 alpha=0.7, color='k', linestyle='dashed')

                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    for i in range(len(q_list)):
                        leg_labels[i] = r'h({{{0}}})'.format(q_list[i])
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(ylabel)
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df_plot.set_index('s').to_csv(csv_path)
                    plt.savefig(_png_dir / (fig_stem + '.png'), dpi=600)
                    plt.savefig(_svg_dir / (fig_stem + '.svg'))
                    plt.close()

                # the order of the polynomial detrending
                for order in [1, 2, 3]:
                    # seasonality-removed residuals
                    lag, df, df_norm = fluctuation_frames(Xs[target].to_numpy(), order)
                    plot_fluctuations(lag, df_norm, df, r'$F^{(n)}(s)/\sqrt{s}$',
                                      _data_dir / ('MFDFA_norm_res_o' + str(order) + '.csv'),
                                      'MFDFA_norm_res_o' + str(order))
                    plot_fluctuations(lag, df, df, r'$F^{(n)}(s)$',
                                      _data_dir / ('MFDFA_res_o' + str(order) + '.csv'),
                                      'MFDFA_res_o' + str(order))

                    # raw series
                    lag, df, df_norm = fluctuation_frames(Xs_raw[target].to_numpy(), order)
                    plot_fluctuations(lag, df_norm, df, r'$F^{(n)}(s)/\sqrt{s}$',
                                      _data_dir / ('MFDFA_norm_o' + str(order) + '.csv'),
                                      'MFDFA_norm_o' + str(order))
                    plot_fluctuations(lag, df, df, r'$F^{(n)}(s)$',
                                      _data_dir / ('MFDFA_o' + str(order) + '.csv'),
                                      'MFDFA_o' + str(order))

            def run_01_DFA():
                print("DFA..")
                _data_dir = data_dir / "01-LRD-DFA"
                _png_dir = png_dir / "01-LRD-DFA"
                _svg_dir = svg_dir / "01-LRD-DFA"
                Path.mkdir(_data_dir, parents=True, exist_ok=True)
                Path.mkdir(_png_dir, parents=True, exist_ok=True)
                Path.mkdir(_svg_dir, parents=True, exist_ok=True)

                # Define unbounded process
                Xs = train_valid_set.ys
                Xs_raw = train_valid_set.ys_raw

                n_lag = 100
                large_s = int(n_lag * 0.3)
                org_lag = np.unique(np.logspace(0.5, 3, n_lag).astype(int))

                # q == 2 -> standard square-root based average (ordinary DFA)
                q_list = [2]

                def model_func(x, A, B):
                    return A * np.power(x, B)

                def fluctuation_frames(series, order):
                    """Run DFA (q = 2) and return (lag, F(s) frame, F(s)/sqrt(s) frame)."""
                    lag, dfa = MFDFA.MFDFA(series, lag=org_lag, q=q_list, order=order)
                    norm_dfa = dfa / np.sqrt(lag)[:, None]
                    df = pd.DataFrame({str(q_list[i]): dfa[:, i] for i in range(dfa.shape[1])})
                    df['s'] = lag
                    df_norm = pd.DataFrame({str(q_list[i]): norm_dfa[:, i]
                                            for i in range(dfa.shape[1])})
                    df_norm['s'] = lag
                    return lag, df, df_norm

                def plot_dfa(lag, df_plot, ylabel, csv_path, fig_stem):
                    """Plot F(s) with the h(2) = 0.5 reference and a power-law fit at large scales."""
                    plt.figure()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s', y='value', hue='q',
                                 data=pd.melt(df_plot, id_vars=['s'], var_name='q'))

                    # reference line for an uncorrelated process, h(2) = 0.5
                    base_lines = np.ones(len(lag)) * 10.0**(-2) * np.power(lag, 0.5)
                    plt.plot(lag, base_lines, label=r'$h(2) = 0.5$',
                             alpha=0.7, color='tab:green', linestyle='dashed')

                    # fit A * s^h(2) at large scales; gamma = 2 * (1 - h(2))
                    p0 = (1., 1.e-5)
                    popt, _pcov = sp.optimize.curve_fit(
                        model_func, lag[large_s:],
                        df_plot[str(q_list[-1])].to_numpy()[large_s:], p0)
                    coef_annot = popt[1]
                    gamma_annot = 2.0 * (1.0 - popt[1])
                    estimated = model_func(lag, popt[0], popt[1])
                    plt.plot(lag, estimated,
                             label=r'$h(2) = {{{0:.2f}}}, \gamma = {{{1:.2f}}}$'.format(
                                 coef_annot, gamma_annot),
                             alpha=0.7, color='tab:orange', linestyle='dashed')

                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    leg_labels[0] = r'$h(2)$'
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(ylabel)
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df_plot.set_index('s').to_csv(csv_path)
                    plt.savefig(_png_dir / (fig_stem + '.png'), dpi=600)
                    plt.savefig(_svg_dir / (fig_stem + '.svg'))
                    plt.close()

                # the order of the polynomial detrending
                for order in [1, 2, 3]:
                    # seasonality-removed residuals
                    lag, df, df_norm = fluctuation_frames(Xs[target].to_numpy(), order)
                    plot_dfa(lag, df_norm, r'$F^{(n)}(s)/\sqrt{s}$',
                             _data_dir / ('DFA_norm_res_o' + str(order) + '.csv'),
                             'DFA_norm_res_o' + str(order))
                    plot_dfa(lag, df, r'$F^{(n)}(s)$',
                             _data_dir / ('DFA_res_o' + str(order) + '.csv'),
                             'DFA_res_o' + str(order))

                    # raw series
                    lag, df, df_norm = fluctuation_frames(Xs_raw[target].to_numpy(), order)
                    plot_dfa(lag, df_norm, r'$F^{(n)}(s)/\sqrt{s}$',
                             _data_dir / ('DFA_norm_o' + str(order) + '.csv'),
                             'DFA_norm_o' + str(order))
                    plot_dfa(lag, df, r'$F^{(n)}(s)$',
                             _data_dir / ('DFA_o' + str(order) + '.csv'),
                             'DFA_o' + str(order))

            run_01_DFA()
            run_02_MFDFA()

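
# For reference: the exponent h(2) annotated in the DFA figures above is the log-log slope
# of the fluctuation function F(s) at large scales, and the plots report the implied
# long-range-dependence parameter as gamma = 2 * (1 - h(2)). The sketch below shows that
# estimate in isolation, assuming the (lag, dfa) arrays returned by MFDFA.MFDFA as used
# above, with the q = 2 column first; `_estimate_h2` is an illustrative helper, not part
# of the pipeline.
def _estimate_h2(lag, dfa, min_idx=0):
    """Estimate h(2) and gamma = 2 * (1 - h(2)) from a DFA fluctuation function (sketch)."""
    fit = np.polynomial.Polynomial.fit(np.log10(lag)[min_idx:],
                                       np.log10(dfa[:, 0])[min_idx:], 1).convert()
    h2 = fit.coef[1]          # slope of log10 F(s) vs. log10 s
    gamma = 2.0 * (1.0 - h2)  # correlation-decay exponent implied by h(2)
    return h2, gamma
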