Example #1
def stats_ou(station_name="종로구"):
    print("Data loading start...")
    _df_h = data.load_imputed(HOURLY_DATA_PATH)
    df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) +
                       '"')

    if station_name == '종로구' and \
        not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
        # cache the imputed result for Jongno-gu so later runs can reuse it

        df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")

    print("Data loading complete")
    targets = ["PM10", "PM25"]
    intT = {"PM10": 19.01883611948326, "PM25": 20.4090132600871}
    sample_size = 48
    output_size = 24
    train_fdate = dt.datetime(2008, 1, 5, 0).astimezone(seoultz)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(seoultz)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(seoultz)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(seoultz)
    # consecutive dates between train and test
    assert train_tdate + dt.timedelta(hours=1) == test_fdate

    for target in targets:
        output_dir = Path('/mnt/data/OU/' + station_name + "/" + target + "/")
        png_dir = output_dir / Path('png/')
        svg_dir = output_dir / Path('svg/')
        data_dir = output_dir / Path('csv/')
        Path.mkdir(data_dir, parents=True, exist_ok=True)
        Path.mkdir(png_dir, parents=True, exist_ok=True)
        Path.mkdir(svg_dir, parents=True, exist_ok=True)

        # numeric_pipeline_X = Pipeline(
        #     [('seasonalitydecompositor',
        #         data.SeasonalityDecompositor_AWH(smoothing=True, smoothingFrac=0.05)),
        #      ('standardtransformer', data.StandardScalerWrapper(scaler=StandardScaler()))])

        # scaler = ColumnTransformer(
        #     transformers=[
        #         ('num', numeric_pipeline_X, [target])])

        # prepare dataset
        train_set = data.UnivariateRNNMeanSeasonalityDataset(
            station_name=station_name,
            target=target,
            filepath=HOURLY_DATA_PATH,
            features=[
                "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v",
                "pres", "humid", "prep", "snow"
            ],
            features_1=[
                "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "v", "pres",
                "humid", "prep", "snow"
            ],
            features_2=['u'],
            fdate=train_fdate,
            tdate=train_tdate,
            sample_size=sample_size,
            output_size=output_size,
            train_valid_ratio=0.8)

        train_set.preprocess()

        test_set = data.UnivariateRNNMeanSeasonalityDataset(
            station_name=station_name,
            target=target,
            filepath=HOURLY_DATA_PATH,
            features=[
                "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v",
                "pres", "humid", "prep", "snow"
            ],
            features_1=[
                "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "v", "pres",
                "humid", "prep", "snow"
            ],
            features_2=['u'],
            fdate=test_fdate,
            tdate=test_tdate,
            sample_size=sample_size,
            output_size=output_size,
            scaler_X=train_set.scaler_X,
            scaler_Y=train_set.scaler_Y)

        test_set.transform()
        test_set.plot_seasonality(data_dir, png_dir, svg_dir)

        df_train = train_set.ys.copy()
        df_test = test_set.ys.loc[test_fdate:test_tdate, :].copy()
        df_test_org = test_set.ys_raw.loc[test_fdate:test_tdate, :].copy()

        print("Simulate by Ornstein–Uhlenbeck process for " + target + "...")

        def run_OU(_intT):
            df_obs = mw_df(df_test_org, target, output_size, test_fdate,
                           test_tdate)
            dates = df_obs.index
            df_sim = sim_OU(df_test, dates, target, np.mean(df_test.to_numpy()), np.std(df_test.to_numpy()),\
                            _intT[target], test_set.scaler_Y, output_size)
            assert df_obs.shape == df_sim.shape

            # join df
            plot_OU(df_sim, df_obs, target, data_dir, png_dir, svg_dir,
                    test_fdate, test_tdate, station_name, output_size)
            # save to csv
            csv_fname = "df_test_obs.csv"
            df_obs.to_csv(data_dir / csv_fname)

            csv_fname = "df_test_sim.csv"
            df_sim.to_csv(data_dir / csv_fname)

        run_OU(intT)
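The helpers mw_df, sim_OU, and plot_OU are defined elsewhere in the project. As a rough illustration only, here is a minimal sketch (an assumption, not the project's sim_OU) of a discretized Ornstein–Uhlenbeck update via Euler–Maruyama, where mu and sigma play the role of the mean and standard deviation of the deseasonalized series and tau corresponds to intT[target] in hours:

import numpy as np

def ou_step(x, mu, sigma, tau, dt=1.0, rng=None):
    # dx = -(x - mu) / tau * dt + sigma * sqrt(2 * dt / tau) * dW
    rng = np.random.default_rng() if rng is None else rng
    drift = -(x - mu) / tau * dt
    diffusion = sigma * np.sqrt(2.0 * dt / tau) * rng.standard_normal()
    return x + drift + diffusion

# hypothetical usage: roll a 24-hour path from an initial value
x, path = 0.0, []
for _ in range(24):
    x = ou_step(x, mu=0.0, sigma=1.0, tau=19.0)
    path.append(x)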
Example #2
def ml_mlp_mul_ms(station_name="종로구"):
    print("Start Multivariate MLP Mean Seasonality Decomposition (MSE) Model")
    targets = ["PM10", "PM25"]
    # targets = ["SO2", "CO", "O3", "NO2", "PM10", "PM25",
    #                   "temp", "u", "v", "pres", "humid", "prep", "snow"]
    # 24*14 = 336
    #sample_size = 336
    sample_size = 48
    output_size = 24
    # To debug, set fast_dev_run = True and use a small n_trials
    fast_dev_run = False
    n_trials = 128
    # fast_dev_run = True
    # n_trials = 1

    # Hyperparameters
    epoch_size = 500
    batch_size = 64
    learning_rate = 1e-3

    # Blocked Cross Validation
    # neglect small overlap between train_dates and valid_dates
    # 11y = ((2y, 0.5y), (2y, 0.5y), (2y, 0.5y), (2.5y, 1y))
    train_dates = [(dt.datetime(2008, 1, 4, 1).astimezone(SEOULTZ),
                    dt.datetime(2009, 12, 31, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2010, 7, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2012, 6, 30, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2013, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2014, 12, 31, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))]
    valid_dates = [(dt.datetime(2010, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2010, 6, 30, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2012, 7, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2012, 12, 31, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2015, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2015, 6, 30, 23).astimezone(SEOULTZ)),
                   (dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ),
                    dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))]
    train_valid_fdate = dt.datetime(2008, 1, 3, 1).astimezone(SEOULTZ)
    train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)

    # Debug
    if fast_dev_run:
        train_dates = [(dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ),
                        dt.datetime(2017, 12, 31, 23).astimezone(SEOULTZ))]
        valid_dates = [(dt.datetime(2018, 1, 1, 0).astimezone(SEOULTZ),
                        dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ))]
        train_valid_fdate = dt.datetime(2015, 7, 1, 0).astimezone(SEOULTZ)
        train_valid_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)

    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # check date range assumption
    assert len(train_dates) == len(valid_dates)
    for i, (td, vd) in enumerate(zip(train_dates, valid_dates)):
        assert vd[0] > td[1]
    assert test_fdate > train_dates[-1][1]
    assert test_fdate > valid_dates[-1][1]

    train_features = [
        "SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd", "wind_cdir",
        "wind_sdir", "pres", "humid", "prep"
    ]
    train_features_periodic = [
        "SO2", "CO", "NO2", "PM10", "PM25", "temp", "wind_spd", "wind_cdir",
        "wind_sdir", "pres", "humid"
    ]
    train_features_nonperiodic = ["prep"]

    for target in targets:
        print("Training " + target + "...")
        output_dir = Path(
            f"/mnt/data/MLPMSMultivariate/{station_name}/{target}/")
        Path.mkdir(output_dir, parents=True, exist_ok=True)
        model_dir = output_dir / "models"
        Path.mkdir(model_dir, parents=True, exist_ok=True)
        log_dir = output_dir / "log"
        Path.mkdir(log_dir, parents=True, exist_ok=True)

        _df_h = data.load_imputed(HOURLY_DATA_PATH)
        df_h = _df_h.query('stationCode == "' +
                           str(SEOUL_STATIONS[station_name]) + '"')

        if station_name == '종로구' and \
            not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
            # cache the imputed result for Jongno-gu so later runs can reuse it

            df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")

        # construct dataset for seasonality
        print("Construct Train/Validation Sets...", flush=True)
        train_valid_dataset = construct_dataset(train_valid_fdate,
                                                train_valid_tdate,
                                                filepath=HOURLY_DATA_PATH,
                                                station_name=station_name,
                                                target=target,
                                                sample_size=sample_size,
                                                output_size=output_size,
                                                transform=False)
        # compute seasonality
        train_valid_dataset.preprocess()

        # For Blocked Cross Validation:
        # load datasets for the given date ranges and transform them with the
        # scalers fitted on train_valid_set; all datasets are kept in a tuple
        print("Construct Training Sets...", flush=True)
        train_datasets = tuple(
            construct_dataset(td[0],
                              td[1],
                              scaler_X=train_valid_dataset.scaler_X,
                              scaler_Y=train_valid_dataset.scaler_Y,
                              filepath=HOURLY_DATA_PATH,
                              station_name=station_name,
                              target=target,
                              sample_size=sample_size,
                              output_size=output_size,
                              features=train_features,
                              features_periodic=train_features_periodic,
                              features_nonperiodic=train_features_nonperiodic,
                              transform=True) for td in train_dates)

        print("Construct Validation Sets...", flush=True)
        valid_datasets = tuple(
            construct_dataset(vd[0],
                              vd[1],
                              scaler_X=train_valid_dataset.scaler_X,
                              scaler_Y=train_valid_dataset.scaler_Y,
                              filepath=HOURLY_DATA_PATH,
                              station_name=station_name,
                              target=target,
                              sample_size=sample_size,
                              output_size=output_size,
                              features=train_features,
                              features_periodic=train_features_periodic,
                              features_nonperiodic=train_features_nonperiodic,
                              transform=True) for vd in valid_dates)

        # just single test set
        print("Construct Test Sets...", flush=True)
        test_dataset = construct_dataset(
            test_fdate,
            test_tdate,
            scaler_X=train_valid_dataset.scaler_X,
            scaler_Y=train_valid_dataset.scaler_Y,
            filepath=HOURLY_DATA_PATH,
            station_name=station_name,
            target=target,
            sample_size=sample_size,
            output_size=output_size,
            features=train_features,
            features_periodic=train_features_periodic,
            features_nonperiodic=train_features_nonperiodic,
            transform=True)

        # convert tuple of datasets to ConcatDataset
        train_dataset = ConcatDataset(train_datasets)
        val_dataset = ConcatDataset(valid_datasets)

        # num_layers == number of hidden layers
        hparams = Namespace(num_layers=1,
                            layer_size=128,
                            learning_rate=learning_rate,
                            batch_size=batch_size)

        def objective(trial):
            model = BaseMLPModel(
                trial=trial,
                hparams=hparams,
                input_size=sample_size * len(train_features),
                sample_size=sample_size,
                output_size=output_size,
                station_name=station_name,
                target=target,
                features=train_features,
                features_periodic=train_features_periodic,
                features_nonperiodic=train_features_nonperiodic,
                train_dataset=train_dataset,
                val_dataset=val_dataset,
                test_dataset=test_dataset,
                scaler_X=train_valid_dataset.scaler_X,
                scaler_Y=train_valid_dataset.scaler_Y,
                output_dir=output_dir)

            # most basic trainer, uses good defaults
            trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                              precision=32,
                              min_epochs=1,
                              max_epochs=20,
                              default_root_dir=output_dir,
                              fast_dev_run=fast_dev_run,
                              logger=True,
                              checkpoint_callback=False,
                              callbacks=[
                                  PyTorchLightningPruningCallback(
                                      trial, monitor="valid/MSE")
                              ])

            trainer.fit(model)

            # Don't Log
            # hyperparameters = model.hparams
            # trainer.logger.log_hyperparams(hyperparameters)

            return trainer.callback_metrics.get("valid/MSE")

        if n_trials > 1:
            study = optuna.create_study(direction="minimize")
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 8,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 64,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 8,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 1.3,
                'num_layers': 12,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 0.7,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            study.enqueue_trial({
                'sigma': 2.0,
                'num_layers': 4,
                'layer_size': 32,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            })
            # timeout = 3600*36 = 36h
            study.optimize(objective, n_trials=n_trials, timeout=3600 * 36)

            trial = study.best_trial

            print("  Value: ", trial.value)

            print("  Params: ")
            for key, value in trial.params.items():
                print("    {}: {}".format(key, value))
            print("sample_size : ", sample_size)
            print("output_size : ", output_size)

            # plot optimization results
            fig_cont1 = optv.plot_contour(study,
                                          params=['num_layers', 'layer_size'])
            fig_cont1.write_image(
                str(output_dir / "contour_num_layers_layer_size.png"))
            fig_cont1.write_image(
                str(output_dir / "contour_num_layers_layer_size.svg"))

            fig_edf = optv.plot_edf(study)
            fig_edf.write_image(str(output_dir / "edf.png"))
            fig_edf.write_image(str(output_dir / "edf.svg"))

            fig_iv = optv.plot_intermediate_values(study)
            fig_iv.write_image(str(output_dir / "intermediate_values.png"))
            fig_iv.write_image(str(output_dir / "intermediate_values.svg"))

            fig_his = optv.plot_optimization_history(study)
            fig_his.write_image(str(output_dir / "opt_history.png"))
            fig_his.write_image(str(output_dir / "opt_history.svg"))

            fig_pcoord = optv.plot_parallel_coordinate(
                study, params=['num_layers', 'layer_size'])
            fig_pcoord.write_image(str(output_dir / "parallel_coord.png"))
            fig_pcoord.write_image(str(output_dir / "parallel_coord.svg"))

            fig_slice = optv.plot_slice(study,
                                        params=['num_layers', 'layer_size'])
            fig_slice.write_image(str(output_dir / "slice.png"))
            fig_slice.write_image(str(output_dir / "slice.svg"))

            # set hparams with optimized values
            hparams.num_layers = trial.params['num_layers']
            hparams.layer_size = trial.params['layer_size']

            dict_hparams = copy.copy(vars(hparams))
            dict_hparams["sample_size"] = sample_size
            dict_hparams["output_size"] = output_size
            with open(output_dir / 'hparams.json', 'w') as f:
                print(dict_hparams, file=f)
            with open(output_dir / 'hparams.csv', 'w') as f:
                print(pd.DataFrame.from_dict(dict_hparams, orient='index'),
                      file=f)

        model = BaseMLPModel(hparams=hparams,
                             input_size=sample_size * len(train_features),
                             sample_size=sample_size,
                             output_size=output_size,
                             station_name=station_name,
                             target=target,
                             features=train_features,
                             features_periodic=train_features_periodic,
                             features_nonperiodic=train_features_nonperiodic,
                             train_dataset=train_dataset,
                             val_dataset=val_dataset,
                             test_dataset=test_dataset,
                             scaler_X=train_valid_dataset.scaler_X,
                             scaler_Y=train_valid_dataset.scaler_Y,
                             output_dir=output_dir)

        # record input
        for i, _train_set in enumerate(train_datasets):
            _train_set.to_csv(
                model.data_dir /
                ("df_trainset_{0}_".format(str(i).zfill(2)) + target + ".csv"))
        for i, _valid_set in enumerate(valid_datasets):
            _valid_set.to_csv(
                model.data_dir /
                ("df_validset_{0}_".format(str(i).zfill(2)) + target + ".csv"))
        train_valid_dataset.to_csv(model.data_dir /
                                   ("df_trainvalidset_" + target + ".csv"))
        test_dataset.to_csv(model.data_dir / ("df_testset_" + target + ".csv"))

        checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
            model_dir, "train_{epoch}_{valid/MSE:.2f}"),
                                                           monitor="valid/MSE",
                                                           period=10)

        early_stop_callback = EarlyStopping(monitor='valid/MSE',
                                            min_delta=0.001,
                                            patience=30,
                                            verbose=True,
                                            mode='min')

        log_version = dt.datetime.now().strftime("%y%m%d-%H-%M")
        loggers = [ \
            TensorBoardLogger(log_dir, version=log_version),
            CSVLogger(log_dir, version=log_version)]

        # most basic trainer, uses good defaults
        trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                          precision=32,
                          min_epochs=1,
                          max_epochs=epoch_size,
                          default_root_dir=output_dir,
                          fast_dev_run=fast_dev_run,
                          logger=loggers,
                          log_every_n_steps=5,
                          flush_logs_every_n_steps=10,
                          callbacks=[early_stop_callback],
                          checkpoint_callback=checkpoint_callback)

        trainer.fit(model)

        # run test set
        trainer.test(ckpt_path=None)

        shutil.rmtree(model_dir)
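BaseMLPModel is expected to read the searched hyperparameters from the Optuna trial; the enqueued dictionaries above imply that num_layers, layer_size, and a sigma parameter form the search space. A minimal sketch of such a mapping, with guessed ranges (an assumption, not the actual model internals):

import optuna

def suggest_hparams(trial: optuna.trial.Trial):
    # names match the keys enqueued above; the ranges are assumptions
    return {
        "num_layers": trial.suggest_int("num_layers", 1, 12),
        "layer_size": trial.suggest_int("layer_size", 8, 128),
        # 'sigma' is presumably a loss- or noise-related scale
        "sigma": trial.suggest_float("sigma", 0.5, 2.0),
    }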
Example #3
def stats_msea_acf(station_name="종로구"):
    print("Data loading start...")
    if Path("/input/python/input_jongro_imputed_hourly_pandas.csv").is_file():
        df_d = data.load_imputed(
            "/input/python/input_jongro_imputed_daily_pandas.csv")
        df_h = data.load_imputed(
            "/input/python/input_jongro_imputed_hourly_pandas.csv")
    else:
        # load imputed result
        _df_d = data.load_imputed(DAILY_DATA_PATH)
        _df_h = data.load_imputed(HOURLY_DATA_PATH)
        df_d = _df_d.query('stationCode == "' +
                           str(SEOUL_STATIONS[station_name]) + '"')
        df_h = _df_h.query('stationCode == "' +
                           str(SEOUL_STATIONS[station_name]) + '"')

        df_d.to_csv("/input/python/input_jongro_imputed_daily_pandas.csv")
        df_h.to_csv("/input/python/input_jongro_imputed_hourly_pandas.csv")

    print("Data loading complete")
    targets = ["PM10", "PM25"]
    sample_size = 48
    output_size = 24
    epoch_size = 300
    batch_size = 32

    for target in targets:
        dir_prefix = Path("/mnt/data/msea_acf/" + station_name + "/" + target +
                          "/")
        Path.mkdir(dir_prefix, parents=True, exist_ok=True)
        target_sea_h_path = Path("/input/msea/weekly/" + station_name + "/" +
                                 target + "/df_" + target + ".csv")

        df_sea_h = pd.read_csv(target_sea_h_path,
                               index_col=[0],
                               parse_dates=[0])

        nlag = 24 * 10

        raw_acf = sm.tsa.acf(df_sea_h[[target + "_raw"]], nlags=nlag)
        ys_acf = sm.tsa.acf(df_sea_h[[target + "_ys"]], nlags=nlag)
        yr_acf = sm.tsa.acf(df_sea_h[[target + "_yr"]], nlags=nlag)
        ds_acf = sm.tsa.acf(df_sea_h[[target + "_ds"]], nlags=nlag)
        dr_acf = sm.tsa.acf(df_sea_h[[target + "_dr"]], nlags=nlag)
        ws_acf = sm.tsa.acf(df_sea_h[[target + "_ws"]], nlags=nlag)
        wr_acf = sm.tsa.acf(df_sea_h[[target + "_wr"]], nlags=nlag)

        raw_acf_sr = pd.Series(raw_acf, name="raw")
        ys_acf_sr = pd.Series(ys_acf, name="ys")
        yr_acf_sr = pd.Series(yr_acf, name="yr")
        ds_acf_sr = pd.Series(ds_acf, name="ds")
        dr_acf_sr = pd.Series(dr_acf, name="dr")
        ws_acf_sr = pd.Series(ws_acf, name="ws")
        wr_acf_sr = pd.Series(wr_acf, name="wr")

        acf_df = merge_acfs([
            raw_acf_sr, ys_acf_sr, yr_acf_sr, ds_acf_sr, dr_acf_sr, ws_acf_sr,
            wr_acf_sr
        ], ["raw", "ys", "yr", "ds", "dr", "ws", "wr"])

        plot_acf(ys_acf_sr, target, nlag, "ys", "Annual Seasonality",
                 dir_prefix)
        plot_acf(yr_acf_sr, target, nlag, "yr", "Annual Residual", dir_prefix)
        plot_acf(ds_acf_sr, target, nlag, "ds", "Daily Seasonality",
                 dir_prefix)
        plot_acf(dr_acf_sr, target, nlag, "dr", "Daily Residual", dir_prefix)
        plot_acf(ws_acf_sr, target, nlag, "ws", "Weekly Seasonality",
                 dir_prefix)
        plot_acf(wr_acf_sr, target, nlag, "wr", "Weekly Residual", dir_prefix)
        plot_acfs(acf_df, ["raw", "yr", "dr", "wr"], target, nlag,
                  "Autocorrelation", dir_prefix)
        plot_acfs(acf_df, ["raw", "yr"], target, nlag, "Autocorrelation",
                  dir_prefix)
        plot_acfs(acf_df, ["raw", "dr"], target, nlag, "Autocorrelation",
                  dir_prefix)
        plot_acfs(acf_df, ["raw", "wr"], target, nlag, "Autocorrelation",
                  dir_prefix)
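merge_acfs and the plot_acf/plot_acfs helpers are defined elsewhere in the project. A minimal sketch of what merge_acfs could look like, assuming it simply aligns the named ACF Series column-wise into a single DataFrame indexed by lag:

import pandas as pd

def merge_acfs(series_list, names):
    # column-wise concat; each Series covers lags 0..nlag
    acf_df = pd.concat(series_list, axis=1)
    acf_df.columns = names
    acf_df.index.name = "lag"
    return acf_df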
Example #4
def stats_arima(station_name="종로구"):
    print("Data loading start...", flush=True)
    _df_h = data.load_imputed(HOURLY_DATA_PATH)
    df_h = _df_h.query('stationCode == "' + str(SEOUL_STATIONS[station_name]) +
                       '"')

    if station_name == '종로구' and \
        not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
        # cache the imputed result for Jongno-gu so later runs can reuse it

        df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")

    print("Data loading complete", flush=True)
    targets = ["PM10", "PM25"]

    # p (1, 0, 0) ~ (3, 0, 0), (4, 0, 0) ~ (6, 0, 0), (7, 0, 0) ~ (9, 0, 0),
    # p (1, 0, 1) ~ (3, 0, 1), (4, 0, 1) ~ (6, 0, 1), (7, 0, 1) ~ (9, 0, 1),
    # p (1, 0, 2) ~ (3, 0, 2), (4, 0, 2) ~ (6, 0, 2), (7, 0, 2) ~ (9, 0, 2),
    # orders1 = [(_p, 0, _q) for _q, _p in itertools.product(range(3), range(10)) if not (_p == 0 and _q == 0)]
    # orders2 = [(_p, 1, _q) for _q, _p in itertools.product(range(3), range(10)) if not (_p == 0 and _q == 0)]
    # orders = orders1 + orders2
    # orders = [(_p, 0, _q) for _q, _p in itertools.product([0], range(1, 4)) if not (_p == 0 and _q == 0)]
    # orders = [(_p, 0, _q) for _q, _p in itertools.product([2], range(4, 7)) if not (_p == 0 and _q == 0)]
    # orders = [(_p, 0, _q) for _q, _p in itertools.product(range(0, 48, 6), range(0, 48, 6)) if not (_p == 0 and _q == 0)]
    # orders = [(_p, 0, _q) for _q, _p in itertools.product([0], [24, 48]) if not (_p == 0 and _q == 0)]
    # orders = [(8, 0, 0), (9, 0, 0)]
    orders = [(2, 0, 0), (3, 0, 0)]

    sample_size = 48
    output_size = 24
    train_fdate = dt.datetime(2008, 1, 4, 0).astimezone(SEOULTZ)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)
    # consecutive dates between train and test
    assert train_tdate + dt.timedelta(hours=1) == test_fdate

    for target in targets:
        for order in orders:
            output_dir = Path('/mnt/data/ARIMA_' + str(order) + '/' +
                              station_name + "/" + target + "/")
            png_dir = output_dir / Path('png/')
            svg_dir = output_dir / Path('svg/')
            data_dir = output_dir / Path('csv/')
            Path.mkdir(data_dir, parents=True, exist_ok=True)
            Path.mkdir(png_dir, parents=True, exist_ok=True)
            Path.mkdir(svg_dir, parents=True, exist_ok=True)
            norm_values, norm_maxlog = boxcox(df_h[target])
            norm_target = "norm_" + target

            train_set = data.UnivariateRNNMeanSeasonalityDataset(
                station_name=station_name,
                target=target,
                filepath=HOURLY_DATA_PATH,
                features=[
                    "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v",
                    "pres", "humid", "prep", "snow"
                ],
                features_1=[
                    "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "v",
                    "pres", "humid", "prep", "snow"
                ],
                features_2=['u'],
                fdate=train_fdate,
                tdate=train_tdate,
                sample_size=sample_size,
                output_size=output_size,
                train_valid_ratio=0.8)

            train_set.preprocess()

            # set fdate=test_fdate,
            test_set = data.UnivariateRNNMeanSeasonalityDataset(
                station_name=station_name,
                target=target,
                filepath=HOURLY_DATA_PATH,
                features=[
                    "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v",
                    "pres", "humid", "prep", "snow"
                ],
                features_1=[
                    "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "v",
                    "pres", "humid", "prep", "snow"
                ],
                features_2=['u'],
                fdate=test_fdate,
                tdate=test_tdate,
                sample_size=sample_size,
                output_size=output_size,
                scaler_X=train_set.scaler_X,
                scaler_Y=train_set.scaler_Y)

            test_set.transform()

            df_train = train_set.ys.loc[train_fdate:train_tdate, :].copy()
            df_test = test_set.ys.loc[test_fdate:test_tdate, :].copy()
            df_test_org = test_set.ys_raw.loc[test_fdate:test_tdate, :].copy()

            print("ARIMA " + str(order) + " of " + target + "...", flush=True)

            def run_arima(order):
                df_obs = mw_df(df_test_org, target, output_size, test_fdate,
                               test_tdate)
                dates = df_obs.index

                df_sim = sim_arima(df_train, df_test, dates, target, \
                                   order, test_set.scaler_Y, sample_size, output_size, data_dir)

                assert df_obs.shape == df_sim.shape

                # join df
                plot_arima(df_sim, df_obs, target, order, data_dir, png_dir,
                           svg_dir, test_fdate, test_tdate, station_name,
                           output_size)
                # save to csv
                csv_fname = "df_test_obs.csv"
                df_obs.to_csv(data_dir / csv_fname)

                csv_fname = "df_test_sim.csv"
                df_sim.to_csv(data_dir / csv_fname)

            print("ARIMA " + str(order) + " ...")
            run_arima(order)
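sim_arima and plot_arima are project helpers. As a sketch under stated assumptions (not the project's sim_arima), a single 24-hour forecast with a fixed (p, d, q) order on the deseasonalized series could be written with statsmodels as below; a rolling evaluation would repeat this over every moving window:

from statsmodels.tsa.arima.model import ARIMA

def forecast_one_window(history, order=(2, 0, 0), steps=24):
    # history: 1-D array-like of past deseasonalized target values
    res = ARIMA(history, order=order).fit()
    return res.forecast(steps=steps)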
Example #5
def ml_xgboost(station_name="종로구"):
    print("Start Multivariate XGBoost", flush=True)
    _df_h = data.load_imputed(HOURLY_DATA_PATH)
    df_h = _df_h.query('stationCode == "' +
                        str(SEOUL_STATIONS[station_name]) + '"')

    if station_name == '종로구' and \
        not Path("/input/python/input_jongno_imputed_hourly_pandas.csv").is_file():
        # cache the imputed result for Jongno-gu so later runs can reuse it

        df_h.to_csv("/input/python/input_jongno_imputed_hourly_pandas.csv")

    print("Data loading complete", flush=True)
    targets = ["PM10", "PM25"]

    features=["SO2", "CO", "O3", "NO2", "PM10", "PM25",
            "temp", "wind_spd", "wind_cdir", "wind_sdir",
            "pres", "humid", "prep"]
    features_periodic=["SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp",
                        "wind_spd", "wind_cdir", "wind_sdir", "pres", "humid"]
    features_nonperiodic=["prep"]

    # use one step input
    sample_size = 1
    output_size = 24
    train_fdate = dt.datetime(2008, 1, 3, 0).astimezone(SEOULTZ)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # consecutive dates between train and test
    assert train_tdate + dt.timedelta(hours=1) == test_fdate

    # check date range assumption
    assert test_tdate > train_fdate
    assert test_fdate > train_tdate

    for target in targets:
        train_set = data.MultivariateMeanSeasonalityDataset(
            station_name=station_name,
            target=target,
            filepath=HOURLY_DATA_PATH,
            features=features,
            features_1=features_nonperiodic,
            features_2=features_periodic,
            fdate=train_fdate,
            tdate=train_tdate,
            sample_size=sample_size,
            output_size=output_size,
            train_valid_ratio=0.8)

        train_set.preprocess()

        # set fdate=test_fdate,
        test_set = data.MultivariateMeanSeasonalityDataset(
            station_name=station_name,
            target=target,
            filepath=HOURLY_DATA_PATH,
            features=features,
            features_1=features_nonperiodic,
            features_2=features_periodic,
            fdate=test_fdate,
            tdate=test_tdate,
            sample_size=sample_size,
            output_size=output_size,
            scaler_X=train_set.scaler_X,
            scaler_Y=train_set.scaler_Y)

        test_set.transform()

        df_train = train_set.ys.loc[train_fdate:train_tdate, :].copy()
        df_test = test_set.ys.loc[test_fdate:test_tdate, :].copy()
        df_test_org = test_set.ys_raw.loc[test_fdate:test_tdate, :].copy()

        # for lag in range(23, 24):
        input_lag = 0
        output_dir = Path(f'/mnt/data/XGBoost/{station_name}/{target}/')
        png_dir = output_dir / Path('png/')
        svg_dir = output_dir / Path('svg/')
        data_dir = output_dir / Path('csv/')
        Path.mkdir(data_dir, parents=True, exist_ok=True)
        Path.mkdir(png_dir, parents=True, exist_ok=True)
        Path.mkdir(svg_dir, parents=True, exist_ok=True)
        # prepare dataset
        print("Dataset conversion start..", flush=True)
        X_train, Y_train, train_dates = dataset2svinput(train_set, lag=input_lag)
        X_test, Y_test, test_dates = dataset2svinput(test_set, lag=input_lag)

        print("Dataset conversion complete..", flush=True)

        print("XGBoost " + target + "...", flush=True)
        df_obs = mw_df(df_test_org, target, output_size, input_lag,
                    test_fdate, test_tdate)
        dates = df_obs.index

        # prediction
        df_sim = sim_xgboost(X_train.copy(), Y_train, X_test.copy(), Y_test, dates,
                copy.deepcopy(features), target, sample_size, output_size, test_set.scaler_Y,
                data_dir, png_dir, svg_dir)

        assert df_obs.shape == df_sim.shape

        # join df
        plot_xgboost(df_sim, df_obs, target, \
            data_dir, png_dir, svg_dir, test_fdate, test_tdate, station_name, output_size)
        # save to csv
        csv_fname = "df_test_obs.csv"
        df_obs.to_csv(data_dir / csv_fname)

        csv_fname = "df_test_sim.csv"
        df_sim.to_csv(data_dir / csv_fname)
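sim_xgboost and plot_xgboost are project helpers. A minimal sketch (an assumption, not the project's sim_xgboost) of direct multi-horizon forecasting, where one XGBRegressor is trained per lead time so the predictions line up with the output_size columns of df_sim:

import numpy as np
import xgboost as xgb

def fit_direct_models(X_train, Y_train, output_size=24):
    # one regressor per forecast horizon (direct strategy)
    models = []
    for h in range(output_size):
        reg = xgb.XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1)
        reg.fit(np.asarray(X_train), np.asarray(Y_train)[:, h])
        models.append(reg)
    return models

def predict_direct(models, X_test):
    return np.column_stack([m.predict(np.asarray(X_test)) for m in models])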
Example #6
def stats_analysis(_station_name="종로구"):
    """
    References
    * Ghil, M., et al. "Extreme events: dynamics, statistics and prediction." Nonlinear Processes in Geophysics 18.3 (2011): 295-350.
    """
    print("Start Analysis of input")
    if not Path(HOURLY_DATA_PATH).is_file():
        query_str = 'stationCode in ' + str(SEOUL_CODES)
        print(query_str, flush=True)

        _df_h = load_imputed()
        _df_h.to_csv("/input/python/input_imputed_hourly_pandas.csv")

        df_h = _df_h.query(query_str)
        df_h.to_csv(HOURLY_DATA_PATH)

        # filter by seoul codes
        print("Imputed!", flush=True)

    targets = ["PM10", "PM25"]
    sea_targets = [
        "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v", "pres",
        "humid", "prep", "snow"
    ]
    # sea_targets = ["prep", "snow"]
    # 24*14 = 336
    sample_size = 24 * 2
    output_size = 24

    # Hyperparameters
    epoch_size = 500
    batch_size = 256
    learning_rate = 1e-3

    # train_fdate = dt.datetime(2015, 1, 5, 0).astimezone(SEOULTZ)
    train_fdate = dt.datetime(2008, 1, 3, 0).astimezone(SEOULTZ)
    train_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_fdate = dt.datetime(2019, 1, 1, 0).astimezone(SEOULTZ)
    #test_tdate = dt.datetime(2018, 12, 31, 23).astimezone(SEOULTZ)
    test_tdate = dt.datetime(2020, 10, 31, 23).astimezone(SEOULTZ)

    # check date range assumption
    assert test_tdate > train_fdate
    assert test_fdate > train_tdate

    train_features = [
        "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v", "pres",
        "humid", "prep"
    ]
    train_features_periodic = [
        "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "u", "v", "pres",
        "humid"
    ]
    train_features_nonperiodic = ["prep"]
    # station_names = ['종로구']
    # station_names = SEOUL_STATIONS
    station_names = ["종로구", "강서구", "서초구", "광진구"]

    def plot_sea(station_name='종로구'):
        for target in sea_targets:
            print("Analyze " + target + "...")

            _df_seoul = pd.read_csv(HOURLY_DATA_PATH,
                                    index_col=[0, 1],
                                    parse_dates=[0])
            # filter by station_name
            _df_station = _df_seoul.query('stationCode == "' +
                                          str(SEOUL_STATIONS[station_name]) +
                                          '"')
            _df_station.reset_index(level='stationCode',
                                    drop=True,
                                    inplace=True)
            df_sea_h = _df_station

            output_dir = Path("/mnt/data/STATS_ANALYSIS_" + str(sample_size) +
                              "/" + station_name + "/" + target + "/")
            Path.mkdir(output_dir, parents=True, exist_ok=True)

            data_dir = output_dir / Path('csv/')
            png_dir = output_dir / Path('png/')
            svg_dir = output_dir / Path('svg/')
            Path.mkdir(data_dir, parents=True, exist_ok=True)
            Path.mkdir(png_dir, parents=True, exist_ok=True)
            Path.mkdir(svg_dir, parents=True, exist_ok=True)

            hparams = Namespace(nhead=8,
                                head_dim=128,
                                d_feedforward=256,
                                num_layers=3,
                                learning_rate=learning_rate,
                                batch_size=batch_size)

            # prepare dataset
            train_valid_set = MultivariateRNNMeanSeasonalityDataset(
                station_name=station_name,
                target=target,
                filepath="/input/python/input_seoul_imputed_hourly_pandas.csv",
                features=train_features,
                features_1=[
                    "SO2", "CO", "O3", "NO2", "PM10", "PM25", "temp", "v",
                    "pres", "humid", "prep", "snow"
                ],
                features_2=['u'],
                fdate=train_fdate,
                tdate=train_tdate,
                sample_size=sample_size,
                output_size=output_size,
                train_valid_ratio=0.8)

            # first, create the seasonality output directories
            Path.mkdir(png_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(svg_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(data_dir / "seasonality", parents=True, exist_ok=True)

            # fit & transform (seasonality)
            # without seasonality
            # train_valid_set.preprocess()
            # with seasonality
            train_valid_set.preprocess()
            # save seasonality index-wise
            # train_valid_set.broadcast_seasonality()

            train_valid_set.plot_fused_seasonality(
                data_dir / "seasonality_fused", png_dir / "seasonality_fused",
                svg_dir / "seasonality_fused")

    #for target in targets:
    for station_name in station_names:
        for target in train_features_periodic:
            print("Analyze " + target + "...")

            # if not Path("/input/python/input_jongro_imputed_hourly_pandas.csv").is_file():
            #     # load imputed result
            #     _df_h = load_imputed(HOURLY_DATA_PATH)
            #     df_h = _df_h.query('stationCode == "' +
            #                     str(SEOUL_STATIONS[station_name]) + '"')
            #     df_h.to_csv("/input/python/input_jongro_imputed_hourly_pandas.csv")

            _df_seoul = pd.read_csv(HOURLY_DATA_PATH,
                                    index_col=[0, 1],
                                    parse_dates=[0])
            # filter by station_name
            _df_station = _df_seoul.query('stationCode == "' +
                                          str(SEOUL_STATIONS[station_name]) +
                                          '"')
            _df_station.reset_index(level='stationCode',
                                    drop=True,
                                    inplace=True)
            df_sea_h = _df_station

            output_dir = Path("/mnt/data/STATS_ANALYSIS_" + str(sample_size) +
                              "/" + station_name + "/" + target + "/")
            Path.mkdir(output_dir, parents=True, exist_ok=True)

            data_dir = output_dir / Path('csv/')
            png_dir = output_dir / Path('png/')
            svg_dir = output_dir / Path('svg/')
            Path.mkdir(data_dir, parents=True, exist_ok=True)
            Path.mkdir(png_dir, parents=True, exist_ok=True)
            Path.mkdir(svg_dir, parents=True, exist_ok=True)

            hparams = Namespace(nhead=8,
                                head_dim=128,
                                d_feedforward=256,
                                num_layers=3,
                                learning_rate=learning_rate,
                                batch_size=batch_size)

            # prepare dataset
            train_valid_set = MultivariateRNNMeanSeasonalityDataset(
                station_name=station_name,
                target=target,
                filepath="/input/python/input_seoul_imputed_hourly_pandas.csv",
                features=train_features,
                features_1=train_features_nonperiodic,
                features_2=train_features_periodic,
                fdate=train_fdate,
                tdate=train_tdate,
                sample_size=sample_size,
                output_size=output_size,
                train_valid_ratio=0.8)

            # first, create the seasonality output directories
            Path.mkdir(png_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(svg_dir / "seasonality", parents=True, exist_ok=True)
            Path.mkdir(data_dir / "seasonality", parents=True, exist_ok=True)

            # fit & transform (seasonality)
            # without seasonality
            # train_valid_set.preprocess()
            # with seasonality
            train_valid_set.preprocess()
            # save seasonality index-wise
            train_valid_set.broadcast_seasonality()

            test_set = MultivariateRNNMeanSeasonalityDataset(
                station_name=station_name,
                target=target,
                filepath="/input/python/input_seoul_imputed_hourly_pandas.csv",
                features=train_features,
                features_1=train_features_nonperiodic,
                features_2=train_features_periodic,
                fdate=test_fdate,
                tdate=test_tdate,
                sample_size=sample_size,
                output_size=output_size,
                scaler_X=train_valid_set.scaler_X,
                scaler_Y=train_valid_set.scaler_Y)

            test_set.transform()
            # save seasonality index-wise
            test_set.broadcast_seasonality()

            def run_01_CLT():
                """
                1. Is data sufficient?
                    * Central Limit Theorem =>
                    * Distribution of sample mean & sample std =>
                    * Is it normal or log-normal?
                """
                _data_dir = data_dir / "01-CLT"
                _png_dir = png_dir / "01-CLT"
                _svg_dir = svg_dir / "01-CLT"
                Path.mkdir(_data_dir, parents=True, exist_ok=True)
                Path.mkdir(_png_dir, parents=True, exist_ok=True)
                Path.mkdir(_svg_dir, parents=True, exist_ok=True)

                # per-window means of decomposed and raw samples
                means_d = np.zeros(len(train_valid_set))
                means_r = np.zeros(len(train_valid_set))

                # sampling distribution of the mean
                # (n_resample resamples of size nchoice each)
                n_resample = 100
                sample_means_d = np.zeros(n_resample)
                sample_means_r = np.zeros(n_resample)

                # save sample statistics
                # len(train_valid_set) == 34895
                for i, s in enumerate(train_valid_set):
                    x, x_1d, x_sa, x_sw, x_sh, \
                        y, y_raw, y_sa, y_sw, y_sh, y_date = s

                    if len(y) != output_size:
                        break

                    # it's not random sampling
                    means_d[i] = np.mean(y)
                    means_r[i] = np.mean(y_raw)

                nchoice = 64
                for i in range(n_resample):
                    dr = np.random.choice(means_d, size=nchoice)
                    sample_means_d[i] = np.mean(dr)
                    rr = np.random.choice(means_r, size=nchoice)
                    sample_means_r[i] = np.mean(rr)

                print("Sample & Pop. Mean : ", np.mean(sample_means_d),
                      np.mean(means_d))
                print("Sample & Pop. STD : ",
                      np.std(sample_means_d) / sqrt(nchoice), np.std(means_d))

                print("Sample & Pop. Mean : ", np.mean(sample_means_r),
                      np.mean(means_r))
                print("Sample & Pop. STD : ",
                      np.std(sample_means_r) / sqrt(nchoice), np.std(means_r))

            def run_02_MFDFA():
                print("MF-DFA..")
                _data_dir = data_dir / "02-LRD-MFDFA"
                _png_dir = png_dir / "02-LRD-MFDFA"
                _svg_dir = svg_dir / "02-LRD-MFDFA"
                Path.mkdir(_data_dir, parents=True, exist_ok=True)
                Path.mkdir(_png_dir, parents=True, exist_ok=True)
                Path.mkdir(_svg_dir, parents=True, exist_ok=True)

                # Define unbounded process
                Xs = train_valid_set.ys
                Xs_raw = train_valid_set.ys_raw

                n_lag = 100
                large_s = int(n_lag * 0.3)
                org_lag = np.unique(np.logspace(0.5, 3, n_lag).astype(int))

                # Select a list of powers q
                # if q == 2 -> standard square root based average
                q_list = [-6, -2, -3, 2, 3, 6]

                # The order of the polynomial fitting
                for order in [1, 2, 3]:
                    lag, dfa = MFDFA.MFDFA(Xs[target].to_numpy(),
                                           lag=org_lag,
                                           q=q_list,
                                           order=order)
                    norm_dfa = np.zeros_like(dfa)

                    for i in range(dfa.shape[1]):
                        norm_dfa[:, i] = np.divide(dfa[:, i], np.sqrt(lag))

                    df = pd.DataFrame.from_dict({
                        str(q_list[i]): dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df['s'] = lag

                    df_norm = pd.DataFrame.from_dict({
                        str(q_list[i]): norm_dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df_norm['s'] = lag

                    # plot
                    fig = plt.figure()
                    plt.clf()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df_norm,
                                              id_vars=['s'],
                                              var_name='q'))
                    q0fit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, 0])[large_s:], 1)
                    q0fit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), q0fit.coef)
                    plt.plot(lag,
                             np.power(10, q0fit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[0], q0fit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    qnfit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, -1])[large_s:], 1)
                    qnfit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), qnfit.coef)
                    plt.plot(lag,
                             np.power(10, qnfit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[-1], qnfit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    for i in range(len(q_list)):
                        leg_labels[i] = r'h({{{0}}})'.format(q_list[i])
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)/\sqrt{s}$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df_norm.set_index('s', inplace=True)
                    df_norm.to_csv(_data_dir /
                                   ('MFDFA_norm_res_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("MFDFA_norm_res_o" + str(order) +
                                           '.png')
                    svg_path = _svg_dir / ("MFDFA_norm_res_o" + str(order) +
                                           '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

                    fig = plt.figure()
                    plt.clf()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df, id_vars=['s'], var_name='q'))
                    q0fit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, 0])[large_s:], 1)
                    q0fit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), q0fit.coef)
                    plt.plot(lag,
                             np.power(10, q0fit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[0], q0fit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    qnfit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, -1])[large_s:], 1)
                    qnfit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), qnfit.coef)
                    plt.plot(lag,
                             np.power(10, qnfit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[-1], qnfit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    for i in range(len(q_list)):
                        leg_labels[i] = r'h({{{0}}})'.format(q_list[i])
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df.set_index('s', inplace=True)
                    df.to_csv(_data_dir / ('MFDFA_res_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("MFDFA_res_o" + str(order) + '.png')
                    svg_path = _svg_dir / ("MFDFA_res_o" + str(order) + '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

                    lag, dfa = MFDFA.MFDFA(Xs_raw[target].to_numpy(),
                                           lag=org_lag,
                                           q=q_list,
                                           order=order)
                    norm_dfa = np.zeros_like(dfa)

                    for i in range(dfa.shape[1]):
                        norm_dfa[:, i] = np.divide(dfa[:, i], np.sqrt(lag))

                    df = pd.DataFrame.from_dict({
                        str(q_list[i]): dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df['s'] = lag

                    df_norm = pd.DataFrame.from_dict({
                        str(q_list[i]): norm_dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df_norm['s'] = lag

                    # plot
                    fig = plt.figure()
                    plt.clf()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df_norm,
                                              id_vars=['s'],
                                              var_name='q'))
                    q0fit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, 0])[large_s:], 1)
                    q0fit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), q0fit.coef)
                    plt.plot(lag,
                             np.power(10, q0fit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[0], q0fit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    qnfit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, -1])[large_s:], 1)
                    qnfit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), qnfit.coef)
                    plt.plot(lag,
                             np.power(10, qnfit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[-1], qnfit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    for i in range(len(q_list)):
                        leg_labels[i] = r'h({{{0}}})'.format(q_list[i])
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)/\sqrt{s}$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df_norm.set_index('s', inplace=True)
                    df_norm.to_csv(_data_dir /
                                   ('MFDFA_norm_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("MFDFA_norm_o" + str(order) +
                                           '.png')
                    svg_path = _svg_dir / ("MFDFA_norm_o" + str(order) +
                                           '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

                    fig = plt.figure()
                    plt.clf()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df, id_vars=['s'], var_name='q'))
                    q0fit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df.to_numpy()[:, 0])[large_s:], 1).convert()
                    q0fit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), q0fit.coef)
                    plt.plot(lag,
                             np.power(10, q0fit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[0], q0fit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    qnfit = np.polynomial.Polynomial.fit(
                        np.log10(lag)[large_s:],
                        np.log10(df[str(q_list[-1])].to_numpy())[large_s:],
                        1).convert()
                    qnfit_vals = np.polynomial.polynomial.polyval(
                        np.log10(lag), qnfit.coef)
                    plt.plot(lag,
                             np.power(10, qnfit_vals),
                             label=r'$h({{{0}}}) = {{{1:.2f}}}$'.format(
                                 q_list[-1], qnfit.coef[1]),
                             alpha=0.7,
                             color='k',
                             linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    for i in range(len(q_list)):
                        leg_labels[i] = r'$h({{{0}}})$'.format(q_list[i])
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df.set_index('s', inplace=True)
                    df.to_csv(_data_dir /
                              ('MFDFA_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("MFDFA_o" + str(order) + '.png')
                    svg_path = _svg_dir / ("MFDFA_o" + str(order) + '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

            def run_01_DFA():
                print("DFA..")
                _data_dir = data_dir / "01-LRD-DFA"
                _png_dir = png_dir / "01-LRD-DFA"
                _svg_dir = svg_dir / "01-LRD-DFA"
                Path.mkdir(_data_dir, parents=True, exist_ok=True)
                Path.mkdir(_png_dir, parents=True, exist_ok=True)
                Path.mkdir(_svg_dir, parents=True, exist_ok=True)

                # Processes to analyze: the deseasonalized residuals (ys) and
                # the raw series (ys_raw)
                Xs = train_valid_set.ys
                Xs_raw = train_valid_set.ys_raw

                n_lag = 100
                large_s = int(n_lag * 0.3)
                org_lag = np.unique(np.logspace(0.5, 3, n_lag).astype(int))
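                # Window sizes s: log-spaced from 10^0.5 (~3) to 10^3 samples;
                # np.unique drops duplicate integer lags, so the array can be
                # shorter than n_lag and large_s (start of the large-scale
                # fitting range) is therefore only approximate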

                # List of moments q to evaluate;
                # q = 2 gives the standard root-mean-square (DFA) fluctuation function
                q_list = [2]
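                # model_func fits F(s) to a power law A * s^B, so B estimates
                # the DFA exponent h(2); for long-range dependent data with
                # 0.5 < h(2) < 1 the autocorrelation decays as s^(-gamma) with
                # gamma = 2 * (1 - h(2)), which is what gamma_annot reports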

                def model_func(x, A, B):
                    return A * np.power(x, B)

                # Order of the detrending polynomial fitted in each window
                for order in [1, 2, 3]:
                    # RESIDUALS: DFA on the deseasonalized series
                    lag, dfa = MFDFA.MFDFA(Xs[target].to_numpy(),
                                           lag=org_lag,
                                           q=q_list,
                                           order=order)
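                    # MFDFA returns the realized lags and the fluctuation
                    # function F_q(s) with one column per q; dividing by
                    # sqrt(s) below flattens an uncorrelated (h(2) = 0.5)
                    # signal, so a remaining trend indicates long-range
                    # correlation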
                    norm_dfa = np.zeros_like(dfa)

                    for i in range(dfa.shape[1]):
                        norm_dfa[:, i] = np.divide(dfa[:, i], np.sqrt(lag))

                    df = pd.DataFrame.from_dict({
                        str(q_list[i]): dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df['s'] = lag

                    df_norm = pd.DataFrame.from_dict({
                        str(q_list[i]): norm_dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df_norm['s'] = lag

                    # plot
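                    # Two reference curves: a fixed ~s^0.5 baseline (the
                    # white-noise scaling, h(2) = 0.5) and a power-law
                    # curve_fit over the large-scale range whose exponent is
                    # reported as h(2) in the legend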
                    fig = plt.figure()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df_norm,
                                              id_vars=['s'],
                                              var_name='q'))
                    base_lines = np.ones(len(lag)) * 10.0**(-2) * np.power(
                        lag, 0.5)
                    plt.plot(lag,
                             base_lines,
                             label=r'$h(2) = 0.5$',
                             alpha=0.7,
                             color='tab:green',
                             linestyle='dashed')
                    p0 = (1., 1.e-5)
                    # fit the q = 2 fluctuation function, selected by name so
                    # the appended 's' column is not used as the target
                    popt, pcov = sp.optimize.curve_fit(
                        model_func, lag[large_s:],
                        df_norm[str(q_list[-1])].to_numpy()[large_s:], p0)
                    coef_annot = popt[1]
                    gamma_annot = 2.0 * (1.0 - popt[1])
                    estimated = model_func(lag, popt[0], popt[1])
                    plt.plot(
                        lag,
                        estimated,
                        label=r'$h(2) = {{{0:.2f}}}, \gamma = {{{1:.2f}}}$'.
                        format(coef_annot, gamma_annot),
                        alpha=0.7,
                        color='tab:orange',
                        linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    leg_labels[0] = r'$h(2)$'
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)/\sqrt{s}$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df_norm.set_index('s', inplace=True)
                    df_norm.to_csv(_data_dir /
                                   ('DFA_norm_res_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("DFA_norm_res_o" + str(order) +
                                           '.png')
                    svg_path = _svg_dir / ("DFA_norm_res_o" + str(order) +
                                           '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

                    fig = plt.figure()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df, id_vars=['s'], var_name='q'))
                    base_lines = np.ones(len(lag)) * \
                        10.0**(-2) * np.power(lag, 0.5)
                    plt.plot(lag,
                             base_lines,
                             label=r'$h(2) = 0.5$',
                             alpha=0.7,
                             color='tab:green',
                             linestyle='dashed')
                    p0 = (1., 1.e-5)
                    popt, pcov = sp.optimize.curve_fit(
                        model_func, lag[large_s:],
                        df[str(q_list[-1])].to_numpy()[large_s:], p0)
                    coef_annot = popt[1]
                    gamma_annot = 2.0 * (1.0 - popt[1])
                    estimated = model_func(lag, popt[0], popt[1])
                    plt.plot(
                        lag,
                        estimated,
                        label=r'$h(2) = {{{0:.2f}}}, \gamma = {{{1:.2f}}}$'.
                        format(coef_annot, gamma_annot),
                        alpha=0.7,
                        color='tab:orange',
                        linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    leg_labels[0] = r'$h(2)$'
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df.set_index('s', inplace=True)
                    df.to_csv(_data_dir /
                              ('DFA_res_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("DFA_res_o" + str(order) + '.png')
                    svg_path = _svg_dir / ("DFA_res_o" + str(order) + '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

                    # RAW: repeat the same analysis on the raw (non-deseasonalized) series
                    lag, dfa = MFDFA.MFDFA(Xs_raw[target].to_numpy(),
                                           lag=org_lag,
                                           q=q_list,
                                           order=order)
                    norm_dfa = np.zeros_like(dfa)

                    for i in range(dfa.shape[1]):
                        norm_dfa[:, i] = np.divide(dfa[:, i], np.sqrt(lag))

                    df = pd.DataFrame.from_dict({
                        str(q_list[i]): dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df['s'] = lag

                    df_norm = pd.DataFrame.from_dict({
                        str(q_list[i]): norm_dfa[:, i]
                        for i in range(dfa.shape[1])
                    })
                    df_norm['s'] = lag

                    # plot
                    fig = plt.figure()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df_norm,
                                              id_vars=['s'],
                                              var_name='q'))
                    base_lines = np.ones(len(lag)) * \
                        10.0**(-2) * np.power(lag, 0.5)
                    plt.plot(lag,
                             base_lines,
                             label=r'$h(2) = 0.5$',
                             alpha=0.7,
                             color='tab:green',
                             linestyle='dashed')
                    p0 = (1., 1.e-5)
                    popt, pcov = sp.optimize.curve_fit(
                        model_func, lag[large_s:],
                        df_norm[str(q_list[-1])].to_numpy()[large_s:], p0)
                    coef_annot = popt[1]
                    gamma_annot = 2.0 * (1.0 - popt[1])
                    estimated = model_func(lag, popt[0], popt[1])
                    plt.plot(
                        lag,
                        estimated,
                        label=r'$h(2) = {{{0:.2f}}}, \gamma = {{{1:.2f}}}$'.
                        format(coef_annot, gamma_annot),
                        alpha=0.7,
                        color='tab:orange',
                        linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    leg_labels[0] = r'$h(2)$'
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)/\sqrt{s}$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df_norm.set_index('s', inplace=True)
                    df_norm.to_csv(_data_dir /
                                   ('DFA_norm_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("DFA_norm_o" + str(order) + '.png')
                    svg_path = _svg_dir / ("DFA_norm_o" + str(order) + '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

                    fig = plt.figure()
                    sns.color_palette("tab10")
                    sns.lineplot(x='s',
                                 y='value',
                                 hue='q',
                                 data=pd.melt(df, id_vars=['s'], var_name='q'))
                    base_lines = np.ones(len(lag)) * \
                        10.0**(-2) * np.power(lag, 0.5)
                    plt.plot(lag,
                             base_lines,
                             label=r'$h(2) = 0.5$',
                             alpha=0.7,
                             color='tab:green',
                             linestyle='dashed')
                    p0 = (1., 1.e-5)
                    popt, pcov = sp.optimize.curve_fit(
                        model_func, lag[large_s:],
                        df[str(q_list[-1])].to_numpy()[large_s:], p0)
                    coef_annot = popt[1]
                    gamma_annot = 2.0 * (1.0 - popt[1])
                    estimated = model_func(lag, popt[0], popt[1])
                    plt.plot(
                        lag,
                        estimated,
                        label=r'$h(2) = {{{0:.2f}}}, \gamma = {{{1:.2f}}}$'.
                        format(coef_annot, gamma_annot),
                        alpha=0.7,
                        color='tab:orange',
                        linestyle='dashed')
                    ax = plt.gca()
                    leg_handles, leg_labels = ax.get_legend_handles_labels()
                    leg_labels[0] = r'$h(2)$'
                    ax.legend(leg_handles, leg_labels)
                    ax.set_xlabel(r'$s$')
                    ax.set_ylabel(r'$F^{(n)}(s)$')
                    ax.set_xscale('log')
                    ax.set_yscale('log')

                    df.set_index('s', inplace=True)
                    df.to_csv(_data_dir / ('DFA_o' + str(order) + '.csv'))
                    png_path = _png_dir / ("DFA_o" + str(order) + '.png')
                    svg_path = _svg_dir / ("DFA_o" + str(order) + '.svg')
                    plt.savefig(png_path, dpi=600)
                    plt.savefig(svg_path)
                    plt.close()

            run_01_DFA()
            run_02_MFDFA()