Code Example #1
File: _m4.py  Project: zhupeiru/gluon-ts
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# NOTE: metadata, save_to_file and to_dict are shared helpers defined
# elsewhere in the gluonts dataset repository package and are assumed to be
# importable here.


def generate_m4_dataset(dataset_path: Path, m4_freq: str, pandas_freq: str,
                        prediction_length: int):
    m4_dataset_url = (
        "https://github.com/M4Competition/M4-methods/raw/master/Dataset")
    train_df = pd.read_csv(f"{m4_dataset_url}/Train/{m4_freq}-train.csv",
                           index_col=0)
    test_df = pd.read_csv(f"{m4_dataset_url}/Test/{m4_freq}-test.csv",
                          index_col=0)

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(train_df),
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    train_target_values = [ts[~np.isnan(ts)] for ts in train_df.values]

    test_target_values = [
        np.hstack([train_ts, test_ts])
        for train_ts, test_ts in zip(train_target_values, test_df.values)
    ]

    if m4_freq == "Yearly":
        # Some series span more than 300 years, which cannot be represented by
        # pandas timestamps; this is probably due to those series being
        # misclassified as Yearly. We therefore keep only the last 300
        # observations for training. The test set is unaffected, since the
        # prediction length is far below 300 years.
        train_target_values = [ts[-300:] for ts in train_target_values]
        test_target_values = [ts[-300:] for ts in test_target_values]

    # The original dataset does not include timestamps, so we use the same
    # mock start date for every series (an early date that pandas can still
    # represent).
    mock_start_dataset = "1750-01-01 00:00:00"

    save_to_file(
        train_file,
        [
            to_dict(target_values=target, start=mock_start_dataset, cat=[cat])
            for cat, target in enumerate(train_target_values)
        ],
    )

    save_to_file(
        test_file,
        [
            to_dict(target_values=target, start=mock_start_dataset, cat=[cat])
            for cat, target in enumerate(test_target_values)
        ],
    )
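
A minimal usage sketch for the function above. The path and the ("Hourly", "H", 48) triple are illustrative values matching the M4 hourly subset; they are not taken from the original file.

from pathlib import Path

# Hypothetical call: downloads the M4 Hourly CSVs and writes metadata.json,
# train/data.json and test/data.json under dataset_path.
generate_m4_dataset(
    dataset_path=Path("datasets/m4_hourly"),
    m4_freq="Hourly",
    pandas_freq="H",
    prediction_length=48,
)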
Code Example #2
def save_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    dataset = list(FileDataset(dataset_path, freq=ds_info.freq))
    shutil.rmtree(dataset_path)
    train_file = dataset_path / "data.json"
    save_to_file(
        train_file,
        [
            to_dict(
                target_values=data_entry[FieldName.TARGET],
                start=data_entry[FieldName.START],
                # Wrap the categorical index so that each rolling-evaluation
                # copy of a series keeps its original category; the expression
                # is equivalent to cat % ds_info.num_series.
                cat=[cat - ds_info.num_series * (cat // ds_info.num_series)],
            )
            for cat, data_entry in enumerate(dataset)
        ],
    )
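
The only subtle line above is the categorical index arithmetic. The standalone check below (not part of the original module, num_series chosen arbitrarily) shows that the expression reduces to a simple modulo, so every rolling copy of a series gets the same category.

num_series = 3  # assumed value for illustration
for cat in range(2 * num_series):
    wrapped = cat - num_series * (cat // num_series)
    assert wrapped == cat % num_series
    print(cat, "->", wrapped)  # prints 0, 1, 2, 0, 1, 2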
Code Example #3
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=ds_info.num_series,
                    freq=ds_info.freq,
                    prediction_length=ds_info.prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), f"expected num_time_steps/num_series {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"

    timeseries = load_from_pandas(df=df,
                                  time_index=time_index,
                                  agg_freq=ds_info.agg_freq)

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                ))

    assert len(train_ts) == ds_info.num_series

    save_to_file(train_file, train_ts)

    # start date of each rolling-evaluation prediction window
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        # end of the slice used for this rolling evaluation window
        prediction_end_date = frequency_add(prediction_start_date,
                                            ds_info.prediction_length)
        for cat, ts in enumerate(timeseries):
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                ))

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(test_file, test_ts)
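
A hedged usage sketch for this generator. "electricity" is only an example key, assuming datasets_info contains an LSTNet-style entry under that name; the output path is likewise illustrative.

from pathlib import Path

# Hypothetical call: reads the raw CSV referenced by datasets_info["electricity"],
# splits at the 80% mark and writes train/test JSON files plus metadata.json.
generate_lstnet_dataset(
    dataset_path=Path("datasets/electricity"),
    dataset_name="electricity",
)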
Code Example #4
def generate_lstnet_dataset(
    dataset_path: Path,
    dataset_name: str,
    prediction_length: Optional[int] = None,
):
    ds_info = datasets_info[dataset_name]

    ds_metadata = metadata(
        cardinality=ds_info.num_series,
        freq=ds_info.freq if ds_info.agg_freq is None else ds_info.agg_freq,
        prediction_length=prediction_length or ds_info.prediction_length,
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(ds_metadata, f)

    time_index = pd.period_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = cast(
        pd.DataFrame,
        pd.read_csv(ds_info.url, header=None),  # type: ignore
    )

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), ("expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}")

    timeseries = load_from_pandas(df=df,
                                  time_index=time_index,
                                  agg_freq=ds_info.agg_freq)

    # the last date seen during training
    ts_index = cast(pd.PeriodIndex, timeseries[0].index)
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                ))

    assert len(train_ts) == ds_info.num_series

    save_to_file(dataset_path / "train" / "data.json", train_ts)

    # start date of each rolling-evaluation prediction window
    prediction_dates = [
        training_end + i * ds_info.prediction_length
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        # end of the slice used for this rolling evaluation window
        prediction_end_date = (prediction_start_date +
                               ds_info.prediction_length)
        for cat, ts in enumerate(timeseries):
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                ))

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(dataset_path / "test" / "data.json", test_ts)
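
This variant builds the time axis with pd.period_range, so the rolling forecast start dates are obtained by plain integer addition on a Period. A small standalone illustration (the hourly frequency and dates are assumed, not from the original file):

import pandas as pd

# Adding an integer to a Period advances it by that many periods, which is
# what training_end + i * prediction_length relies on above.
training_end = pd.Period("2014-12-31 23:00", freq="H")
prediction_length = 24
print(training_end + prediction_length)      # 2015-01-01 23:00
print(training_end + 2 * prediction_length)  # 2015-01-02 23:00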
Code Example #5
def generate_m3_dataset(dataset_path: Path, m3_freq: str):
    from gluonts.dataset.repository.datasets import default_dataset_path

    m3_xls_path = default_dataset_path / "M3C.xls"
    if not os.path.exists(m3_xls_path):
        raise RuntimeError(
            f"The m3 data is available at https://forecasters.org/resources/time-series-data/m3-competition/ "
            f"Please download the file and copy the files to this location: {m3_xls_path}"
        )

    class M3Setting(NamedTuple):
        sheet_name: str
        prediction_length: int
        freq: str

    subsets = {
        "yearly": M3Setting("M3Year", 6, "12M"),
        "quarterly": M3Setting("M3Quart", 8, "3M"),
        "monthly": M3Setting("M3Month", 18, "1M"),
        "other": M3Setting("M3Other", 8, "3M"),
    }
    assert (m3_freq.lower() in subsets
            ), f"invalid m3_freq='{m3_freq}'. Allowed values: {subsets.keys()}"

    if m3_freq.lower() == "other":
        warnings.warn(
            "Be aware: The M3-other dataset does not have a known frequency. Since gluonts needs a known frequency, "
            "we will generate the dataset with an artificial `quarterly` frequency."
        )

    subset = subsets[m3_freq.lower()]
    df = pd.read_excel(m3_xls_path, sheet_name=subset.sheet_name)

    def truncate_trailing_nan(v: np.ndarray):
        last_finite_index = np.where(np.isfinite(v))[0][-1]
        return v[:last_finite_index + 1]

    train_data = []
    test_data = []

    def normalize_category(c: str):
        return c.strip()

    df["Category"] = df["Category"].apply(normalize_category)
    categories = list(df["Category"].unique())

    cat_map = {c: i for i, c in enumerate(categories)}

    for i, (_, row) in enumerate(df.iterrows()):
        vals = row.values
        series, n, nf, category, starting_year, starting_offset = vals[:6]
        target = np.asarray(vals[6:], dtype=np.float64)
        target = truncate_trailing_nan(target)
        assert len(target) == n
        assert nf == subset.prediction_length
        mock_start = "1750-01-01 00:00:00"
        if starting_year == 0:
            assert starting_offset == 0
            starting_year = mock_start
        # NOTE: the freq argument of pd.Timestamp (and the s.freq attribute
        # used below) is only available in older pandas releases.
        s = pd.Timestamp(str(starting_year), freq=subset.freq)
        offset = max(starting_offset - 1, 0)
        if offset:
            s += offset * s.freq
        start = str(s).split(" ")[0]

        cat = [i, cat_map[category]]

        d_train = to_dict(
            target_values=target[:-subset.prediction_length],
            start=start,
            cat=cat,
            item_id=series,
        )
        train_data.append(d_train)

        d_test = to_dict(target_values=target,
                         start=start,
                         cat=cat,
                         item_id=series)
        test_data.append(d_test)

    os.makedirs(dataset_path, exist_ok=True)
    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=[len(train_data),
                                 len(categories)],
                    freq=subset.freq,
                    prediction_length=subset.prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    save_to_file(train_file, train_data)
    save_to_file(test_file, test_data)

    check_dataset(dataset_path, len(df))
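
A minimal usage sketch, assuming the M3C.xls workbook has already been placed at default_dataset_path as the RuntimeError above instructs; the output path is illustrative.

from pathlib import Path

# Hypothetical call: builds the monthly M3 subset (18-step horizon, "1M" freq)
# and writes metadata.json plus train/test JSON files under dataset_path.
generate_m3_dataset(
    dataset_path=Path("datasets/m3_monthly"),
    m3_freq="monthly",
)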