Example 1
def generate_m4_dataset(dataset_path: Path, m4_freq: str, pandas_freq: str,
                        prediction_length: int):
    m4_dataset_url = (
        "https://github.com/M4Competition/M4-methods/raw/master/Dataset")
    train_df = pd.read_csv(f"{m4_dataset_url}/Train/{m4_freq}-train.csv",
                           index_col=0)
    test_df = pd.read_csv(f"{m4_dataset_url}/Test/{m4_freq}-test.csv",
                          index_col=0)

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=len(train_df),
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    train_target_values = [ts[~np.isnan(ts)] for ts in train_df.values]

    test_target_values = [
        np.hstack([train_ts, test_ts])
        for train_ts, test_ts in zip(train_target_values, test_df.values)
    ]

    if m4_freq == "Yearly":
        # Some time series span more than 300 years, which cannot be
        # represented in pandas; this is probably due to those series being
        # misclassified as yearly. We therefore keep only the last 300 values
        # for training. This does not affect testing, since the prediction
        # length is far below 300 years.
        train_target_values = [ts[-300:] for ts in train_target_values]
        test_target_values = [ts[-300:] for ts in test_target_values]

    # The original dataset does not include timestamps, so we use the same
    # mock start date for every time series: an early date that pandas can
    # still represent.
    mock_start_dataset = "1750-01-01 00:00:00"

    save_to_file(
        train_file,
        [
            to_dict(target_values=target, start=mock_start_dataset, cat=[cat])
            for cat, target in enumerate(train_target_values)
        ],
    )

    save_to_file(
        test_file,
        [
            to_dict(target_values=target, start=mock_start_dataset, cat=[cat])
            for cat, target in enumerate(test_target_values)
        ],
    )
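
All of the examples on this page lean on three small helpers: metadata, to_dict, and save_to_file. The sketch below is a minimal approximation of what they do, assuming a JSON Lines layout and the field names target, start, feat_static_cat, and item_id; the exact signatures and field names of GluonTS' internal repository utilities may differ.

import json
import os
from pathlib import Path


def to_dict(target_values, start, cat=None, item_id=None):
    # One JSON-serializable record per time series (field names are assumed).
    record = {
        "target": [float(x) for x in target_values],
        "start": str(start),
    }
    if cat is not None:
        record["feat_static_cat"] = [int(c) for c in cat]
    if item_id is not None:
        record["item_id"] = item_id
    return record


def save_to_file(path: Path, records) -> None:
    # JSON Lines: one record per line, creating parent directories as needed.
    os.makedirs(path.parent, exist_ok=True)
    with open(path, "w") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")


def metadata(cardinality, freq, prediction_length):
    # Assumed shape of the metadata.json payload; cardinality may be an int
    # or a list of ints.
    if not isinstance(cardinality, list):
        cardinality = [cardinality]
    return {
        "freq": freq,
        "prediction_length": prediction_length,
        "feat_static_cat": [
            {"name": f"feat_static_cat_{i}", "cardinality": str(c)}
            for i, c in enumerate(cardinality)
        ],
    }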
Example 2
def save_dataset(dataset_path: Path, ds_info: GPCopulaDataset):
    dataset = list(FileDataset(dataset_path, freq=ds_info.freq))
    shutil.rmtree(dataset_path)
    train_file = dataset_path / "data.json"
    save_to_file(
        train_file,
        [
            to_dict(
                target_values=data_entry[FieldName.TARGET],
                start=data_entry[FieldName.START],
                # Map the running index back to the original series id:
                # the dataset holds one copy of each series per rolling
                # evaluation date.
                cat=[cat - ds_info.num_series * (cat // ds_info.num_series)],
            )
            for cat, data_entry in enumerate(dataset)
        ],
    )
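
The arithmetic in the cat field is just a modulo: the rolled dataset holds ds_info.num_series series repeated once per rolling evaluation date, so an equivalent and arguably clearer spelling of that line would be:

cat=[cat % ds_info.num_series],  # == cat - num_series * (cat // num_series)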
Example 3
def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    ds_info = datasets_info[dataset_name]

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=ds_info.num_series,
                    freq=ds_info.freq,
                    prediction_length=ds_info.prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    time_index = pd.date_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), f"expected num_time_steps/num_series {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"

    timeseries = load_from_pandas(df=df,
                                  time_index=time_index,
                                  agg_freq=ds_info.agg_freq)

    # the last date seen during training
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                ))

    assert len(train_ts) == ds_info.num_series

    save_to_file(train_file, train_ts)

    # start of the prediction range for each rolling evaluation
    prediction_dates = [
        frequency_add(training_end, i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = frequency_add(prediction_start_date,
                                                ds_info.prediction_length)
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                ))

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(test_file, test_ts)
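
Examples 3 and 4 also assume the helpers load_from_pandas and frequency_add. A rough sketch of their behavior is below; it assumes each CSV column is one series, that an optional agg_freq triggers a resample-and-sum, and that frequency_add steps a frequency-aware timestamp forward via the older Timestamp.freq attribute (which Example 3 itself relies on). It is an approximation, not the library's actual implementation.

import pandas as pd


def frequency_add(ts: pd.Timestamp, amount: int) -> pd.Timestamp:
    # Step a frequency-aware timestamp forward by `amount` periods.
    return ts + amount * ts.freq


def load_from_pandas(df, time_index, agg_freq=None):
    # One pd.Series per CSV column, optionally re-aggregated, with the
    # all-NaN head and tail trimmed off.
    timeseries = []
    for col in df.columns:
        ts = pd.Series(df[col].values, index=time_index)
        if agg_freq is not None:
            ts = ts.resample(agg_freq).sum()
        ts = ts[ts.first_valid_index():ts.last_valid_index()]
        timeseries.append(ts)
    return timeseries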
Example 4
def generate_lstnet_dataset(
    dataset_path: Path,
    dataset_name: str,
    prediction_length: Optional[int] = None,
):
    ds_info = datasets_info[dataset_name]

    ds_metadata = metadata(
        cardinality=ds_info.num_series,
        freq=ds_info.freq if ds_info.agg_freq is None else ds_info.agg_freq,
        prediction_length=prediction_length or ds_info.prediction_length,
    )

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(ds_metadata, f)

    time_index = pd.period_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = cast(
        pd.DataFrame,
        pd.read_csv(ds_info.url, header=None),  # type: ignore
    )

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), ("expected num_time_steps/num_series"
        f" {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}")

    timeseries = load_from_pandas(df=df,
                                  time_index=time_index,
                                  agg_freq=ds_info.agg_freq)

    # the last date seen during training
    ts_index = cast(pd.PeriodIndex, timeseries[0].index)
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                ))

    assert len(train_ts) == ds_info.num_series

    save_to_file(dataset_path / "train" / "data.json", train_ts)

    # start of the prediction range for each rolling evaluation
    prediction_dates = [
        training_end + i * ds_info.prediction_length
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        for cat, ts in enumerate(timeseries):
            prediction_end_date = (prediction_start_date +
                                   ds_info.prediction_length)
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                ))

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(dataset_path / "test" / "data.json", test_ts)
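
Compared with Example 3, this version builds a pd.PeriodIndex rather than a DatetimeIndex, so stepping to the next rolling evaluation window is plain integer addition on Periods and no frequency_add helper is needed. A hypothetical invocation (the dataset key, output path, and horizon are assumptions):

from pathlib import Path

# Assumes datasets_info contains an "electricity" entry with the fields
# referenced above (url, freq, start_date, num_series, ...).
generate_lstnet_dataset(Path("datasets/electricity"), "electricity",
                        prediction_length=24)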
Example 5
def generate_pts_m5_dataset(
    dataset_path: Path,
    pandas_freq: str,
    prediction_length: int = 28,
    alpha: float = 0.5,
):
    cal_path = f"{dataset_path}/calendar.csv"
    sales_path = f"{dataset_path}/sales_train_validation.csv"
    sales_test_path = f"{dataset_path}/sales_train_evaluation.csv"
    sell_prices_path = f"{dataset_path}/sell_prices.csv"

    required_files = [cal_path, sales_path, sales_test_path, sell_prices_path]
    if not all(os.path.exists(p) for p in required_files):
        raise RuntimeError(
            "M5 data is available on Kaggle"
            " (https://www.kaggle.com/c/m5-forecasting-accuracy/data)."
            " You first need to agree to the terms of the competition"
            " before being able to download the data. After you have done"
            f" that, please copy the files into {dataset_path}.")

    # Read M5 data from dataset_path
    calendar = pd.read_csv(cal_path, parse_dates=True)
    calendar.sort_index(inplace=True)
    calendar.date = pd.to_datetime(calendar.date)

    sales_train_validation = pd.read_csv(
        sales_path,
        index_col=[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ],
    )
    sales_train_validation.sort_index(inplace=True)

    sales_train_evaluation = pd.read_csv(
        sales_test_path,
        index_col=[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ],
    )
    sales_train_evaluation.sort_index(inplace=True)

    sell_prices = pd.read_csv(sell_prices_path,
                              index_col=["item_id", "store_id"])
    sell_prices.sort_index(inplace=True)

    @lru_cache(maxsize=None)
    def get_sell_price(item_id, store_id):
        return calendar.merge(sell_prices.loc[item_id, store_id],
                              on=["wm_yr_wk"],
                              how="left").sell_price

    # Build dynamic features
    kernel = squared_exponential_kernel(alpha=alpha)
    event_1 = CustomDateFeatureSet(
        calendar[calendar.event_name_1.notna()].date, kernel)
    event_2 = CustomDateFeatureSet(
        calendar[calendar.event_name_2.notna()].date, kernel)

    snap_CA = CustomDateFeatureSet(calendar[calendar.snap_CA == 1].date,
                                   kernel)
    snap_TX = CustomDateFeatureSet(calendar[calendar.snap_TX == 1].date,
                                   kernel)
    snap_WI = CustomDateFeatureSet(calendar[calendar.snap_WI == 1].date,
                                   kernel)

    time_index = pd.to_datetime(calendar.date)
    event_1_feature = event_1(time_index)
    event_2_feature = event_2(time_index)

    snap_CA_feature = snap_CA(time_index)
    snap_TX_feature = snap_TX(time_index)
    snap_WI_feature = snap_WI(time_index)

    # Build static features
    sales_train_validation["state"] = pd.CategoricalIndex(
        sales_train_validation.index.get_level_values(5)).codes
    sales_train_validation["store"] = pd.CategoricalIndex(
        sales_train_validation.index.get_level_values(4)).codes
    sales_train_validation["cat"] = pd.CategoricalIndex(
        sales_train_validation.index.get_level_values(3)).codes
    sales_train_validation["dept"] = pd.CategoricalIndex(
        sales_train_validation.index.get_level_values(2)).codes
    sales_train_validation["item"] = pd.CategoricalIndex(
        sales_train_validation.index.get_level_values(1)).codes

    sales_train_evaluation["state"] = pd.CategoricalIndex(
        sales_train_evaluation.index.get_level_values(5)).codes
    sales_train_evaluation["store"] = pd.CategoricalIndex(
        sales_train_evaluation.index.get_level_values(4)).codes
    sales_train_evaluation["cat"] = pd.CategoricalIndex(
        sales_train_evaluation.index.get_level_values(3)).codes
    sales_train_evaluation["dept"] = pd.CategoricalIndex(
        sales_train_evaluation.index.get_level_values(2)).codes
    sales_train_evaluation["item"] = pd.CategoricalIndex(
        sales_train_evaluation.index.get_level_values(1)).codes

    feat_static_cat = [
        {
            "name": "state_id",
            "cardinality": len(sales_train_validation["state"].unique()),
        },
        {
            "name": "store_id",
            "cardinality": len(sales_train_validation["store"].unique()),
        },
        {
            "name": "cat_id",
            "cardinality": len(sales_train_validation["cat"].unique())
        },
        {
            "name": "dept_id",
            "cardinality": len(sales_train_validation["dept"].unique()),
        },
        {
            "name": "item_id",
            "cardinality": len(sales_train_validation["item"].unique()),
        },
    ]

    feat_dynamic_real = [
        {"name": "sell_price", "cardinality": 1},
        {"name": "event_1", "cardinality": 1},
        {"name": "event_2", "cardinality": 1},
        {"name": "snap", "cardinality": 1},
    ]

    # Build training set
    train_file = dataset_path / "train" / "data.json"
    train_ds = []
    for index, item in sales_train_validation.iterrows():
        id, item_id, dept_id, cat_id, store_id, state_id = index
        start_index = np.nonzero(item.iloc[:1913].values)[0][0]
        start_date = time_index[start_index]
        time_series = {}

        state_enc, store_enc, cat_enc, dept_enc, item_enc = item.iloc[1913:]

        time_series["start"] = str(start_date)
        time_series["item_id"] = id[:-11]

        time_series["feat_static_cat"] = [
            state_enc,
            store_enc,
            cat_enc,
            dept_enc,
            item_enc,
        ]

        sell_price = get_sell_price(item_id, store_id)
        snap_feature = {
            "CA": snap_CA_feature,
            "TX": snap_TX_feature,
            "WI": snap_WI_feature,
        }[state_id]

        time_series["target"] = (item.iloc[start_index:1913].values.astype(
            np.float32).tolist())
        time_series["feat_dynamic_real"] = (np.concatenate(
            (
                np.expand_dims(sell_price.iloc[start_index:1913].values, 0),
                event_1_feature[:, start_index:1913],
                event_2_feature[:, start_index:1913],
                snap_feature[:, start_index:1913],
            ),
            0,
        ).astype(np.float32).tolist())

        train_ds.append(time_series.copy())

    save_to_file(train_file, train_ds)

    # Create metadata file
    meta_file = dataset_path / "metadata.json"
    with open(meta_file, "w") as f:
        f.write(
            json.dumps({
                "freq": pandas_freq,
                "prediction_length": prediction_length,
                "feat_static_cat": feat_static_cat,
                "feat_dynamic_real": feat_dynamic_real,
                "cardinality": len(train_ds),
            }))

    # Build testing set
    test_file = dataset_path / "test" / "data.json"
    test_ds = []
    for index, item in sales_train_evaluation.iterrows():
        id, item_id, dept_id, cat_id, store_id, state_id = index
        start_index = np.nonzero(item.iloc[:1941].values)[0][0]
        start_date = time_index[start_index]
        time_series = {}

        state_enc, store_enc, cat_enc, dept_enc, item_enc = item.iloc[1941:]

        time_series["start"] = str(start_date)
        time_series["item_id"] = id[:-11]

        time_series["feat_static_cat"] = [
            state_enc,
            store_enc,
            cat_enc,
            dept_enc,
            item_enc,
        ]

        sell_price = get_sell_price(item_id, store_id)
        snap_feature = {
            "CA": snap_CA_feature,
            "TX": snap_TX_feature,
            "WI": snap_WI_feature,
        }[state_id]

        time_series["target"] = (item.iloc[start_index:1941].values.astype(
            np.float32).tolist())
        time_series["feat_dynamic_real"] = (np.concatenate(
            (
                np.expand_dims(sell_price.iloc[start_index:1941].values, 0),
                event_1_feature[:, start_index:1941],
                event_2_feature[:, start_index:1941],
                snap_feature[:, start_index:1941],
            ),
            0,
        ).astype(np.float32).tolist())

        test_ds.append(time_series.copy())

    save_to_file(test_file, test_ds)
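
The event and SNAP features above are smoothed indicators: instead of a hard 0/1 flag per date, each special date contributes a squared-exponential bump. The sketch below illustrates the idea, assuming distances are measured in days; the real squared_exponential_kernel and CustomDateFeatureSet may differ in signature and output shape.

import numpy as np


def squared_exponential_kernel(alpha: float):
    # Weight decays with the squared distance from a special date.
    return lambda distance: np.exp(-alpha * distance ** 2)


def smoothed_indicator(special_dates, time_index, kernel):
    # One feature row over time_index, with a bump of mass around every
    # special date (an approximation of what CustomDateFeatureSet produces).
    distances = np.array(
        [[(t - d).days for t in time_index] for d in special_dates])
    return kernel(distances).sum(axis=0, keepdims=True)


# e.g., analogous to event_1_feature above:
# event_1_feature = smoothed_indicator(
#     calendar[calendar.event_name_1.notna()].date, time_index,
#     squared_exponential_kernel(alpha=0.5))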
Example 6
def generate_m5_dataset(
    dataset_path: Path,
    pandas_freq: str,
    prediction_length: int,
    m5_file_path: Path,
):
    cal_path = f"{m5_file_path}/calendar.csv"
    sales_path = f"{m5_file_path}/sales_train_validation.csv"

    if not os.path.exists(cal_path) or not os.path.exists(sales_path):
        raise RuntimeError(
            "M5 data is available on Kaggle"
            " (https://www.kaggle.com/c/m5-forecasting-accuracy/data). You"
            " first need to agree to the terms of the competition before"
            " being able to download the data. After you have done that,"
            f" please supply the files at {m5_file_path}.")

    # Prepare directory
    dataset_path.mkdir(exist_ok=True)

    # Read M5 data from dataset_path
    calendar = pd.read_csv(cal_path)
    sales_train_validation = pd.read_csv(sales_path)
    submission_prediction_length = prediction_length * 2

    # Build dynamic features
    cal_features = calendar.drop(
        [
            "date",
            "wm_yr_wk",
            "weekday",
            "wday",
            "month",
            "year",
            "event_name_1",
            "event_name_2",
            "d",
        ],
        axis=1,
    )
    cal_features["event_type_1"] = cal_features["event_type_1"].apply(
        lambda x: 0 if str(x) == "nan" else 1)
    cal_features["event_type_2"] = cal_features["event_type_2"].apply(
        lambda x: 0 if str(x) == "nan" else 1)
    test_cal_features = cal_features.values.T
    train_cal_features = test_cal_features[
        :, :-submission_prediction_length - prediction_length]
    test_cal_features = test_cal_features[:, :-submission_prediction_length]

    test_cal_features_list = [test_cal_features] * len(sales_train_validation)
    train_cal_features_list = (
        [train_cal_features] * len(sales_train_validation))

    # Build static features
    state_ids = (
        sales_train_validation["state_id"].astype("category").cat.codes.values)
    state_ids_un = np.unique(state_ids)
    store_ids = (
        sales_train_validation["store_id"].astype("category").cat.codes.values)
    store_ids_un = np.unique(store_ids)
    cat_ids = (
        sales_train_validation["cat_id"].astype("category").cat.codes.values)
    cat_ids_un = np.unique(cat_ids)
    dept_ids = (
        sales_train_validation["dept_id"].astype("category").cat.codes.values)
    dept_ids_un = np.unique(dept_ids)
    item_ids = (
        sales_train_validation["item_id"].astype("category").cat.codes.values)
    item_ids_un = np.unique(item_ids)
    stat_cat_list = [item_ids, dept_ids, cat_ids, store_ids, state_ids]
    stat_cat = np.concatenate(stat_cat_list)
    stat_cat = stat_cat.reshape(len(stat_cat_list), len(item_ids)).T
    cardinalities = [
        len(item_ids_un),
        len(dept_ids_un),
        len(cat_ids_un),
        len(store_ids_un),
        len(state_ids_un),
    ]

    # Build target series
    train_ids = sales_train_validation["id"]
    train_df = sales_train_validation.drop(
        ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"], axis=1)
    test_target_values = train_df.values.copy()
    train_target_values = [ts[:-prediction_length] for ts in train_df.values]
    dates = ["2011-01-29 00:00:00" for _ in range(len(sales_train_validation))]

    # Create metadata file
    meta_file = dataset_path / "metadata.json"
    with open(meta_file, "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=cardinalities,
                    freq=pandas_freq,
                    prediction_length=prediction_length,
                )))

    # Build training set
    train_file = dataset_path / "train" / "data.json"
    train_ds = [{
        FieldName.TARGET: target.tolist(),
        FieldName.START: start,
        FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        FieldName.FEAT_STATIC_CAT: fsc.tolist(),
        FieldName.ITEM_ID: id,
    } for (target, start, fdr, fsc, id) in zip(
        train_target_values,
        dates,
        train_cal_features_list,
        stat_cat,
        train_ids,
    )]
    save_to_file(train_file, train_ds)

    # Build testing set
    test_file = dataset_path / "test" / "data.json"
    test_ds = [{
        FieldName.TARGET: target.tolist(),
        FieldName.START: start,
        FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        FieldName.FEAT_STATIC_CAT: fsc.tolist(),
        FieldName.ITEM_ID: id,
    } for (target, start, fdr, fsc, id) in zip(
        test_target_values,
        dates,
        test_cal_features_list,
        stat_cat,
        train_ids,
    )]
    save_to_file(test_file, test_ds)
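
A hypothetical invocation of this variant; the paths, the frequency string, and the horizon below are assumptions, not values from the original code.

from pathlib import Path

generate_m5_dataset(
    dataset_path=Path("datasets/m5"),
    pandas_freq="D",
    prediction_length=28,
    m5_file_path=Path("downloads/m5"),
)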
Example 7
def generate_retail_dataset(dataset_path: Path, split: str = "2011-11-24"):
    retail_dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
    df = pd.read_excel(retail_dataset_url)
    combination = ["StockCode", "Country"]
    df = _preprocess_retail_data(df, combination)
    df.to_pickle("tmp/temp.pkl")
    # df = pd.read_pickle("temp.pkl")
    idx = pd.IndexSlice[:, :, :split]
    train_df = df.loc[idx, :].reset_index()
    idx = pd.IndexSlice[:, :, split:]
    test_df = df.loc[idx, :].reset_index()
    full_df = df.reset_index()
    single_prediction_length = len(test_df["InvoiceDate"].unique())
    feat_static_cat = combination
    feat_dynamic_real = ["UnitPrice"]
    target = "Quantity"
    date_col = "InvoiceDate"

    os.makedirs(dataset_path, exist_ok=True)

    uniq_combs = train_df[combination].drop_duplicates().apply(tuple, axis=1)
    dynamic_real_train_l = []
    dynamic_real_test_l = []
    stat_cat_l = []
    start_l = []
    train_target_l = []
    test_target_l = []
    for stock_code, country in tqdm(uniq_combs):
        df = train_df[
            (train_df.StockCode == stock_code) & (train_df.Country == country)
        ]
        _df = full_df[(full_df.StockCode == stock_code)
                      & (full_df.Country == country)]
        train_ts = _df[target].values.ravel()
        if (train_ts > 0).sum() > (single_prediction_length + 13):
            test_feat_dyn_array = _df.loc[:, feat_dynamic_real].values.T
            train_feat_dyn_array = test_feat_dyn_array[:, :-single_prediction_length]

            test_ts = train_ts.copy()
            train_ts = train_ts[:-single_prediction_length]

            dynamic_real_train_l.append(train_feat_dyn_array)
            dynamic_real_test_l.append(test_feat_dyn_array)
            start_l.append(df[date_col].min())
            train_target_l.append(train_ts)
            test_target_l.append(test_ts)
            stat_cat_l.append(
                np.squeeze(df.loc[:, feat_static_cat].drop_duplicates().values)
            )
    stat_cat_cardinalities = [
        len(full_df[col].unique()) for col in feat_static_cat
    ]

    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=stat_cat_cardinalities,
                    freq="1D",
                    prediction_length=single_prediction_length,
                )
            )
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"
    train_ds = [
        {
            FieldName.ITEM_ID: "|".join(map(str,uniq_comb)),
            FieldName.TARGET: target.tolist(),
            FieldName.START: str(start),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        }
        for uniq_comb, target, start, fdr, fsc in zip(
            uniq_combs, train_target_l, start_l, dynamic_real_train_l, stat_cat_l,
        )
    ]
    save_to_file(train_file, train_ds)
    test_ds = [
        {
            FieldName.ITEM_ID: "|".join(map(str,uniq_comb)),
            FieldName.TARGET: target.tolist(),
            FieldName.START: str(start),
            FieldName.FEAT_STATIC_CAT: fsc.tolist(),
            FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(),
        }
        for uniq_comb, target, start, fdr, fsc in zip(
            uniq_combs, test_target_l, start_l, dynamic_real_test_l, stat_cat_l,
        )
    ]
    save_to_file(test_file, test_ds)
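
A hedged sanity check for the generated splits, reading them back with the FileDataset class already used in Example 2 (the output path is an assumption, and the import location reflects recent GluonTS versions):

from pathlib import Path

from gluonts.dataset.common import FileDataset

dataset_path = Path("datasets/retail")
train = list(FileDataset(dataset_path / "train", freq="1D"))
test = list(FileDataset(dataset_path / "test", freq="1D"))

# Train and test cover the same StockCode/Country combinations, so the entry
# counts should match; test targets are longer by the held-out window.
assert len(train) == len(test)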
Example 8
def generate_m3_dataset(dataset_path: Path, m3_freq: str):
    from gluonts.dataset.repository.datasets import default_dataset_path

    m3_xls_path = default_dataset_path / "M3C.xls"
    if not os.path.exists(m3_xls_path):
        raise RuntimeError(
            "The M3 data is available at"
            " https://forecasters.org/resources/time-series-data/m3-competition/"
            f" Please download the M3C.xls file and copy it to: {m3_xls_path}")

    class M3Setting(NamedTuple):
        sheet_name: str
        prediction_length: int
        freq: str

    subsets = {
        "yearly": M3Setting("M3Year", 6, "12M"),
        "quarterly": M3Setting("M3Quart", 8, "3M"),
        "monthly": M3Setting("M3Month", 18, "1M"),
        "other": M3Setting("M3Other", 8, "3M"),
    }
    assert m3_freq.lower() in subsets, (
        f"invalid m3_freq='{m3_freq}'. Allowed values: {list(subsets)}")

    if m3_freq.lower() == "other":
        warnings.warn(
            "Be aware: The M3-other dataset does not have a known frequency. Since gluonts needs a known frequency, "
            "we will generate the dataset with an artificial `quarterly` frequency."
        )

    subset = subsets[m3_freq.lower()]
    df = pd.read_excel(m3_xls_path, sheet_name=subset.sheet_name)

    def truncate_trailing_nan(v: np.ndarray):
        last_finite_index = np.where(np.isfinite(v))[0][-1]
        return v[:last_finite_index + 1]

    train_data = []
    test_data = []

    def normalize_category(c: str):
        return c.strip()

    df["Category"] = df["Category"].apply(normalize_category)
    categories = list(df["Category"].unique())

    cat_map = {c: i for i, c in enumerate(categories)}

    for i, (_, row) in enumerate(df.iterrows()):
        vals = row.values
        series, n, nf, category, starting_year, starting_offset = vals[:6]
        target = np.asarray(vals[6:], dtype=np.float64)
        target = truncate_trailing_nan(target)
        assert len(target) == n
        assert nf == subset.prediction_length
        mock_start = "1750-01-01 00:00:00"
        if starting_year == 0:
            assert starting_offset == 0
            starting_year = mock_start
        s = pd.Timestamp(str(starting_year), freq=subset.freq)
        offset = max(starting_offset - 1, 0)
        if offset:
            s += offset * s.freq
        start = str(s).split(" ")[0]

        cat = [i, cat_map[category]]

        d_train = to_dict(
            target_values=target[:-subset.prediction_length],
            start=start,
            cat=cat,
            item_id=series,
        )
        train_data.append(d_train)

        d_test = to_dict(target_values=target,
                         start=start,
                         cat=cat,
                         item_id=series)
        test_data.append(d_test)

    os.makedirs(dataset_path, exist_ok=True)
    with open(dataset_path / "metadata.json", "w") as f:
        f.write(
            json.dumps(
                metadata(
                    cardinality=[len(train_data),
                                 len(categories)],
                    freq=subset.freq,
                    prediction_length=subset.prediction_length,
                )))

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    save_to_file(train_file, train_data)
    save_to_file(test_file, test_data)

    check_dataset(dataset_path, len(df))
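
Finally, a hypothetical call for the monthly subset (the output directory is an assumption). Note that the pd.Timestamp(..., freq=...) construction above relies on an older pandas API; recent pandas versions drop the freq argument, so on those a pd.Period-based start would be the closer equivalent.

from pathlib import Path

generate_m3_dataset(Path("datasets/m3_monthly"), "monthly")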