Example 1
def get_group_data_by_duplicate(name, num_duplicates, num_groups):
    dataset = get_dataset(name)
    dataset_group = [[] for i in range(num_groups)]
    whole_data_list = []
    no_duplicate_whole_data_list = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        no_duplicate_whole_data_list.append({
            "target": train_entry["target"],
            "start": train_entry["start"]
        })
        for j in range(num_duplicates):
            dataset_group[i % num_groups].append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
            whole_data_list.append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
    random.shuffle(whole_data_list)
    random.shuffle(no_duplicate_whole_data_list)
    ret.append(
        ListDataset(no_duplicate_whole_data_list, freq=dataset.metadata.freq))
    ret.append(ListDataset(whole_data_list, freq=dataset.metadata.freq))
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    return ret, dataset.metadata.freq
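These snippets are excerpts and omit their imports. A minimal preamble that would make Example 1 runnable might look like the sketch below; it assumes the GluonTS flavour of the API (Example 9 shows that the pts / PyTorchTS package exposes ListDataset and the estimators under the pts namespace as well), and the call at the end is only an illustrative usage, not part of the original script.

# Hedged sketch: imports and a sample call for get_group_data_by_duplicate.
import random

from gluonts.dataset.common import ListDataset
from gluonts.dataset.repository.datasets import get_dataset

datasets, freq = get_group_data_by_duplicate("electricity",
                                             num_duplicates=2,
                                             num_groups=4)
no_dup_ds, whole_ds, *group_ds = datasets
print(freq, len(group_ds))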
Example 2
def get_temperature_data(context_length=24,
                         prediction_length=4,
                         samples_per_ts=2000,
                         num_groups=8):
    ts_file = pd.read_csv("temperature.csv")
    city_names = [
        "Vancouver",
        "Los Angeles",
        "Las Vegas",
        "San Diego",
        "Philadelphia",
        "Montreal",
        "Boston",
        "Haifa",
    ]
    datetime = ts_file["datetime"]
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    for gid in range(num_groups):
        ts = ts_file[city_names[gid]]
        num_samples = 0
        index = 0
        while True:
            num_samples += 1
            index += 1
            ts_slice = torch.tensor(ts[index:index + context_length +
                                       prediction_length].values)
            nu = 1 + sum(ts_slice) / len(ts_slice)
            ts_slice /= nu
            if torch.sum(torch.isnan(ts_slice)).item() == 0:
                dataset_group[gid].append({
                    "target": ts_slice,
                    "start": pd.Timestamp(datetime[index]),
                })
                whole_data.append({
                    "target": ts_slice,
                    "start": pd.Timestamp(datetime[index]),
                })
            if num_samples == samples_per_ts:
                break
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq="1H")
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq="1H"))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_temperature_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_temperature_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
Example 3
def get_synthetic_data_mlp(model_name=None,
                           num_groups=8,
                           mean_boundary=0.5,
                           num_duplicates=16):
    assert num_groups > 1
    prediction_length = 1
    context_length = 12
    device = "cpu"
    dataset_group = []
    whole_data_list = []
    start = pd.Timestamp("01-01-2019", freq="1H")
    for gid in range(num_groups):
        net = SimpleFeedForwardEstimator(
            freq="1H",
            prediction_length=prediction_length,
            context_length=context_length,
        ).create_training_network(device)
        for p in net.parameters():
            p.data = torch.normal(0, 0.1, size=p.data.shape)
        pattern_group = []
        # for j in range(num_duplicates):
        # ts = torch.Uniform(0, 1, size=(1, context_length))
        while True:
            ts = torch.rand(size=(1, context_length))
            ts_slice = torch.Tensor(ts[0][-context_length:]).view(
                1, context_length)
            prediction = net.get_distr(ts_slice).sample((1000, ))
            prediction = sum(prediction) / len(prediction)
            if abs(torch.norm(prediction)) <= 1:
                break
        ts = torch.cat([ts, prediction], dim=1)
        ts = ts.view(len(ts[0]), )  # [context_length:]
        for j in range(num_duplicates):
            ts_sample = ts + torch.normal(0, 0.1, size=ts.shape)
            whole_data_list.append({"target": ts_sample, "start": start})
            pattern_group.append({"target": ts_sample, "start": start})
        dataset_group.append(ListDataset(pattern_group, freq="1H"))
    random.shuffle(whole_data_list)
    random.shuffle(dataset_group)
    dataset = ListDataset(whole_data_list, freq="1H")
    ret = []
    ret.append(dataset)
    ret.append(dataset)
    dataset_group = [dataset] + dataset_group
    dataset_group = [dataset] + dataset_group

    # save to files
    with open("synthetic_mlp_whole_data.csv", "wb") as output:
        pickle.dump(ret, output)

    with open("synthetic_mlp_group_data.csv", "wb") as output:
        pickle.dump(dataset_group, output)
    return True
Example 4
def test_multivariate_grouper_train(univariate_ts, multivariate_ts,
                                    train_fill_rule) -> None:
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts,
                                  freq="1D",
                                  one_dim_target=False)

    grouper = MultivariateGrouper(train_fill_rule=train_fill_rule)
    assert (list(grouper(univariate_ds))[0]["target"] == list(multivariate_ds)
            [0]["target"]).all()

    assert list(grouper(univariate_ds))[0]["start"] == list(
        multivariate_ds)[0]["start"]
Example 5
def get_synthetic_data_sin(model_name=None,
                           num_groups=32,
                           mean_boundary=1,
                           num_duplicates=50):
    assert num_groups > 1
    num_time_steps = 100

    dataset_group = []
    whole_data_list = []
    no_duplicate_whole_data_list = []
    start = pd.Timestamp("01-01-2019", freq="1D")
    for gid in range(num_groups):
        mean = (gid + 1) * mean_boundary
        base = np.linspace(0, mean, num_time_steps)
        pattern_group = []
        ts = (gid + 1) * torch.sin(torch.FloatTensor(base)).view(
            1, num_time_steps)
        ts += torch.FloatTensor((gid + 1) * base).view(1, num_time_steps)
        no_duplicate_whole_data_list.append({
            "target": ts.view(len(ts[0]), ),
            "start": start,
        })
        for j in range(num_duplicates):
            # out-of-place add: an in-place `ts += ...` would also mutate the
            # views already appended in earlier iterations
            ts = ts + torch.normal(0, 0.1, size=ts.shape)
            whole_data_list.append({
                "target": ts.view(len(ts[0]), ),
                "start": start,
            })
            pattern_group.append({
                "target": ts.view(len(ts[0]), ),
                "start": start,
            })
        dataset_group.append(ListDataset(pattern_group, freq="1D"))

    random.shuffle(whole_data_list)
    random.shuffle(no_duplicate_whole_data_list)
    ret_whole_dataset = []
    dataset = ListDataset(whole_data_list, freq="1D")
    no_duplicate_dataset = ListDataset(no_duplicate_whole_data_list, freq="1D")
    # the de-duplicated dataset goes first (otherwise no_duplicate_dataset is unused)
    ret_whole_dataset.append(no_duplicate_dataset)
    ret_whole_dataset.append(dataset)
    dataset_group = [dataset] + dataset_group
    dataset_group = [dataset] + dataset_group

    # save to files
    with open("synthetic_complexsin_whole_data.csv", "wb") as output:
        pickle.dump(ret_whole_dataset, output)

    with open("synthetic_complexsin_group_data.csv", "wb") as output:
        pickle.dump(dataset_group, output)
    return True
Example 6
def get_synthetic_data(model_name=None, num_groups=8, mean_boundary=1):
    assert num_groups > 1
    prediction_length = 1
    context_length = 5
    num_time_steps = 1
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = "cpu"
    net = SimpleFeedForwardEstimator(
        freq="1H",
        prediction_length=prediction_length,
        context_length=context_length,
    ).create_training_network(device)
    delta = 2 * mean_boundary / num_groups
    dataset_group = []
    whole_data_list = []
    start = pd.Timestamp("01-01-2019", freq="1H")
    for gid in range(num_groups):
        parameter_mean = -mean_boundary + gid * delta
        # change the parameters of the model
        for p in net.parameters():
            p.data = torch.normal(parameter_mean, 0.1, size=p.data.shape)
        ts = torch.normal(0, 0.1, size=(1, context_length))
        for num_ts in range(num_time_steps):
            ts_slice = torch.Tensor(ts[0][-context_length:]).view(
                1, context_length)
            prediction = net.get_distr(ts_slice).sample((5000, ))
            prediction = sum(prediction) / len(prediction)
            ts = torch.cat([ts, prediction], dim=1)
        whole_data_list.append({
            "target": ts.view(len(ts[0]), )[context_length:],
            "start": start,
        })
        dataset_group.append(
            ListDataset(
                [{
                    "target": ts.view(len(ts[0]), )[context_length:],
                    "start": start,
                }],
                freq="1H",
            ))
    dataset = ListDataset(whole_data_list, freq="1H")
    dataset_group = [dataset] + dataset_group

    # save to files
    with open("synthetic_whole_data.csv", "wb") as output:
        pickle.dump(dataset, output)

    with open("synthetic_group_data.csv", "wb") as output:
        pickle.dump(dataset_group, output)
    return True
Example 7
    def make_test_data():
        target = np.array(
            [0, 0, 0, 0, 10, 10, 20, 20, 30, 30, 40, 50, 59, 60, 60, 70, 80, 90, 100,]
        ).tolist()

        np.random.shuffle(target)

        multi_dim_target = np.array([target, target]).transpose()

        past_is_pad = np.array([[0] * len(target)]).transpose()

        past_observed_target = np.array(
            [[1] * len(target), [1] * len(target)]
        ).transpose()

        ds = ListDataset(
            # Mimic output from InstanceSplitter
            data_iter=[
                {
                    "start": "2012-01-01",
                    "target": multi_dim_target,
                    "past_target": multi_dim_target,
                    "future_target": multi_dim_target,
                    "past_is_pad": past_is_pad,
                    f"past_{FieldName.OBSERVED_VALUES}": past_observed_target,
                }
            ],
            freq="1D",
            one_dim_target=False,
        )
        return ds
Example 8
def get_group_data_by_var(name, num_groups, len_sample=9):
    dataset = get_dataset(name)
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    group_boundary = [1e3, 5e3, 1e4, 5e4, 1e5, 5e5]
    for i in range(num_ts):
        train_entry = next(it)
        unsplit_ts = train_entry["target"][0:800]
        unsplit_start = train_entry["start"]
        whole_data.append({"target": unsplit_ts, "start": unsplit_start})
        for ts_sample_start in range(len(unsplit_ts) - len_sample):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            # assumed grouping rule: bucket each slice by its variance
            # against the group_boundary thresholds defined above
            var = torch.var(torch.FloatTensor(ts_slice)).item()
            group_id = 0
            for boundary in group_boundary:
                if var > boundary:
                    group_id += 1
            group_id = min(group_id, num_groups - 1)
            dataset_group[group_id].append({
                "target": ts_slice,
                "start": unsplit_start,
            })
            unsplit_start += pd.Timedelta(hours=1)
    random.shuffle(whole_data)
    print("append once")
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    print("append twice")
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    print("append data")
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    print("write whole data")
    with open("synthetic_traffic_time_whole_data.csv", "wb") as output:
        pickle.dump(ret[0:2], output)
    print("write group data")
    with open("synthetic_traffic_time_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
Example 9
def GlounTS():
    from pts.dataset import ListDataset
    from pts.model.deepar import DeepAREstimator
    from pts import Trainer
    from pts.dataset import to_pandas
    # gluonts crashes on my system.
    # from gluonts.dataset.common import ListDataset
    # from gluonts.model.deepar import DeepAREstimator
    # from gluonts.trainer import Trainer
    training_data = ListDataset(
        [{"start": df.index[0], "target": df.value[:"2015-03-08 23:22:53"]}],
        freq="5min")
    estimator = DeepAREstimator(freq="5min",
                                input_size=43,
                                prediction_length=forecast_size,
                                trainer=Trainer(epochs=20))
    predictor = estimator.train(training_data=training_data)
    test_data = ListDataset(
        [{"start": df.index[0], "target": df.value[:"2015-03-08 23:22:53"]}],
        freq="5min")
    GluonTS_prediction = next(predictor.predict(test_data))
    GluonTS_mean_yhat = GluonTS_prediction.mean
    GluonTS_median_yhat = GluonTS_prediction.median
    return GluonTS_mean_yhat.tolist(), GluonTS_median_yhat.tolist(), GluonTS_prediction
Example 10
def test_multivariate_grouper_test(univariate_ts, multivariate_ts,
                                   test_fill_rule, max_target_dim) -> None:
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts,
                                  freq="1D",
                                  one_dim_target=False)

    grouper = MultivariateGrouper(
        test_fill_rule=test_fill_rule,
        num_test_dates=2,
        max_target_dim=max_target_dim,
    )

    for grouped_data, multivariate_data in zip(grouper(univariate_ds),
                                               multivariate_ds):
        assert (grouped_data["target"] == multivariate_data["target"]).all()

        assert grouped_data["start"] == multivariate_data["start"]
Example 11
def get_amazon_sales():
    f = open("./dataset/sgc_train.json", "r", encoding="utf-8")
    dataset_group = [[] for i in range(8)]
    whole_data = []
    ret = dict()
    X = []
    Y = []
    # split_grid = [0.04, 0.1, 0.25, 0.5, 1, 10, 100, 5000]
    for line in f.readlines():
        # gid = 0
        dic = json.loads(line)
        var = torch.var(torch.FloatTensor(dic["target"])).item()
        if var > 5000:
            continue
        nu = 1 + sum(dic["target"]) / len(dic["target"])
        ts = [i / nu for i in dic["target"]]
        start = dic["start"]
        # ts = dic['target']
        if len(ts) < 28:
            continue
        X.append((ts, var, start))
    # sort by variance, then split into 8 equally sized groups;
    # each series keeps its own start timestamp
    X = sorted(X, key=lambda x: x[1])
    length = int(len(X) / 8)
    for gid in range(8):
        for j in range(gid * length, (gid + 1) * length):
            ts_j, _, start_j = X[j]
            whole_data.append({"target": ts_j, "start": start_j})
            dataset_group[gid].append({"target": ts_j, "start": start_j})
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq="1H")
    group_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_list.append(ListDataset(group, freq="1H"))
    ret["group_data"] = group_list
    print("write whole data")
    with open("synthetic_sales_time_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_sales_time_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
Example 12
def get_whole_data(name):
    dataset = get_dataset(name)
    dataset_group = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        dataset_group.append({
            "target": train_entry["target"],
            "start": train_entry["start"]
        })
    return ListDataset(dataset_group, freq=dataset.metadata.freq)
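get_whole_data simply repackages the training split of a GluonTS repository dataset into a single ListDataset. A short, hedged usage sketch (the dataset name and the to_pandas inspection are assumptions, not part of the original):

# Hedged usage sketch for get_whole_data.
from gluonts.dataset.util import to_pandas

whole = get_whole_data("m4_hourly")
first_series = to_pandas(next(iter(whole)))
print(len(first_series), first_series.index[0])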
Example 13
def test_Transformation():
    train_length = 100
    ds = ListDataset(
        [{"start": "2012-01-01", "target": [0.2] * train_length}], freq="1D"
    )

    pred_length = 10

    t = transform.Chain(
        trans=[
            transform.AddTimeFeatures(
                start_field=FieldName.START,
                target_field=FieldName.TARGET,
                output_field="time_feat",
                time_features=[
                    time_feature.DayOfWeek(),
                    time_feature.DayOfMonth(),
                    time_feature.MonthOfYear(),
                ],
                pred_length=pred_length,
            ),
            transform.AddAgeFeature(
                target_field=FieldName.TARGET,
                output_field="age",
                pred_length=pred_length,
                log_scale=True,
            ),
            transform.AddObservedValuesIndicator(
                target_field=FieldName.TARGET, output_field="observed_values"
            ),
            transform.VstackFeatures(
                output_field="dynamic_feat",
                input_fields=["age", "time_feat"],
                drop_inputs=True,
            ),
            transform.InstanceSplitter(
                target_field=FieldName.TARGET,
                is_pad_field=FieldName.IS_PAD,
                start_field=FieldName.START,
                forecast_start_field=FieldName.FORECAST_START,
                train_sampler=transform.ExpectedNumInstanceSampler(
                    num_instances=4
                ),
                past_length=train_length,
                future_length=pred_length,
                time_series_fields=["dynamic_feat", "observed_values"],
            ),
        ]
    )

    for u in t(iter(ds), is_train=True):
        print(u)
Example 14
def make_dataset(N, train_length):
    # generates 2 ** N - 1 timeseries with constant increasing values
    n = 2 ** N - 1
    targets = np.ones((n, train_length))
    for i in range(0, n):
        targets[i, :] = targets[i, :] * i

    ds = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": targets[i, :]} for i in range(n)],
        freq="1D",
    )

    return ds
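Since make_dataset produces 2 ** N - 1 constant series whose values are 0, 1, ..., 2 ** N - 2, a quick hedged check of its output could look like this (the concrete N and train_length are arbitrary):

# Hedged usage sketch for make_dataset.
ds = make_dataset(N=3, train_length=5)
entries = list(ds)
assert len(entries) == 2 ** 3 - 1         # 7 series
assert (entries[4]["target"] == 4).all()  # series i is constant at value i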
Example 15
def point_process_dataset():

    ia_times = np.array([0.2, 0.7, 0.2, 0.5, 0.3, 0.3, 0.2, 0.1])
    marks = np.array([0, 1, 2, 0, 1, 2, 2, 2])

    lds = ListDataset(
        [{
            "target": np.c_[ia_times, marks].T,
            "start": pd.Timestamp("2011-01-01 00:00:00", freq="H"),
            "end": pd.Timestamp("2011-01-01 03:00:00", freq="H"),
        }],
        freq="H",
        one_dim_target=False,
    )

    return lds
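In this fixture the two rows of the target hold inter-arrival times and marks respectively, so with one_dim_target=False each entry comes back as a 2 x 8 array. A brief hedged check:

# Hedged usage sketch for point_process_dataset.
lds = point_process_dataset()
entry = next(iter(lds))
assert entry["target"].shape == (2, 8)  # row 0: inter-arrival times, row 1: marks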
Example 16
def test_target_dim_indicator():
    target = np.array([0, 2, 3, 10]).tolist()

    multi_dim_target = np.array([target, target, target, target])
    dataset = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": multi_dim_target}],
        freq="1D",
        one_dim_target=False,
    )

    t = transform.Chain(
        trans=[
            transform.TargetDimIndicator(
                target_field=FieldName.TARGET, field_name="target_dimensions"
            )
        ]
    )

    for data_entry in t(dataset, is_train=True):
        assert (data_entry["target_dimensions"] == np.array([0, 1, 2, 3])).all()
Example 17
def group_electricity_cv(
    num_ts=10,
    num_groups=14,
    context_length=72,
    prediction_length=12,
    file_name="default",
):
    dataset = get_dataset("electricity", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    date_checkpoint = [
        "2012-03-01",
        "2012-06-01",
        "2012-09-01",
        "2012-12-01",
        "2013-03-01",
        "2013-06-01",
        "2013-09-01",
        "2013-12-01",
        "2014-03-01",
    ]
    # get ready the training data
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4

        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)

    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })

    print(
        "Generating the electricity training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data,
                                    freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing of the electricity dataset")
    return True

    dataset = get_dataset("traffic")
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["2016-01-01"]
    # get ready the training data
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4

        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)

    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })

    print("total number of training examples: ", len(train_full_data))
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    print("ratio for each group: ", ret["group_ratio"])
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data,
                                    freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    return True
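group_electricity_cv and the traffic variant above both persist everything for one run as a single pickled dict, so downstream code only needs to read one file back. A hedged sketch of loading it (the file name is whatever was passed as file_name):

# Hedged sketch: reload the dict pickled by group_electricity_cv.
import pickle

with open("./dataset/default.csv", "rb") as f:
    ret = pickle.load(f)

train_ds = ret["whole_data"]   # ListDataset over all training slices
val_ds = ret["val_data"]       # ListDataset over the held-out test slices
groups = ret["group_data"]     # list of per-group ListDataset objects
print(ret["group_ratio"])      # fraction of training slices in each group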
Example 18
def group_exchangerate_cv(
    num_ts=10,
    num_groups=14,
    context_length=15,
    prediction_length=10,
    file_name="default",
):
    dataset = get_dataset("exchange_rate", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["1994-01-01", "1998-01-01", "2002-01-01"]
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = i * 4 + sid
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta("1D") * prediction_length
    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print(
        "Generating the exchange rate training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data,
                                    freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing the exchange rate dataset")
    return True
Example 19
def get_mixed_pattern(unit_length=16, num_duplicates=1000):
    freq = "1H"
    context_length = 3 * unit_length
    prediction_length = unit_length
    len_sample = context_length + prediction_length
    dataset_group = [[] for j in range(16)]
    whole_data = []
    val_data = []
    ret = dict()
    start = pd.Timestamp("01-01-2000", freq=freq)
    patterns = [
        ["sin", "linear", "quadratic", "sqrt"],
        ["sqrt", "quadratic", "linear", "sin"],
        ["linear", "sqrt", "sin", "quadratic"],
        ["quadratic", "sin", "sqrt", "linear"],
    ]
    pattern_number = 4
    for m, pattern in enumerate(patterns):
        for gid in range(pattern_number):
            for j in range(num_duplicates):
                context = torch.arange(context_length, dtype=torch.float)
                for i in range(1, pattern_number):
                    seg = slice(unit_length * (i - 1), unit_length * i)
                    context[seg] = _get_mixed_pattern(
                        context[seg] - unit_length * (i - 1),
                        pattern[(gid + i) % pattern_number],
                    )
                ts_sample = torch.cat([
                    context,
                    _get_mixed_pattern(
                        torch.arange(prediction_length, dtype=torch.float),
                        pattern[gid],
                    ),
                ])
                whole_data.append({"target": ts_sample, "start": start})
                if j % 5 == 0:
                    val_data.append({
                        "target": ts_sample + torch.normal(0, 1, ts_sample.shape),
                        "start": start,
                    })
                dataset_group[m * 4 + gid].append({
                    "target": ts_sample,
                    "start": start
                })
    print(
        "Generating the synthetic training data, the total number of training examples:",
        len(whole_data),
    )
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    random.shuffle(whole_data)
    group_data = []
    ret["whole_data"] = ListDataset(whole_data, freq=freq)
    ret["val_data"] = ListDataset(val_data, freq=freq)
    for group in dataset_group:
        random.shuffle(group)
        group_data.append(ListDataset(group, freq=freq))
    ret["group_data"] = group_data
    # save to files
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/synthetic.csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished the pre-processing of synthetic dataset")

    return True
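The helper _get_mixed_pattern is not shown in this excerpt; from its call sites it maps a 1-D tensor of time offsets to one of the four named shapes. A hypothetical implementation consistent with those calls (the exact scaling is an assumption):

# Hypothetical sketch of the missing _get_mixed_pattern helper.
import torch

def _get_mixed_pattern(x, pattern):
    # x: 1-D float tensor of offsets; pattern: "sin", "linear", "quadratic" or "sqrt"
    if pattern == "sin":
        return torch.sin(x)
    elif pattern == "linear":
        return x
    elif pattern == "quadratic":
        return x ** 2
    elif pattern == "sqrt":
        return torch.sqrt(torch.clamp(x, min=0))
    raise ValueError("unknown pattern: " + pattern)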
Example 20
def test_multi_dim_transformation(is_train):
    train_length = 10

    first_dim: list = list(np.arange(1, 11, 1))
    first_dim[-1] = "NaN"

    second_dim: list = list(np.arange(11, 21, 1))
    second_dim[0] = "NaN"

    ds = ListDataset(
        data_iter=[{
            "start": "2012-01-01",
            "target": [first_dim, second_dim]
        }],
        freq="1D",
        one_dim_target=False,
    )
    pred_length = 2

    # Looks weird - but this is necessary to assert the nan entries correctly.
    first_dim[-1] = np.nan
    second_dim[0] = np.nan

    t = transform.Chain(trans=[
        transform.AddTimeFeatures(
            start_field=FieldName.START,
            target_field=FieldName.TARGET,
            output_field="time_feat",
            time_features=[
                time_feature.DayOfWeek(),
                time_feature.DayOfMonth(),
                time_feature.MonthOfYear(),
            ],
            pred_length=pred_length,
        ),
        transform.AddAgeFeature(
            target_field=FieldName.TARGET,
            output_field="age",
            pred_length=pred_length,
            log_scale=True,
        ),
        transform.AddObservedValuesIndicator(
            target_field=FieldName.TARGET,
            output_field="observed_values",
            convert_nans=False,
        ),
        transform.VstackFeatures(
            output_field="dynamic_feat",
            input_fields=["age", "time_feat"],
            drop_inputs=True,
        ),
        transform.InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            train_sampler=transform.ExpectedNumInstanceSampler(
                num_instances=4),
            past_length=train_length,
            future_length=pred_length,
            time_series_fields=["dynamic_feat", "observed_values"],
            time_first=False,
        ),
    ])

    if is_train:
        for u in t(iter(ds), is_train=True):
            assert_shape(u["past_target"], (2, 10))
            assert_shape(u["past_dynamic_feat"], (4, 10))
            assert_shape(u["past_observed_values"], (2, 10))
            assert_shape(u["future_target"], (2, 2))

            assert_padded_array(
                u["past_observed_values"],
                np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9]),
                u["past_is_pad"],
            )
            assert_padded_array(
                u["past_target"],
                np.array([first_dim, second_dim]),
                u["past_is_pad"],
            )
    else:
        for u in t(iter(ds), is_train=False):
            assert_shape(u["past_target"], (2, 10))
            assert_shape(u["past_dynamic_feat"], (4, 10))
            assert_shape(u["past_observed_values"], (2, 10))
            assert_shape(u["future_target"], (2, 0))

            assert_padded_array(
                u["past_observed_values"],
                np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9]),
                u["past_is_pad"],
            )
            assert_padded_array(
                u["past_target"],
                np.array([first_dim, second_dim]),
                u["past_is_pad"],
            )
Example 21
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

url = "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv"
path = "/Users/seenli/Documents/workspace/code/pytorch_learn2/time_series_DL/Twitter_volume_AMZN.csv"
# df = pd.read_csv(url, header=0, index_col=0, parse_dates=True)
df = pd.read_csv(path, header=0, index_col=0, parse_dates=True)

df[:100].plot(linewidth=2)
plt.grid(which='both')
plt.show()

training_data = ListDataset([{
    "start": df.index[0],
    "target": df.value[:"2015-04-05 00:00:00"]
}],
                            freq="5min")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

estimator = DeepAREstimator(freq="5min",
                            prediction_length=12,
                            input_size=43,
                            trainer=Trainer(epochs=20, device=device))

# print('.....')

predictor = estimator.train(training_data=training_data)

test_data = ListDataset(
Example 22
def KMeans_m5_dataset(
    num_ts_=1,
    num_groups=16,
    context_length=72,
    prediction_length=24,
    file_name="default",
):
    df = pd.read_csv("sales_train_evaluation.csv")
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    num_ts = num_ts_
    len_sample = context_length + prediction_length
    # compute mean and variance
    df = df.iloc[:num_ts, :]
    df["mean"] = df.iloc[:, 1947 - len_sample:1947].mean(axis=1)
    df["var"] = df.iloc[:, 1947 - len_sample:1947].var(axis=1)
    df["mean"] = (df["mean"] - df["mean"].min()) / (df["mean"].max() -
                                                    df["mean"].min())
    df["var"] = (df["var"] - df["var"].min()) / (df["var"].max() -
                                                 df["var"].min())
    df_feature = df.iloc[:, 2:6]
    df_feature = pd.get_dummies(df_feature, dummy_na=True)
    df_feature = pd.concat([df_feature, df.iloc[:, -2]], axis=1)
    feature = torch.from_numpy(df_feature.to_numpy()).contiguous()
    cl, c = KMeans(feature, num_groups)
    # print(cl)
    # import pdb;pdb.set_trace()
    sample_id = 0
    for i in range(num_ts):
        ts_slice = df.iloc[i:i + 1, 1947 - len_sample:1947]
        ts_slice = torch.from_numpy(ts_slice.to_numpy())[0]
        # print(ts_slice)
        # import pdb;pdb.set_trace()
        gid = cl[sample_id]
        unsplit_start = pd.Timestamp("1990-01-01")
        dataset_group[gid].append({
            "target": ts_slice,
            "start": unsplit_start,
            # "feat_static_cat": train_entry["feat_static_cat"],
        })
        whole_data.append({
            "target": ts_slice,
            "start": unsplit_start,
            # "feat_static_cat": train_entry["feat_static_cat"],
        })
        sample_id += 1
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq="1H")
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq="1H"))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
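The KMeans helper called here (and again in Example 24) is not part of GluonTS; it is expected to take a (num_samples, num_features) tensor and return per-sample cluster labels plus the centroids. A hypothetical torch-only sketch honouring that contract (initialisation and iteration count are assumptions):

# Hypothetical sketch of the KMeans(feature, num_groups) helper assumed above.
import torch

def KMeans(x, k, num_iters=20):
    x = x.float()
    centroids = x[torch.randperm(x.shape[0])[:k]]  # random initial centroids
    for _ in range(num_iters):
        dists = torch.cdist(x, centroids)          # (n, k) pairwise distances
        labels = dists.argmin(dim=1)
        for j in range(k):
            members = x[labels == j]
            if len(members) > 0:                   # keep old centroid if cluster is empty
                centroids[j] = members.mean(dim=0)
    return labels, centroids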
Example 23
def get_synthetic_data_linear(
    context_length=24,
    prediction_length=8,
    num_groups=8,
    steps_per_ts=1,
    num_duplicates=16,
):
    assert num_groups > 1
    freq = "1H"
    len_sample = context_length + prediction_length

    dataset_group = []
    whole_data = []
    ret = dict()
    start = pd.Timestamp("01-01-2000", freq=freq)
    for gid in range(num_groups):
        model1 = torch.nn.Linear(context_length, prediction_length)
        model2 = torch.nn.Linear(context_length, prediction_length)
        # model1 = torch.sin
        # model2 = torch.cos
        pattern_group1 = []
        pattern_group2 = []
        sample_context = torch.rand(context_length)
        for t_step in range(2 * steps_per_ts):
            while True:
                with torch.no_grad():
                    if t_step <= steps_per_ts:
                        prediction = model1(sample_context)
                    else:
                        prediction = model2(sample_context)
                    if (
                            torch.norm(prediction) < prediction_length
                    ):  # and prediction_length*0.1 < torch.norm(prediction):
                        # prediction = torch.sin(prediction)
                        # prediction /= torch.max(prediction)
                        break
                    # prediction *= 10
                    # prediction += torch.normal(0, 0.1, size=prediction.shape)
            ts_sample = torch.cat([sample_context, prediction])
            # print(ts_sample)
            for j in range(num_duplicates):
                # out-of-place add so earlier appended duplicates keep their
                # own values instead of aliasing the mutated tensor
                ts_sample = ts_sample + torch.normal(0, 0.1, size=ts_sample.shape)
                whole_data.append({"target": ts_sample, "start": start})
                if t_step <= steps_per_ts:
                    pattern_group1.append({
                        "target": ts_sample,
                        "start": start
                    })
                else:
                    pattern_group2.append({
                        "target": ts_sample,
                        "start": start
                    })
            sample_context = ts_sample[-context_length:]
            start += pd.Timedelta(hours=prediction_length)
        dataset_group.append(ListDataset(pattern_group1, freq=freq))
        """
        dataset_group.append(
            ListDataset(
                pattern_group2,
                freq=freq
            )
        )
        """
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=freq)
    ret["group_data"] = dataset_group

    # save to files
    with open("synthetic_linear_new_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)

    with open("synthetic_linear_new_group_data.csv", "wb") as output:
        pickle.dump(ret, output)

    return True
Example 24
def KMeans_inside_dataset(
    num_ts_=1,
    num_groups=16,
    context_length=72,
    prediction_length=24,
    file_name="default",
):
    dataset = get_dataset("traffic")
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    it = iter(dataset.train)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    num_ts = num_ts_
    len_sample = context_length + prediction_length
    index = 0
    feature = torch.Tensor([])
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]

        for ts_sample_start in range(0,
                                     len(target) - len_sample,
                                     prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            feature = torch.cat((
                feature,
                torch.Tensor([
                    ts_slice.mean(),
                    ts_slice.var(),
                    index % 7,
                    index // 90,
                ]),
            ))
            index += 1
    feature = feature.reshape(index, 4)
    feature = _get_pre_features(feature).contiguous()
    # print(feature)
    # import pdb;pdb.set_trace()
    cl, c = KMeans(feature, num_groups)
    it = iter(dataset.train)
    sample_id = 0
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0,
                                     len(target) - len_sample,
                                     prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            gid = cl[sample_id]
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            whole_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
            sample_id += 1
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
Example 25
def get_m4_by_freq(
    context_length=72,
    prediction_length=24,
    len_per_ts=200,
    num_ts=50,
    num_groups=6,
    file_name="m4_freq",
):
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    datasets_name = [
        "m4_hourly",
        "m4_daily",
        "m4_weekly",
        "m4_monthly",
        "m4_quarterly",
        "m4_yearly",
    ]
    hours_factor = [
        1,
        24,
        24 * 7,
        24 * 7 * 30,
        24 * 7 * 30 * 3,
        24 * 7 * 30 * 3 * 4,
    ]
    for i in range(num_groups):
        dataset = get_dataset(datasets_name[i])
        len_sample = context_length + prediction_length
        it = iter(dataset.train)
        for j in range(num_ts):
            train_entry = next(it)
            unsplit_ts = train_entry["target"]
            # unsplit_start = train_entry['start']
            unsplit_start = pd.Timestamp("1990-01-01")
            for ts_sample_start in range(0, len_per_ts - len_sample,
                                         prediction_length):
                if len_sample > len(unsplit_ts):
                    continue
                ts_slice = unsplit_ts[ts_sample_start:ts_sample_start +
                                      len_sample]
                if len(ts_slice) < len_sample:
                    continue
                nu = 1 + sum(ts_slice) / len_sample
                ts_slice = [i / nu for i in ts_slice]
                whole_data.append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                dataset_group[i].append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                # unsplit_start += pd.Timedelta(hours=prediction_length*hours_factor[i])
                unsplit_start += pd.Timedelta(hours=prediction_length)
    # for j in range(len(dataset_group)):
    #    print(len(dataset_group[i]))
    # import pdb;pdb.set_trace()
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True