Example #1
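All of the examples below assume roughly the following imports; this is a sketch, and the exact GluonTS module paths may differ between versions:

import os
import pickle
import random

import numpy as np
import pandas as pd
import torch

from gluonts.dataset.common import ListDataset
from gluonts.dataset.repository.datasets import get_dataset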
def get_group_data_by_duplicate(name, num_duplicates, num_groups):
    dataset = get_dataset(name)
    dataset_group = [[] for i in range(num_groups)]
    whole_data_list = []
    no_duplicate_whole_data_list = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        no_duplicate_whole_data_list.append({
            "target": train_entry["target"],
            "start": train_entry["start"]
        })
        for j in range(num_duplicates):
            dataset_group[i % num_groups].append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
            whole_data_list.append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
    random.shuffle(whole_data_list)
    random.shuffle(no_duplicate_whole_data_list)
    ret.append(
        ListDataset(no_duplicate_whole_data_list, freq=dataset.metadata.freq))
    ret.append(ListDataset(whole_data_list, freq=dataset.metadata.freq))
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    return ret, dataset.metadata.freq
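A minimal usage sketch for the function above; the dataset name and the duplicate/group counts are illustrative:

# Hypothetical call: returns one de-duplicated dataset, one duplicated dataset,
# and num_groups shuffled group datasets, plus the dataset frequency.
datasets, freq = get_group_data_by_duplicate("electricity",
                                             num_duplicates=2,
                                             num_groups=4)
no_dup_data, dup_data, group_datasets = datasets[0], datasets[1], datasets[2:]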
Example #2
def get_whole_data(name):
    dataset = get_dataset(name)
    dataset_group = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        dataset_group.append({
            "target": train_entry["target"],
            "start": train_entry["start"]
        })
    return ListDataset(dataset_group, freq=dataset.metadata.freq)
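For reference, a usage sketch of get_whole_data; the dataset name is illustrative:

# Hypothetical call: wraps every training series of the "traffic" dataset into a
# single ListDataset, without grouping or shuffling.
whole = get_whole_data("traffic")
print(sum(1 for _ in whole))  # number of series in the resulting ListDataset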
Example #3
def get_group_data_by_var(name, num_groups, len_sample=9):
    dataset = get_dataset(name)
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    group_boundary = [1e3, 5e3, 1e4, 5e4, 1e5, 5e5]
    for i in range(num_ts):
        train_entry = next(it)
        unsplit_ts = train_entry["target"][0:800]
        unsplit_start = train_entry["start"]
        whole_data.append({"target": unsplit_ts, "start": unsplit_start})
        for ts_sample_start in range(len(unsplit_ts) - len_sample):
            sample = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            # The original left only debugging code here (a variance print,
            # ``continue`` and ``pdb.set_trace()``); the grouping below restores
            # the apparent intent: bucket each window by its variance using
            # ``group_boundary`` (assumes num_groups == len(group_boundary) + 1).
            sample_var = torch.var(torch.FloatTensor(sample)).item()
            group_id = 0
            for boundary in group_boundary:
                if sample_var > boundary:
                    group_id += 1
            dataset_group[group_id].append({
                "target": sample,
                "start": unsplit_start,
            })
            unsplit_start += pd.Timedelta(hours=1)
    random.shuffle(whole_data)
    print("append once")
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    print("append twice")
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    print("append data")
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    print("write whole data")
    with open("synthetic_traffic_time_whole_data.csv", "wb") as output:
        pickle.dump(ret[0:2], output)
    print("write group data")
    with open("synthetic_traffic_time_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
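A usage sketch, assuming the variance bucketing restored above; the arguments are illustrative, and num_groups=7 matches the six boundaries in group_boundary:

# Hypothetical call: buckets hourly windows by variance and pickles the results
# to the hard-coded synthetic_traffic_time_*.csv file names.
get_group_data_by_var("traffic", num_groups=7, len_sample=9)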
Example #4
def group_electricity_cv(
    num_ts=10,
    num_groups=14,
    context_length=72,
    prediction_length=12,
    file_name="default",
):
    dataset = get_dataset("electricity", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    date_checkpoint = [
        "2012-03-01",
        "2012-06-01",
        "2012-09-01",
        "2012-12-01",
        "2013-03-01",
        "2013-06-01",
        "2013-09-01",
        "2013-12-01",
        "2014-03-01",
    ]
    # get ready the training data
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4

        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)

    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })

    print(
        "Generating the electricity training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data,
                                    freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing of the electricity dataset")
    return True

    dataset = get_dataset("traffic")
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["2016-01-01"]
    # get ready the training data
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4

        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)

    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })

    print("total number of training examples: ", len(train_full_data))
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    print("ratio for each group: ", ret["group_ratio"])
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data,
                                    freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    return True
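A usage sketch for the two routines above (group_traffic_cv is the name reconstructed for the previously headerless block); the group id combines a weekday index with a date segment, so num_groups should be 7 * (len(date_checkpoint) + 1):

# Hypothetical calls: 7 * 10 = 70 groups for electricity (9 checkpoints),
# 7 * 2 = 14 groups for traffic (1 checkpoint); other arguments are illustrative.
group_electricity_cv(num_ts=50, num_groups=70, context_length=72,
                     prediction_length=12, file_name="electricity_cv")
group_traffic_cv(num_ts=50, num_groups=14, context_length=72,
                 prediction_length=12, file_name="traffic_cv")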
Example #5
def group_exchangerate_cv(
    num_ts=10,
    num_groups=14,
    context_length=15,
    prediction_length=10,
    file_name="default",
):
    dataset = get_dataset("exchange_rate", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["1994-01-01", "1998-01-01", "2002-01-01"]
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = i * 4 + sid
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta("1D") * prediction_length
    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0,
                                     len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print(
        "Generating the exchange rate training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data,
                                    freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing the exchange rate dataset")
    return True
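A usage sketch with illustrative arguments; since gid = i * 4 + sid with sid in 0..3, num_groups needs to be at least 4 * num_ts:

# Hypothetical call: 8 exchange-rate series with up to 4 date segments each,
# hence 32 groups.
group_exchangerate_cv(num_ts=8, num_groups=32, context_length=15,
                      prediction_length=10, file_name="exchange_rate_cv")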
Example #6
def get_m4_by_freq(
    context_length=72,
    prediction_length=24,
    len_per_ts=200,
    num_ts=50,
    num_groups=6,
    file_name="m4_freq",
):
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    datasets_name = [
        "m4_hourly",
        "m4_daily",
        "m4_weekly",
        "m4_monthly",
        "m4_quarterly",
        "m4_yearly",
    ]
    hours_factor = [
        1,
        24,
        24 * 7,
        24 * 7 * 30,
        24 * 7 * 30 * 3,
        24 * 7 * 30 * 3 * 4,
    ]
    for i in range(num_groups):
        dataset = get_dataset(datasets_name[i])
        len_sample = context_length + prediction_length
        it = iter(dataset.train)
        for j in range(num_ts):
            train_entry = next(it)
            unsplit_ts = train_entry["target"]
            # unsplit_start = train_entry['start']
            unsplit_start = pd.Timestamp("1990-01-01")
            for ts_sample_start in range(0, len_per_ts - len_sample,
                                         prediction_length):
                if len_sample > len(unsplit_ts):
                    continue
                ts_slice = unsplit_ts[ts_sample_start:ts_sample_start +
                                      len_sample]
                if len(ts_slice) < len_sample:
                    continue
                nu = 1 + sum(ts_slice) / len_sample
                ts_slice = [i / nu for i in ts_slice]
                whole_data.append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                dataset_group[i].append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                # unsplit_start += pd.Timedelta(hours=prediction_length*hours_factor[i])
                unsplit_start += pd.Timedelta(hours=prediction_length)
    # for j in range(len(dataset_group)):
    #    print(len(dataset_group[i]))
    # import pdb;pdb.set_trace()
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
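A usage sketch spelling out the defaults; the six groups map one-to-one onto the listed M4 frequencies:

# Hypothetical call: one group per M4 frequency (hourly through yearly); each
# sample is mean-normalised and the results are pickled to
# synthetic_m4_freq_whole_data.csv and synthetic_m4_freq_group_data.csv.
get_m4_by_freq(context_length=72, prediction_length=24, len_per_ts=200,
               num_ts=50, num_groups=6, file_name="m4_freq")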
Example #7
def KMeans_inside_dataset(
    num_ts_=1,
    num_groups=16,
    context_length=72,
    prediction_length=24,
    file_name="default",
):
    dataset = get_dataset("traffic")
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    it = iter(dataset.train)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    num_ts = num_ts_
    len_sample = context_length + prediction_length
    index = 0
    feature = torch.Tensor([])
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]

        for ts_sample_start in range(0,
                                     len(target) - len_sample,
                                     prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            feature = torch.cat((
                feature,
                torch.Tensor([
                    ts_slice.mean(),
                    ts_slice.var(),
                    index % 7,
                    index // 90,
                ]),
            ))
            index += 1
    feature = feature.reshape(index, 4)
    feature = _get_pre_features(feature).contiguous()
    # print(feature)
    # import pdb;pdb.set_trace()
    cl, c = KMeans(feature, num_groups)
    it = iter(dataset.train)
    sample_id = 0
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0,
                                     len(target) - len_sample,
                                     prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            gid = cl[sample_id]
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            whole_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
            sample_id += 1
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
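Finally, a usage sketch; KMeans and _get_pre_features are helpers defined elsewhere in the same project, and the argument values are illustrative:

# Hypothetical call: clusters fixed-length windows of the traffic series into
# 16 groups via K-means on (mean, variance, weekday index, segment index) features.
KMeans_inside_dataset(num_ts_=100, num_groups=16, context_length=72,
                      prediction_length=24, file_name="traffic_kmeans")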