def get_group_data_by_duplicate(name, num_duplicates, num_groups):
    dataset = get_dataset(name)
    dataset_group = [[] for i in range(num_groups)]
    whole_data_list = []
    no_duplicate_whole_data_list = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        no_duplicate_whole_data_list.append({
            "target": train_entry["target"],
            "start": train_entry["start"],
        })
        for j in range(num_duplicates):
            dataset_group[i % num_groups].append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
            whole_data_list.append({
                "target": train_entry["target"],
                "start": train_entry["start"],
            })
    random.shuffle(whole_data_list)
    random.shuffle(no_duplicate_whole_data_list)
    ret.append(ListDataset(no_duplicate_whole_data_list, freq=dataset.metadata.freq))
    ret.append(ListDataset(whole_data_list, freq=dataset.metadata.freq))
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    return ret, dataset.metadata.freq
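# Minimal usage sketch for the helper above. The dataset name and counts are
# hypothetical examples; get_dataset / ListDataset come from GluonTS as elsewhere
# in this file.
# ret, freq = get_group_data_by_duplicate("electricity", num_duplicates=4, num_groups=8)
# no_dup_ds, full_ds, *group_ds = ret  # [deduplicated data, duplicated data, group_0, ..., group_{num_groups-1}]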
def get_temperature_data(context_length=24, prediction_length=4,
                         samples_per_ts=2000, num_groups=8):
    ts_file = pd.read_csv("temperature.csv")
    city_names = [
        "Vancouver", "Los Angeles", "Las Vegas", "San Diego",
        "Philadelphia", "Montreal", "Boston", "Haifa",
    ]
    datetime = ts_file["datetime"]
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    for gid in range(num_groups):
        ts = ts_file[city_names[gid]]
        num_samples = 0
        index = 0
        while True:
            num_samples += 1
            index += 1
            ts_slice = torch.tensor(
                ts[index:index + context_length + prediction_length].values)
            nu = 1 + sum(ts_slice) / len(ts_slice)
            ts_slice /= nu
            if torch.sum(torch.isnan(ts_slice)).item() == 0:
                dataset_group[gid].append({
                    "target": ts_slice,
                    "start": pd.Timestamp(datetime[index]),
                })
                whole_data.append({
                    "target": ts_slice,
                    "start": pd.Timestamp(datetime[index]),
                })
            if num_samples == samples_per_ts:
                break
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq="1H")
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq="1H"))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_temperature_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_temperature_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def get_synthetic_data_mlp(model_name=None, num_groups=8, mean_boundary=0.5,
                           num_duplicates=16):
    assert num_groups > 1
    prediction_length = 1
    context_length = 12
    device = "cpu"
    dataset_group = []
    whole_data_list = []
    start = pd.Timestamp("01-01-2019", freq="1H")
    for gid in range(num_groups):
        net = SimpleFeedForwardEstimator(
            freq="1H",
            prediction_length=prediction_length,
            context_length=context_length,
        ).create_training_network(device)
        for p in net.parameters():
            p.data = torch.normal(0, 0.1, size=p.data.shape)
        pattern_group = []
        # for j in range(num_duplicates):
        #     ts = torch.Uniform(0, 1, size=(1, context_length))
        while True:
            ts = torch.rand(size=(1, context_length))
            ts_slice = torch.Tensor(ts[0][-context_length:]).view(1, context_length)
            prediction = net.get_distr(ts_slice).sample((1000, ))
            prediction = sum(prediction) / len(prediction)
            if abs(torch.norm(prediction)) <= 1:
                break
        ts = torch.cat([ts, prediction], dim=1)
        ts = ts.view(len(ts[0]), )  # [context_length:]
        for j in range(num_duplicates):
            ts_sample = ts + torch.normal(0, 0.1, size=ts.shape)
            whole_data_list.append({"target": ts_sample, "start": start})
            pattern_group.append({"target": ts_sample, "start": start})
        dataset_group.append(ListDataset(pattern_group, freq="1H"))
    random.shuffle(whole_data_list)
    random.shuffle(dataset_group)
    dataset = ListDataset(whole_data_list, freq="1H")
    ret = []
    ret.append(dataset)
    ret.append(dataset)
    dataset_group = [dataset] + dataset_group
    dataset_group = [dataset] + dataset_group
    # save to files
    with open("synthetic_mlp_whole_data.csv", "wb") as output:
        pickle.dump(ret, output)
    with open("synthetic_mlp_group_data.csv", "wb") as output:
        pickle.dump(dataset_group, output)
    return True
def test_multivariate_grouper_train(univariate_ts, multivariate_ts,
                                    train_fill_rule) -> None:
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts, freq="1D", one_dim_target=False)
    grouper = MultivariateGrouper(train_fill_rule=train_fill_rule)
    assert (list(grouper(univariate_ds))[0]["target"]
            == list(multivariate_ds)[0]["target"]).all()
    assert (list(grouper(univariate_ds))[0]["start"]
            == list(multivariate_ds)[0]["start"])
def get_synthetic_data_sin(model_name=None, num_groups=32, mean_boundary=1,
                           num_duplicates=50):
    assert num_groups > 1
    num_time_steps = 100
    dataset_group = []
    whole_data_list = []
    no_duplicate_whole_data_list = []
    start = pd.Timestamp("01-01-2019", freq="1D")
    for gid in range(num_groups):
        mean = (gid + 1) * mean_boundary
        base = np.linspace(0, mean, num_time_steps)
        pattern_group = []
        ts = (gid + 1) * torch.sin(torch.FloatTensor(base)).view(1, num_time_steps)
        ts += torch.FloatTensor((gid + 1) * base).view(1, num_time_steps)
        no_duplicate_whole_data_list.append({
            "target": ts.view(len(ts[0]), ),
            "start": start,
        })
        for j in range(num_duplicates):
            ts += torch.normal(0, 0.1, size=ts.shape)
            whole_data_list.append({
                "target": ts.view(len(ts[0]), ),
                "start": start,
            })
            pattern_group.append({
                "target": ts.view(len(ts[0]), ),
                "start": start,
            })
        dataset_group.append(ListDataset(pattern_group, freq="1D"))
    random.shuffle(whole_data_list)
    random.shuffle(no_duplicate_whole_data_list)
    ret_whole_dataset = []
    dataset = ListDataset(whole_data_list, freq="1D")
    no_duplicate_dataset = ListDataset(no_duplicate_whole_data_list, freq="1D")
    ret_whole_dataset.append(dataset)
    ret_whole_dataset.append(dataset)
    dataset_group = [dataset] + dataset_group
    dataset_group = [dataset] + dataset_group
    # save to files
    with open("synthetic_complexsin_whole_data.csv", "wb") as output:
        pickle.dump(ret_whole_dataset, output)
    with open("synthetic_complexsin_group_data.csv", "wb") as output:
        pickle.dump(dataset_group, output)
    return True
def get_synthetic_data(model_name=None, num_groups=8, mean_boundary=1):
    assert num_groups > 1
    prediction_length = 1
    context_length = 5
    num_time_steps = 1
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = "cpu"
    net = SimpleFeedForwardEstimator(
        freq="1H",
        prediction_length=prediction_length,
        context_length=context_length,
    ).create_training_network(device)
    delta = 2 * mean_boundary / num_groups
    dataset_group = []
    whole_data_list = []
    start = pd.Timestamp("01-01-2019", freq="1H")
    for gid in range(num_groups):
        parameter_mean = -mean_boundary + gid * delta
        # change the parameters of the model
        for p in net.parameters():
            p.data = torch.normal(parameter_mean, 0.1, size=p.data.shape)
        ts = torch.normal(0, 0.1, size=(1, context_length))
        for num_ts in range(num_time_steps):
            ts_slice = torch.Tensor(ts[0][-context_length:]).view(1, context_length)
            prediction = net.get_distr(ts_slice).sample((5000, ))
            prediction = sum(prediction) / len(prediction)
            ts = torch.cat([ts, prediction], dim=1)
        whole_data_list.append({
            "target": ts.view(len(ts[0]), )[context_length:],
            "start": start,
        })
        dataset_group.append(
            ListDataset(
                [{
                    "target": ts.view(len(ts[0]), )[context_length:],
                    "start": start,
                }],
                freq="1H",
            ))
    dataset = ListDataset(whole_data_list, freq="1H")
    dataset_group = [dataset] + dataset_group
    # save to files
    with open("synthetic_whole_data.csv", "wb") as output:
        pickle.dump(dataset, output)
    with open("synthetic_group_data.csv", "wb") as output:
        pickle.dump(dataset_group, output)
    return True
def make_test_data():
    target = np.array(
        [0, 0, 0, 0, 10, 10, 20, 20, 30, 30, 40, 50, 59, 60, 60, 70, 80, 90, 100]
    ).tolist()
    np.random.shuffle(target)
    multi_dim_target = np.array([target, target]).transpose()
    past_is_pad = np.array([[0] * len(target)]).transpose()
    past_observed_target = np.array(
        [[1] * len(target), [1] * len(target)]
    ).transpose()
    ds = ListDataset(
        # Mimic output from InstanceSplitter
        data_iter=[
            {
                "start": "2012-01-01",
                "target": multi_dim_target,
                "past_target": multi_dim_target,
                "future_target": multi_dim_target,
                "past_is_pad": past_is_pad,
                f"past_{FieldName.OBSERVED_VALUES}": past_observed_target,
            }
        ],
        freq="1D",
        one_dim_target=False,
    )
    return ds
def get_group_data_by_var(name, num_groups, len_sample=9):
    dataset = get_dataset(name)
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    # Variance thresholds separating the groups.
    group_boundary = [1e3, 5e3, 1e4, 5e4, 1e5, 5e5]
    for i in range(num_ts):
        train_entry = next(it)
        unsplit_ts = train_entry["target"][0:800]
        unsplit_start = train_entry["start"]
        whole_data.append({"target": unsplit_ts, "start": unsplit_start})
        for ts_sample_start in range(len(unsplit_ts) - len_sample):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            # Assumed rule: place the slice into the variance bucket defined by
            # group_boundary (first threshold that exceeds the slice variance).
            var = torch.var(torch.FloatTensor(ts_slice)).item()
            group_id = 0
            while group_id < len(group_boundary) and var > group_boundary[group_id]:
                group_id += 1
            group_id = min(group_id, num_groups - 1)
            dataset_group[group_id].append({
                "target": ts_slice,
                "start": unsplit_start,
            })
            unsplit_start += pd.Timedelta(hours=1)
    random.shuffle(whole_data)
    print("append once")
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    print("append twice")
    ret.append(ListDataset(whole_data, freq=dataset.metadata.freq))
    print("append data")
    for group in dataset_group:
        random.shuffle(group)
        ret.append(ListDataset(group, freq=dataset.metadata.freq))
    print("write whole data")
    with open("synthetic_traffic_time_whole_data.csv", "wb") as output:
        pickle.dump(ret[0:2], output)
    print("write group data")
    with open("synthetic_traffic_time_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def GlounTS():
    from pts.dataset import ListDataset
    from pts.model.deepar import DeepAREstimator
    from pts import Trainer
    from pts.dataset import to_pandas
    # gluonts crashes on my system, so the PyTorchTS (pts) port is used instead.
    # from gluonts.dataset.common import ListDataset
    # from gluonts.model.deepar import DeepAREstimator
    # from gluonts.trainer import Trainer
    training_data = ListDataset(
        [{"start": df.index[0], "target": df.value[:"2015-03-08 23:22:53"]}],
        freq="5min")
    estimator = DeepAREstimator(freq="5min", input_size=43,
                                prediction_length=forecast_size,
                                trainer=Trainer(epochs=20))
    predictor = estimator.train(training_data=training_data)
    test_data = ListDataset(
        [{"start": df.index[0], "target": df.value[:"2015-03-08 23:22:53"]}],
        freq="5min")
    GluonTS_prediction = next(predictor.predict(test_data))
    GluonTS_mean_yhat = GluonTS_prediction.mean
    GluonTS_median_yhat = GluonTS_prediction.median
    return GluonTS_mean_yhat.tolist(), GluonTS_median_yhat.tolist(), GluonTS_prediction
def test_multivariate_grouper_test(univariate_ts, multivariate_ts,
                                   test_fill_rule, max_target_dim) -> None:
    univariate_ds = ListDataset(univariate_ts, freq="1D")
    multivariate_ds = ListDataset(multivariate_ts, freq="1D", one_dim_target=False)
    grouper = MultivariateGrouper(
        test_fill_rule=test_fill_rule,
        num_test_dates=2,
        max_target_dim=max_target_dim,
    )
    for grouped_data, multivariate_data in zip(grouper(univariate_ds),
                                               multivariate_ds):
        assert (grouped_data["target"] == multivariate_data["target"]).all()
        assert grouped_data["start"] == multivariate_data["start"]
def get_amazon_sales():
    f = open("./dataset/sgc_train.json", "r", encoding="utf-8")
    dataset_group = [[] for i in range(8)]
    whole_data = []
    ret = dict()
    X = []
    Y = []
    # split_grid = [0.04, 0.1, 0.25, 0.5, 1, 10, 100, 5000]
    for line in f.readlines():
        # gid = 0
        dic = json.loads(line)
        var = torch.var(torch.FloatTensor(dic["target"])).item()
        if var > 5000:
            continue
        nu = 1 + sum(dic["target"]) / len(dic["target"])
        ts = [i / nu for i in dic["target"]]
        start = dic["start"]
        # ts = dic['target']
        if len(ts) < 28:
            continue
        X.append((ts, var))
    X = sorted(X, key=lambda x: x[1])
    X = [x[0] for x in X]
    length = int(len(X) / 8)
    for gid in range(8):
        for j in range(gid * length, (gid + 1) * length):
            whole_data.append({"target": X[j], "start": start})
            dataset_group[gid].append({"target": X[j], "start": start})
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq="1H")
    group_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_list.append(ListDataset(group, freq="1H"))
    ret["group_data"] = group_list
    print("write whole data")
    with open("synthetic_sales_time_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_sales_time_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def get_whole_data(name):
    dataset = get_dataset(name)
    dataset_group = []
    it = iter(dataset.train)
    num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    for i in range(num_ts):
        train_entry = next(it)
        dataset_group.append({
            "target": train_entry["target"],
            "start": train_entry["start"],
        })
    return ListDataset(dataset_group, freq=dataset.metadata.freq)
def test_Transformation():
    train_length = 100
    ds = ListDataset(
        [{"start": "2012-01-01", "target": [0.2] * train_length}], freq="1D"
    )
    pred_length = 10
    t = transform.Chain(
        trans=[
            transform.AddTimeFeatures(
                start_field=FieldName.START,
                target_field=FieldName.TARGET,
                output_field="time_feat",
                time_features=[
                    time_feature.DayOfWeek(),
                    time_feature.DayOfMonth(),
                    time_feature.MonthOfYear(),
                ],
                pred_length=pred_length,
            ),
            transform.AddAgeFeature(
                target_field=FieldName.TARGET,
                output_field="age",
                pred_length=pred_length,
                log_scale=True,
            ),
            transform.AddObservedValuesIndicator(
                target_field=FieldName.TARGET, output_field="observed_values"
            ),
            transform.VstackFeatures(
                output_field="dynamic_feat",
                input_fields=["age", "time_feat"],
                drop_inputs=True,
            ),
            transform.InstanceSplitter(
                target_field=FieldName.TARGET,
                is_pad_field=FieldName.IS_PAD,
                start_field=FieldName.START,
                forecast_start_field=FieldName.FORECAST_START,
                train_sampler=transform.ExpectedNumInstanceSampler(
                    num_instances=4
                ),
                past_length=train_length,
                future_length=pred_length,
                time_series_fields=["dynamic_feat", "observed_values"],
            ),
        ]
    )
    for u in t(iter(ds), is_train=True):
        print(u)
def make_dataset(N, train_length):
    # generates 2 ** N - 1 timeseries with constant increasing values
    n = 2 ** N - 1
    targets = np.ones((n, train_length))
    for i in range(0, n):
        targets[i, :] = targets[i, :] * i
    ds = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": targets[i, :]}
                   for i in range(n)],
        freq="1D",
    )
    return ds
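# Sketch: the constant series from make_dataset can be stacked into one multivariate
# entry with MultivariateGrouper, mirroring the grouper tests above. The import path
# and default arguments are assumptions, not taken from this file.
# from gluonts.dataset.multivariate_grouper import MultivariateGrouper
# ds = make_dataset(3, train_length=20)
# multivariate_ds = list(MultivariateGrouper()(ds))  # one entry with a (7, 20) target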
def point_process_dataset():
    ia_times = np.array([0.2, 0.7, 0.2, 0.5, 0.3, 0.3, 0.2, 0.1])
    marks = np.array([0, 1, 2, 0, 1, 2, 2, 2])
    lds = ListDataset(
        [{
            "target": np.c_[ia_times, marks].T,
            "start": pd.Timestamp("2011-01-01 00:00:00", freq="H"),
            "end": pd.Timestamp("2011-01-01 03:00:00", freq="H"),
        }],
        freq="H",
        one_dim_target=False,
    )
    return lds
def test_target_dim_indicator():
    target = np.array([0, 2, 3, 10]).tolist()
    multi_dim_target = np.array([target, target, target, target])
    dataset = ListDataset(
        data_iter=[{"start": "2012-01-01", "target": multi_dim_target}],
        freq="1D",
        one_dim_target=False,
    )
    t = transform.Chain(
        trans=[
            transform.TargetDimIndicator(
                target_field=FieldName.TARGET, field_name="target_dimensions"
            )
        ]
    )
    for data_entry in t(dataset, is_train=True):
        assert (data_entry["target_dimensions"] == np.array([0, 1, 2, 3])).all()
def group_electricity_cv(
    num_ts=10,
    num_groups=14,
    context_length=72,
    prediction_length=12,
    file_name="default",
):
    dataset = get_dataset("electricity", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    date_checkpoint = [
        "2012-03-01", "2012-06-01", "2012-09-01", "2012-12-01",
        "2013-03-01", "2013-06-01", "2013-09-01", "2013-12-01",
        "2014-03-01",
    ]
    # get ready the training data
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": np.array([gid]),
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print(
        "Generating the electricity training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data, freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing of the electricity dataset")
    return True


# The block below mirrors the electricity pre-processing for the traffic dataset;
# it is wrapped as its own function here, and the function name and signature are
# assumed rather than taken from the original source.
def group_traffic_cv(
    num_ts=10,
    num_groups=14,
    context_length=72,
    prediction_length=12,
    file_name="default",
):
    dataset = get_dataset("traffic")
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["2016-01-01"]
    # get ready the training data
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        t = unsplit_start
        start_date = 4
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = ((start_date + 1) % 7) + sid * 7
            start_date += 1
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": t,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print("total number of training examples: ", len(train_full_data))
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    print("ratio for each group: ", ret["group_ratio"])
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data, freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def group_exchangerate_cv(
    num_ts=10,
    num_groups=14,
    context_length=15,
    prediction_length=10,
    file_name="default",
):
    dataset = get_dataset("exchange_rate", regenerate=True)
    len_sample = context_length + prediction_length
    dataset_group = [[] for i in range(num_groups)]
    train_full_data = []
    test_full_data = []
    ret = dict()
    train_it = iter(dataset.train)
    test_it = iter(dataset.test)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    date_checkpoint = ["1994-01-01", "1998-01-01", "2002-01-01"]
    for i in range(num_ts):
        train_entry = next(train_it)
        unsplit_ts = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample,
                                     prediction_length):
            for j, date_ckpt in enumerate(date_checkpoint):
                if unsplit_start < pd.Timestamp(date_ckpt):
                    sid = j
                    break
                elif unsplit_start > pd.Timestamp(date_checkpoint[-1]):
                    sid = len(date_checkpoint)
                    break
            gid = i * 4 + sid
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            train_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta("1D") * prediction_length
    # get ready the test data
    for i in range(int(num_ts * 0.2)):
        test_entry = next(test_it)
        unsplit_ts = test_entry["target"]
        unsplit_start = test_entry["start"]
        for ts_sample_start in range(0, len(unsplit_ts) - len_sample,
                                     prediction_length):
            ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
            test_full_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": test_entry["feat_static_cat"],
            })
    print(
        "Generating the exchange rate training data, the total number of training examples:",
        len(train_full_data),
    )
    ret["group_ratio"] = [len(i) / len(train_full_data) for i in dataset_group]
    random.shuffle(train_full_data)
    ret["whole_data"] = ListDataset(train_full_data, freq=dataset.metadata.freq)
    random.shuffle(test_full_data)
    ret["val_data"] = ListDataset(test_full_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/" + file_name + ".csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished pre-processing the exchange rate dataset")
    return True
def get_mixed_pattern(unit_length=16, num_duplicates=1000):
    freq = "1H"
    context_length = 3 * unit_length
    prediction_length = unit_length
    len_sample = context_length + prediction_length
    dataset_group = [[] for j in range(16)]
    whole_data = []
    val_data = []
    ret = dict()
    start = pd.Timestamp("01-01-2000", freq=freq)
    patterns = [
        ["sin", "linear", "quadratic", "sqrt"],
        ["sqrt", "quadratic", "linear", "sin"],
        ["linear", "sqrt", "sin", "quadratic"],
        ["quadratic", "sin", "sqrt", "linear"],
    ]
    pattern_number = 4
    for m, pattern in enumerate(patterns):
        for gid in range(pattern_number):
            for j in range(num_duplicates):
                context = torch.arange(context_length, dtype=torch.float)
                for i in range(1, pattern_number):
                    context[unit_length * (i - 1):unit_length * i] = _get_mixed_pattern(
                        context[unit_length * (i - 1):unit_length * i]
                        - unit_length * (i - 1),
                        pattern[(gid + i) % pattern_number],
                    )
                ts_sample = torch.cat([
                    context,
                    _get_mixed_pattern(
                        torch.arange(prediction_length, dtype=torch.float),
                        pattern[gid],
                    ),
                ])
                whole_data.append({"target": ts_sample, "start": start})
                if j % 5 == 0:
                    val_data.append({
                        "target": ts_sample + torch.normal(0, 1, ts_sample.shape),
                        "start": start,
                    })
                dataset_group[m * 4 + gid].append({
                    "target": ts_sample,
                    "start": start,
                })
    print(
        "Generating the synthetic training data, the total number of training examples:",
        len(whole_data),
    )
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    random.shuffle(whole_data)
    group_data = []
    ret["whole_data"] = ListDataset(whole_data, freq=freq)
    ret["val_data"] = ListDataset(val_data, freq=freq)
    for group in dataset_group:
        random.shuffle(group)
        group_data.append(ListDataset(group, freq=freq))
    ret["group_data"] = group_data
    # save to files
    os.makedirs("./dataset", exist_ok=True)
    with open("./dataset/synthetic.csv", "wb") as output:
        pickle.dump(ret, output)
    print("Finished the pre-processing of synthetic dataset")
    return True
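# The helper _get_mixed_pattern is not defined in this file. A plausible sketch,
# assuming it simply maps a time-index tensor to the named curve shape (this is an
# assumption, not the original implementation):
# def _get_mixed_pattern(x, pattern):
#     if pattern == "sin":
#         return torch.sin(x)
#     elif pattern == "linear":
#         return x
#     elif pattern == "quadratic":
#         return x ** 2
#     elif pattern == "sqrt":
#         return torch.sqrt(x)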
def test_multi_dim_transformation(is_train):
    train_length = 10
    first_dim: list = list(np.arange(1, 11, 1))
    first_dim[-1] = "NaN"
    second_dim: list = list(np.arange(11, 21, 1))
    second_dim[0] = "NaN"
    ds = ListDataset(
        data_iter=[{
            "start": "2012-01-01",
            "target": [first_dim, second_dim],
        }],
        freq="1D",
        one_dim_target=False,
    )
    pred_length = 2
    # Looks weird - but this is necessary to assert the nan entries correctly.
    first_dim[-1] = np.nan
    second_dim[0] = np.nan
    t = transform.Chain(trans=[
        transform.AddTimeFeatures(
            start_field=FieldName.START,
            target_field=FieldName.TARGET,
            output_field="time_feat",
            time_features=[
                time_feature.DayOfWeek(),
                time_feature.DayOfMonth(),
                time_feature.MonthOfYear(),
            ],
            pred_length=pred_length,
        ),
        transform.AddAgeFeature(
            target_field=FieldName.TARGET,
            output_field="age",
            pred_length=pred_length,
            log_scale=True,
        ),
        transform.AddObservedValuesIndicator(
            target_field=FieldName.TARGET,
            output_field="observed_values",
            convert_nans=False,
        ),
        transform.VstackFeatures(
            output_field="dynamic_feat",
            input_fields=["age", "time_feat"],
            drop_inputs=True,
        ),
        transform.InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
            past_length=train_length,
            future_length=pred_length,
            time_series_fields=["dynamic_feat", "observed_values"],
            time_first=False,
        ),
    ])
    if is_train:
        for u in t(iter(ds), is_train=True):
            assert_shape(u["past_target"], (2, 10))
            assert_shape(u["past_dynamic_feat"], (4, 10))
            assert_shape(u["past_observed_values"], (2, 10))
            assert_shape(u["future_target"], (2, 2))
            assert_padded_array(
                u["past_observed_values"],
                np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9]),
                u["past_is_pad"],
            )
            assert_padded_array(
                u["past_target"],
                np.array([first_dim, second_dim]),
                u["past_is_pad"],
            )
    else:
        for u in t(iter(ds), is_train=False):
            assert_shape(u["past_target"], (2, 10))
            assert_shape(u["past_dynamic_feat"], (4, 10))
            assert_shape(u["past_observed_values"], (2, 10))
            assert_shape(u["future_target"], (2, 0))
            assert_padded_array(
                u["past_observed_values"],
                np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9]),
                u["past_is_pad"],
            )
            assert_padded_array(
                u["past_target"],
                np.array([first_dim, second_dim]),
                u["past_is_pad"],
            )
import os

import pandas as pd
import matplotlib.pyplot as plt
import torch
# ListDataset / DeepAREstimator / Trainer follow the PyTorchTS (pts) imports used
# in GlounTS() above; this is an assumption about where this script gets them from.
from pts.dataset import ListDataset
from pts.model.deepar import DeepAREstimator
from pts import Trainer

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

url = "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv"
path = "/Users/seenli/Documents/workspace/code/pytorch_learn2/time_series_DL/Twitter_volume_AMZN.csv"
# df = pd.read_csv(url, header=0, index_col=0, parse_dates=True)
df = pd.read_csv(path, header=0, index_col=0, parse_dates=True)
df[:100].plot(linewidth=2)
plt.grid(which='both')
plt.show()

training_data = ListDataset(
    [{"start": df.index[0], "target": df.value[:"2015-04-05 00:00:00"]}],
    freq="5min")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
estimator = DeepAREstimator(freq="5min", prediction_length=12, input_size=43,
                            trainer=Trainer(epochs=20, device=device))
# print('.....')
predictor = estimator.train(training_data=training_data)
test_data = ListDataset(
def KMeans_m5_dataset(
    num_ts_=1,
    num_groups=16,
    context_length=72,
    prediction_length=24,
    file_name="default",
):
    df = pd.read_csv("sales_train_evaluation.csv")
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    num_ts = num_ts_
    len_sample = context_length + prediction_length
    # compute mean and variance
    df = df.iloc[:num_ts, :]
    df["mean"] = df.iloc[:, 1947 - len_sample:1947].mean(axis=1)
    df["var"] = df.iloc[:, 1947 - len_sample:1947].var(axis=1)
    df["mean"] = (df["mean"] - df["mean"].min()) / (df["mean"].max() - df["mean"].min())
    df["var"] = (df["var"] - df["var"].min()) / (df["var"].max() - df["var"].min())
    df_feature = df.iloc[:, 2:6]
    df_feature = pd.get_dummies(df_feature, dummy_na=True)
    df_feature = pd.concat([df_feature, df.iloc[:, -2]], axis=1)
    feature = torch.from_numpy(df_feature.to_numpy()).contiguous()
    cl, c = KMeans(feature, num_groups)
    # print(cl)
    # import pdb;pdb.set_trace()
    sample_id = 0
    for i in range(num_ts):
        ts_slice = df.iloc[i:i + 1, 1947 - len_sample:1947]
        ts_slice = torch.from_numpy(ts_slice.to_numpy())[0]
        # print(ts_slice)
        # import pdb;pdb.set_trace()
        gid = cl[sample_id]
        unsplit_start = pd.Timestamp("1990-01-01")
        dataset_group[gid].append({
            "target": ts_slice,
            "start": unsplit_start,
        })  # , 'feat_static_cat': train_entry['feat_static_cat']}
        whole_data.append({
            "target": ts_slice,
            "start": unsplit_start,
        })  # , 'feat_static_cat': train_entry['feat_static_cat']}
        sample_id += 1
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq="1H")
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq="1H"))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def get_synthetic_data_linear(
    context_length=24,
    prediction_length=8,
    num_groups=8,
    steps_per_ts=1,
    num_duplicates=16,
):
    assert num_groups > 1
    freq = "1H"
    len_sample = context_length + prediction_length
    dataset_group = []
    whole_data = []
    ret = dict()
    start = pd.Timestamp("01-01-2000", freq=freq)
    for gid in range(num_groups):
        model1 = torch.nn.Linear(context_length, prediction_length)
        model2 = torch.nn.Linear(context_length, prediction_length)
        # model1 = torch.sin
        # model2 = torch.cos
        pattern_group1 = []
        pattern_group2 = []
        sample_context = torch.rand(context_length)
        for t_step in range(2 * steps_per_ts):
            while True:
                with torch.no_grad():
                    if t_step <= steps_per_ts:
                        prediction = model1(sample_context)
                    else:
                        prediction = model2(sample_context)
                if (torch.norm(prediction) < prediction_length):  # and prediction_length*0.1 < torch.norm(prediction):
                    # prediction = torch.sin(prediction)
                    # prediction /= torch.max(prediction)
                    break
            # prediction *= 10
            # prediction += torch.normal(0, 0.1, size=prediction.shape)
            ts_sample = torch.cat([sample_context, prediction])
            # print(ts_sample)
            for j in range(num_duplicates):
                ts_sample += torch.normal(0, 0.1, size=ts_sample.shape)
                whole_data.append({"target": ts_sample, "start": start})
                if t_step <= steps_per_ts:
                    pattern_group1.append({"target": ts_sample, "start": start})
                else:
                    pattern_group2.append({"target": ts_sample, "start": start})
            sample_context = ts_sample[-context_length:]
            start += pd.Timedelta(hours=prediction_length)
        dataset_group.append(ListDataset(pattern_group1, freq=freq))
        """
        dataset_group.append(
            ListDataset(
                pattern_group2,
                freq=freq
            )
        )
        """
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=freq)
    ret["group_data"] = dataset_group
    # save to files
    with open("synthetic_linear_new_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    with open("synthetic_linear_new_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def KMeans_inside_dataset(
    num_ts_=1,
    num_groups=16,
    context_length=72,
    prediction_length=24,
    file_name="default",
):
    dataset = get_dataset("traffic")
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    it = iter(dataset.train)
    # num_ts = int(dataset.metadata.feat_static_cat[0].cardinality)
    num_ts = num_ts_
    len_sample = context_length + prediction_length
    index = 0
    feature = torch.Tensor([])
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]
        for ts_sample_start in range(0, len(target) - len_sample,
                                     prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            feature = torch.cat((
                feature,
                torch.Tensor([
                    ts_slice.mean(),
                    ts_slice.var(),
                    index % 7,
                    index // 90,
                ]),
            ))
            index += 1
    feature = feature.reshape(index, 4)
    feature = _get_pre_features(feature).contiguous()
    # print(feature)
    # import pdb;pdb.set_trace()
    cl, c = KMeans(feature, num_groups)
    it = iter(dataset.train)
    sample_id = 0
    for i in range(num_ts):
        train_entry = next(it)
        target = train_entry["target"]
        unsplit_start = train_entry["start"]
        for ts_sample_start in range(0, len(target) - len_sample,
                                     prediction_length):
            ts_slice = target[ts_sample_start:ts_sample_start + len_sample]
            gid = cl[sample_id]
            dataset_group[gid].append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            whole_data.append({
                "target": ts_slice,
                "start": unsplit_start,
                "feat_static_cat": train_entry["feat_static_cat"],
            })
            unsplit_start += pd.Timedelta(hours=prediction_length)
            sample_id += 1
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True
def get_m4_by_freq(
    context_length=72,
    prediction_length=24,
    len_per_ts=200,
    num_ts=50,
    num_groups=6,
    file_name="m4_freq",
):
    dataset_group = [[] for i in range(num_groups)]
    whole_data = []
    ret = dict()
    datasets_name = [
        "m4_hourly", "m4_daily", "m4_weekly",
        "m4_monthly", "m4_quarterly", "m4_yearly",
    ]
    hours_factor = [
        1, 24, 24 * 7, 24 * 7 * 30, 24 * 7 * 30 * 3, 24 * 7 * 30 * 3 * 4,
    ]
    for i in range(num_groups):
        dataset = get_dataset(datasets_name[i])
        len_sample = context_length + prediction_length
        it = iter(dataset.train)
        for j in range(num_ts):
            train_entry = next(it)
            unsplit_ts = train_entry["target"]
            # unsplit_start = train_entry['start']
            unsplit_start = pd.Timestamp("1990-01-01")
            for ts_sample_start in range(0, len_per_ts - len_sample,
                                         prediction_length):
                if len_sample > len(unsplit_ts):
                    continue
                ts_slice = unsplit_ts[ts_sample_start:ts_sample_start + len_sample]
                if len(ts_slice) < len_sample:
                    continue
                nu = 1 + sum(ts_slice) / len_sample
                ts_slice = [i / nu for i in ts_slice]
                whole_data.append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                dataset_group[i].append({
                    "target": ts_slice,
                    "start": unsplit_start,
                    "feat_static_cat": train_entry["feat_static_cat"],
                })
                # unsplit_start += pd.Timedelta(hours=prediction_length*hours_factor[i])
                unsplit_start += pd.Timedelta(hours=prediction_length)
    # for j in range(len(dataset_group)):
    #     print(len(dataset_group[i]))
    # import pdb;pdb.set_trace()
    print(len(whole_data))
    ret["group_ratio"] = [len(i) / len(whole_data) for i in dataset_group]
    print(ret["group_ratio"])
    random.shuffle(whole_data)
    ret["whole_data"] = ListDataset(whole_data, freq=dataset.metadata.freq)
    group_data_list = []
    for group in dataset_group:
        random.shuffle(group)
        group_data_list.append(ListDataset(group, freq=dataset.metadata.freq))
    ret["group_data"] = group_data_list
    print("write whole data")
    with open("synthetic_" + file_name + "_whole_data.csv", "wb") as output:
        pickle.dump(ret["whole_data"], output)
    print("write group data")
    with open("synthetic_" + file_name + "_group_data.csv", "wb") as output:
        pickle.dump(ret, output)
    return True