def test_tsdataset_split(self):
    """Splitting a multi-id TSDataset honours the ratios and the look-back/horizon padding."""
    df = get_multi_id_ts_df()
    splits = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id",
                                   with_split=True,
                                   val_ratio=0.1,
                                   test_ratio=0.1,
                                   largest_look_back=5,
                                   largest_horizon=2)
    tsdata_train, tsdata_valid, tsdata_test = splits

    # Every split must keep both time-series ids.
    for split in (tsdata_train, tsdata_valid, tsdata_test):
        assert set(np.unique(split.to_pandas()["id"])) == {"00", "01"}

    # The fixture is expected to yield 50 rows per id (encoded in the
    # constants below).  Valid/test carry (look_back + horizon - 1) extra
    # rows so a full window can still be rolled at the split boundary.
    pad = 5 + 2 - 1
    assert len(tsdata_train.to_pandas()) == (50 * 0.8) * 2
    assert len(tsdata_valid.to_pandas()) == (50 * 0.1 + pad) * 2
    assert len(tsdata_test.to_pandas()) == (50 * 0.1 + pad) * 2
def test_tsdataset_rolling_feature_multiple(self):
    # Verifies gen_rolling_feature on a multi-id dataset, both single-process
    # and with n_jobs=2, then checks that rolling the feature-augmented data
    # produces the expected array shapes.
    df = get_multi_id_ts_df()
    horizon = random.randint(2, 10)
    lookback = random.randint(2, 20)
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    tsdata.gen_rolling_feature(settings="minimal", window_size=lookback)
    tsdata._check_basic_invariants()

    # Rebuild from scratch and generate the same features in parallel.
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    tsdata.gen_rolling_feature(settings="minimal", window_size=lookback,
                               n_jobs=2)
    tsdata._check_basic_invariants()

    # roll train
    tsdata.roll(lookback=lookback, horizon=horizon)
    x, y = tsdata.to_numpy()
    feature_num = len(tsdata.feature_col) + len(tsdata.target_col)
    # assumes the fixture yields 50 rows per id — TODO confirm
    assert x.shape == ((50 - lookback - horizon + 1) * 2, lookback, feature_num)
    assert y.shape == ((50 - lookback - horizon + 1) * 2, horizon, 1)

    # id_sensitive=True stacks the two ids along the feature axis instead of
    # concatenating their samples.
    tsdata.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
    x, y = tsdata.to_numpy()
    assert x.shape == ((50 - lookback - horizon + 1), lookback, feature_num * 2)
    assert y.shape == ((50 - lookback - horizon + 1), horizon, 2)
    tsdata._check_basic_invariants()
def test_dt_sorted(self):
    """Strict invariant checking must reject a frame whose datetimes are unsorted."""
    unsorted_df = pd.DataFrame({
        "datetime": np.array(['20000101', '20000102', '20000102', '20000101']),
        "value": np.array([1.9, 2.3, 2.4, 2.6]),
        "id": np.array(['00', '01', '00', '01']),
    })
    tsdata = TSDataset.from_pandas(unsorted_df,
                                   target_col='value',
                                   dt_col='datetime')
    # Only strict_check surfaces the out-of-order datetimes.
    with pytest.raises(RuntimeError):
        tsdata._check_basic_invariants(strict_check=True)
def test_non_pd_datetime(self):
    """Datetime-dependent operations must fail when dt_col is not a pandas datetime."""
    df = get_non_dt()
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col="extra feature",
                                   id_col="id")
    # Each datetime-based transform should assert on the non-datetime column.
    failing_calls = [
        lambda: tsdata.resample('2D'),
        lambda: tsdata.gen_dt_feature(),
        lambda: tsdata.gen_rolling_feature(settings="minimal",
                                           window_size=1000),
    ]
    for call in failing_calls:
        with pytest.raises(AssertionError):
            call()
    tsdata._check_basic_invariants()
def test_tsdataset_initialization(self):
    """from_pandas accepts str/list column specs and rejects malformed arguments."""
    df = get_ts_df()

    def check_attrs(ts, expected_ids):
        # All legal constructions share these attribute values.
        assert ts._id_list == expected_ids
        assert ts.feature_col == ["extra feature"]
        assert ts.target_col == ["value"]
        assert ts.dt_col == "datetime"
        assert ts._is_pd_datetime

    # legal input: str and list column specs are interchangeable
    check_attrs(TSDataset.from_pandas(df,
                                      dt_col="datetime",
                                      target_col="value",
                                      extra_feature_col=["extra feature"],
                                      id_col="id"), ['00'])
    check_attrs(TSDataset.from_pandas(df,
                                      dt_col="datetime",
                                      target_col=["value"],
                                      extra_feature_col="extra feature",
                                      id_col="id"), ['00'])
    # without an id column a default id '0' is assigned
    check_attrs(TSDataset.from_pandas(df.drop(columns=["id"]),
                                      dt_col="datetime",
                                      target_col=["value"],
                                      extra_feature_col="extra feature"), ['0'])

    # illegal input: non-str column names, missing columns, non-DataFrame data
    illegal_cases = [
        (df, dict(dt_col="datetime", target_col=["value"],
                  extra_feature_col="extra feature", id_col=0)),
        (df, dict(dt_col=0, target_col=["value"],
                  extra_feature_col="extra feature", id_col="id")),
        (df, dict(dt_col="datetime", target_col=0,
                  extra_feature_col="extra feature", id_col="id")),
        (0, dict(dt_col="datetime", target_col=["value"],
                 extra_feature_col="extra feature", id_col="id")),
        (df, dict(dt_col="datetime", target_col=["value1"],
                  extra_feature_col="extra feature", id_col="id")),
    ]
    for data, kwargs in illegal_cases:
        with pytest.raises(AssertionError):
            TSDataset.from_pandas(data, **kwargs)
def test_tsdataset_roll_single_id(self):
    # Rolling a single-id dataset: to_numpy before roll must raise, and the
    # rolled shapes follow (n - lookback - horizon + 1, window, feature_num).
    df = get_ts_df()
    horizon = random.randint(1, 10)
    lookback = random.randint(1, 20)
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    # roll() has not been called yet, so there is nothing to export.
    with pytest.raises(RuntimeError):
        tsdata.to_numpy()

    # roll train
    tsdata.roll(lookback=lookback, horizon=horizon)
    x, y = tsdata.to_numpy()
    assert x.shape == (len(df) - lookback - horizon + 1, lookback, 2)
    assert y.shape == (len(df) - lookback - horizon + 1, horizon, 1)

    # explicitly selecting the same columns must not change the shapes
    tsdata.roll(lookback=lookback, horizon=horizon,
                feature_col=["extra feature"], target_col="value")
    x, y = tsdata.to_numpy()
    assert x.shape == (len(df) - lookback - horizon + 1, lookback, 2)
    assert y.shape == (len(df) - lookback - horizon + 1, horizon, 1)

    # dropping all extra features leaves only the target in x
    tsdata.roll(lookback=lookback, horizon=horizon,
                feature_col=[], target_col="value")
    x, y = tsdata.to_numpy()
    assert x.shape == (len(df) - lookback - horizon + 1, lookback, 1)
    assert y.shape == (len(df) - lookback - horizon + 1, horizon, 1)

    # roll test: horizon=0 is inference mode, so y is absent
    horizon = 0
    lookback = random.randint(1, 20)
    tsdata.roll(lookback=lookback, horizon=horizon)
    x, y = tsdata.to_numpy()
    assert x.shape == (len(df) - lookback - horizon + 1, lookback, 2)
    assert y is None
    tsdata._check_basic_invariants()
def test_check_scale_sequence(self):
    """scale(fit=False) on a never-fitted scaler must raise for every split.

    Fix: the original wrapped the whole ``for`` loop in a single
    ``pytest.raises`` block, so the first raise (on ``td_train``) exited the
    loop — ``td_valid``/``td_test`` were never exercised and
    ``_check_basic_invariants()`` never ran. The context manager now sits
    inside the loop so all three splits are checked.
    """
    df = get_multi_id_ts_df()
    # with split is True.
    td_train, td_valid, td_test = TSDataset.from_pandas(
        df,
        dt_col="datetime",
        target_col="value",
        extra_feature_col=["extra feature"],
        id_col="id",
        with_split=True,
        val_ratio=0.1,
        test_ratio=0.1)
    from sklearn.preprocessing import StandardScaler
    stand = StandardScaler()
    for tsdata in [td_train, td_valid, td_test]:
        # transforming before any fit has happened is an ordering error
        with pytest.raises(AssertionError):
            tsdata.scale(stand, fit=False)
        tsdata._check_basic_invariants()
def test_select_feature(self):
    # End-to-end AutoTSEstimator run with selected_features="auto" on a
    # single-feature, single-id dataset; only checks that the searched best
    # config echoes back the fixed past_seq_len.
    sample_num = np.random.randint(100, 200)
    df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=sample_num),
        "value": np.random.randn(sample_num),
        "id": np.array(['00'] * sample_num)
    })
    train_ts, val_ts, _ = TSDataset.from_pandas(df,
                                                target_col=['value'],
                                                dt_col='datetime',
                                                id_col='id',
                                                with_split=True,
                                                val_ratio=0.1)

    # Hyper-parameter search space for the LSTM trial runs.
    search_space = {
        'hidden_dim': hp.grid_search([32, 64]),
        'layer_num': hp.randint(1, 3),
        'lr': hp.choice([0.001, 0.003, 0.01]),
        'dropout': hp.uniform(0.1, 0.2)
    }

    input_feature_dim, output_feature_dim = 1, 1
    auto_estimator = AutoTSEstimator(model='lstm',
                                     search_space=search_space,
                                     past_seq_len=6,
                                     future_seq_len=1,
                                     input_feature_num=input_feature_dim,
                                     output_target_num=output_feature_dim,
                                     selected_features="auto",
                                     metric="mse",
                                     loss=torch.nn.MSELoss(),
                                     cpus_per_trial=2,
                                     name="auto_trainer")
    # One epoch / one sample keeps the search fast; this is a smoke test.
    auto_estimator.fit(data=train_ts,
                       epochs=1,
                       batch_size=hp.choice([32, 64]),
                       validation_data=val_ts,
                       n_sampling=1)
    # past_seq_len was fixed at construction, so the best config must keep it
    config = auto_estimator.get_best_config()
    assert config['past_seq_len'] == 6
def test_tsdataset_roll_multi_id(self):
    """Rolling two ids: id_sensitive widens features, default mode stacks samples."""
    df = get_multi_id_ts_df()
    horizon = random.randint(1, 10)
    lookback = random.randint(1, 20)
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    # windows per id, assuming the fixture's 50 rows per id
    windows_per_id = 50 - lookback - horizon + 1

    # id_sensitive=True aligns both ids along the feature axis
    tsdata.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
    x, y = tsdata.to_numpy()
    assert x.shape == (windows_per_id, lookback, 4)
    assert y.shape == (windows_per_id, horizon, 2)

    # default mode concatenates the samples of both ids instead
    tsdata.roll(lookback=lookback, horizon=horizon)
    x, y = tsdata.to_numpy()
    assert x.shape == (windows_per_id * 2, lookback, 2)
    assert y.shape == (windows_per_id * 2, horizon, 1)
    tsdata._check_basic_invariants()
def test_tsdataset_to_torch_loader_roll_fixed_batch(self):
    """to_torch_data_loader(roll=True) yields batches of the rolled windows.

    Fix: renamed from ``test_tsdataset_to_torch_loader_roll`` — a second
    method with that exact name appears later in this class, which shadowed
    this definition so pytest never collected or ran it.

    NOTE(review): the full-batch shape asserts assume the rolled sample
    count is at least ``batch_size`` (32) for every fixture — verify the
    fixture sizes against the random lookback/horizon upper bounds.
    """
    df_single_id = get_ts_df()
    df_multi_id = get_multi_id_ts_df()
    for df in [df_single_id, df_multi_id]:
        horizon = random.randint(1, 10)
        lookback = random.randint(1, 20)
        batch_size = 32
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id")
        # train
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=horizon)
        for x_batch, y_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 2)
            assert tuple(y_batch.size()) == (batch_size, horizon, 1)
            break
        # test: horizon=0 is inference mode, the loader yields only x
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=0)
        for x_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 2)
            break
        # specify feature_col: dropping all extras leaves only the target
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=horizon,
                                                   feature_col=[])
        for x_batch, y_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 1)
            assert tuple(y_batch.size()) == (batch_size, horizon, 1)
            break
def assert_equal_with_tsdataset(
        df,
        horizon,
        lookback,
        feature_num=1,
):
    # Cross-check that RollDataset produces exactly the same windows as
    # TSDataset.roll for the given df/lookback/horizon.
    #   feature_num: 0 -> no extra feature column, otherwise use "extra feature".
    # get results rolled by tsdata.roll
    extra_feature_col = None if feature_num == 0 else ["extra feature"]
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=extra_feature_col,
                                   id_col="id")
    tsdata.roll(lookback=lookback, horizon=horizon)
    x, y = tsdata.to_numpy()

    # get results rolled by RollDataset (reusing the columns the TSDataset
    # resolved, so both paths see identical feature/target selections)
    roll_dataset = RollDataset(df=df,
                               lookback=lookback,
                               horizon=horizon,
                               feature_col=tsdata.feature_col,
                               target_col=tsdata.target_col,
                               id_col=tsdata.id_col)
    assert len(roll_dataset) == len(x)
    # element-wise comparison; RollDataset yields torch tensors
    for i in range(len(x)):
        if horizon != 0:
            # for train and y is not None.
            xi, yi = x[i], y[i]
            roll_dataset_xi, roll_dataset_yi = roll_dataset[i]
            np.testing.assert_array_almost_equal(
                xi, roll_dataset_xi.detach().numpy())
            np.testing.assert_array_almost_equal(
                yi, roll_dataset_yi.detach().numpy())
        else:
            # for test, y is None.
            xi = x[i]
            roll_dataset_xi = roll_dataset[i]
            np.testing.assert_array_almost_equal(
                xi, roll_dataset_xi.detach().numpy())
def test_tsdataset_datetime_feature(self):
    """gen_dt_feature adds the standard calendar features as feature columns."""
    df = get_multi_id_ts_df()
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    tsdata.gen_dt_feature()

    generated = {
        'IS_AWAKE(datetime)', 'IS_BUSY_HOURS(datetime)', 'HOUR(datetime)',
        'DAY(datetime)', 'IS_WEEKEND(datetime)', 'WEEKDAY(datetime)',
        'MONTH(datetime)', 'DAYOFYEAR(datetime)', 'WEEKOFYEAR(datetime)',
        'MINUTE(datetime)',
    }
    # the frame gains the generated columns alongside the original ones
    assert set(tsdata.to_pandas().columns) == \
        generated | {'extra feature', 'value', 'datetime', 'id'}
    # and every generated column is registered as a feature
    assert set(tsdata.feature_col) == generated | {'extra feature'}
def test_tsdataset_roll_order(self):
    """Rolled arrays keep the (target, features) column order and the id order."""
    df = pd.DataFrame({
        "datetime": np.array(['1/1/2019', '1/1/2019', '1/2/2019', '1/2/2019']),
        "value": np.array([1.9, 2.3, 2.4, 2.6]),
        "id": np.array(['00', '01', '00', '01']),
        "extra feature1": np.array([1, 0, 3, 0]),
        "extra feature2": np.array([2, 9, 4, 2]),
    })
    tsdata = TSDataset.from_pandas(
        df,
        dt_col="datetime",
        target_col="value",
        extra_feature_col=["extra feature1", "extra feature2"],
        id_col="id")

    # default mode: one sample per id, target first then the two features
    x, y = tsdata.roll(lookback=1, horizon=1, id_sensitive=False).to_numpy()
    assert x.shape == (2, 1, 3) and y.shape == (2, 1, 1)
    assert np.array_equal(x, np.array([[[1.9, 1, 2]], [[2.3, 0, 9]]]))
    assert np.array_equal(y, np.array([[[2.4]], [[2.6]]]))

    # id_sensitive: targets of both ids first, then features grouped by id
    x, y = tsdata.roll(lookback=1, horizon=1, id_sensitive=True).to_numpy()
    assert x.shape == (1, 1, 6) and y.shape == (1, 1, 2)
    assert np.array_equal(x, np.array([[[1.9, 2.3, 1, 2, 0, 9]]]))
    assert np.array_equal(y, np.array([[[2.4, 2.6]]]))
def test_tsdataset_to_torch_loader(self):
    """to_torch_data_loader on an already-rolled dataset yields (x, y) batches.

    Fix: ``batch_size`` is drawn from [16, 32] while the rolled dataset has
    only ``len(df) - lookback - horizon + 1`` samples, so the first batch may
    legitimately be smaller than ``batch_size``. The expected batch dimension
    is now capped at the sample count instead of asserting a full batch,
    removing the flakiness for random lookback/horizon draws.
    """
    df = get_ts_df()
    horizon = random.randint(1, 10)
    lookback = random.randint(1, 20)
    batch_size = random.randint(16, 32)
    tsdata = TSDataset.from_pandas(df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    # not rolled yet -> nothing to load
    with pytest.raises(RuntimeError):
        tsdata.to_torch_data_loader()

    tsdata.roll(lookback=lookback, horizon=horizon)
    loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                         lookback=lookback,
                                         horizon=horizon)
    sample_num = len(df) - lookback - horizon + 1
    expected_batch = min(batch_size, sample_num)
    for x_batch, y_batch in loader:
        assert tuple(x_batch.size()) == (expected_batch, lookback, 2)
        assert tuple(y_batch.size()) == (expected_batch, horizon, 1)
        break
def test_tsdataset_to_torch_loader_roll(self):
    # Exercises to_torch_data_loader(roll=True) across single- and multi-id
    # frames: train/test loaders, feature selection, an invalid target_col,
    # a horizon list and multiple target columns.
    # NOTE(review): batch_size is random in [16, 32] while the rolled sample
    # count can drop below 32 for large random lookback/horizon draws, so the
    # full-batch shape asserts below may be flaky — verify the fixture sizes.
    df_single_id = get_ts_df()
    df_multi_id = get_multi_id_ts_df()
    for df in [df_single_id, df_multi_id]:
        horizon = random.randint(1, 10)
        lookback = random.randint(1, 20)
        batch_size = random.randint(16, 32)
        tsdata = TSDataset.from_pandas(df,
                                       dt_col="datetime",
                                       target_col="value",
                                       extra_feature_col=["extra feature"],
                                       id_col="id")
        # train
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=horizon)
        for x_batch, y_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 2)
            assert tuple(y_batch.size()) == (batch_size, horizon, 1)
            break
        # test: horizon=0 is inference mode, the loader yields only x
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=0)
        for x_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 2)
            break
        # specify feature_col: dropping all extras leaves only the target
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=horizon,
                                                   feature_col=[])
        for x_batch, y_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 1)
            assert tuple(y_batch.size()) == (batch_size, horizon, 1)
            break
        # Non-subset relationship: target_col must be a subset of the
        # originally declared targets
        with pytest.raises(ValueError):
            tsdata.to_torch_data_loader(
                batch_size=batch_size,
                roll=True,
                lookback=lookback,
                horizon=horizon,
                target_col=['value', 'extra feature'])
        # specify horizon_list: y's time axis follows the list length
        horizon_list = [1, 3, 5]
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=horizon_list)
        for x_batch, y_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 2)
            assert tuple(y_batch.size()) == (batch_size, len(horizon_list), 1)
            break
        # multi target_col: both columns appear in x and y
        tsdata = TSDataset.from_pandas(
            df,
            dt_col="datetime",
            target_col=["value", "extra feature"],
            id_col="id")
        torch_loader = tsdata.to_torch_data_loader(batch_size=batch_size,
                                                   roll=True,
                                                   lookback=lookback,
                                                   horizon=horizon)
        for x_batch, y_batch in torch_loader:
            assert tuple(x_batch.size()) == (batch_size, lookback, 2)
            assert tuple(y_batch.size()) == (batch_size, horizon, 2)
            break
def test_tsdataset_global_feature_multiple(self):
    """gen_global_feature(settings='minimal') keeps the dataset invariants intact."""
    multi_id_df = get_multi_id_ts_df()
    tsdata = TSDataset.from_pandas(multi_id_df,
                                   dt_col="datetime",
                                   target_col="value",
                                   extra_feature_col=["extra feature"],
                                   id_col="id")
    tsdata.gen_global_feature(settings="minimal")
    tsdata._check_basic_invariants()