def test_normalize_on():
    """With normalize=True the scaler is fitted to the pooled feature means and
    the ordering of values within each saved batch is preserved."""
    # 5 copies of two 50-row sessions; A covers 1..100 and B covers 101..200
    # across the pair, so the pooled means are ~50 and ~150 respectively.
    feature_df_list = reduce(
        add,
        [
            [
                pd.DataFrame({
                    "time": pd.to_datetime(list(range(50)), unit="s"),
                    "A": range(1, 51),
                    "B": range(101, 151),
                    "y": np.ones(50),
                }),
                pd.DataFrame({
                    "time": pd.to_datetime(list(range(50)), unit="s"),
                    "A": range(51, 101),
                    "B": range(151, 201),
                    "y": np.ones(50),
                }),
            ]
            for _ in range(5)
        ],
        [],
    )
    meta = StorageMeta()
    storage = BatchStorageMemory(meta)
    translate = Translate(features=["A", "B"], look_back=0, look_forward=0, n_seconds=1, normalize=True, verbose=True)
    batch_generator = Builder(storage, translate, batch_size=10, pseudo_stratify=False)
    batch_generator.generate_and_save_batches(feature_df_list)
    tools.assert_almost_equal(translate.scaler.mean_[0], 50, delta=1)
    tools.assert_almost_equal(translate.scaler.mean_[1], 150, delta=1)
    for batch in storage._data.values():
        # BUG FIX: np.diff(...).all() only asserts the diffs are non-zero, which
        # also passes for decreasing or shuffled-but-distinct values. Assert
        # strictly increasing, as the original comment intended (the source data
        # is built from range(), and StandardScaler's positive scale preserves
        # order).
        assert (np.diff(batch["features"][:, 0, 0]) > 0).all()  # feature A
        assert (np.diff(batch["features"][:, 0, 1]) > 0).all()  # feature B
def test_normalize_off():
    """With normalize=False the raw (range-derived) feature ordering must be
    preserved in every saved batch."""
    feature_df_list = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(160)), unit="s"),
            "A": range(160),
            "B": range(160),
            "y": np.ones(160),
        })
        for _ in range(1)
    ]
    meta = StorageMeta()
    storage = BatchStorageMemory(meta)
    translate = Translate(features=["A", "B"], look_back=0, look_forward=0, n_seconds=1, normalize=False)
    batch_generator = Builder(storage, translate, batch_size=16, pseudo_stratify=False)
    batch_generator.generate_and_save_batches(feature_df_list)
    for batch in storage._data.values():
        # BUG FIX: np.diff(...).all() only asserts non-zero diffs; assert a
        # strictly increasing sequence, as the data (built from range()) and the
        # original comment ("monotonically increasing") intend.
        assert (np.diff(batch["features"][:, 0, 0]) > 0).all()  # feature A
        assert (np.diff(batch["features"][:, 0, 1]) > 0).all()  # feature B
def test_builder_stratify():
    """A 50% validation split over 160 rows at batch_size=16 should produce
    five training batches and five validation batches when stratifying."""
    columns = sorted(["A", "B"])
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(160)), unit="s"),
            "A": np.ones(160),
            "B": np.ones(160),
            "y": np.ones(160),
        })
        for _ in range(1)
    ]
    meta = StorageMeta(validation_split=0.5)
    storage = BatchStorageMemory(meta)
    translate = Translate(features=columns, look_back=0, look_forward=0, n_seconds=1)
    builder = Builder(storage, translate, batch_size=16, stratify_nbatch_groupings=3, pseudo_stratify=True)
    builder.generate_and_save_batches(frames)

    assert builder._stratify
    tools.eq_(len(meta.train.ids), 5)
    tools.eq_(len(meta.validation.ids), 5)
def test_builder_config():
    """s3_builder_factory should wire batch_size through and leave
    pseudo-stratification disabled by default."""
    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="test_bucket")

    features = sorted(["A", "B"])
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(32)), unit="s"),
            "A": np.ones(32),
            "B": np.ones(32),
            "y": np.ones(32),
        })
        for _ in range(1)
    ]
    builder = Builder.s3_builder_factory(
        s3.Bucket("test_bucket"), features, look_back=2, look_forward=2, batch_size=16, batch_seconds=1
    )
    builder.generate_and_save_batches(frames)

    tools.eq_(builder.batch_size, 16)
    assert not builder._stratify
def test_validation_gen_window_split():
    """batch_split=8 on stored batches of 8 should yield one-sample validation
    windows of shape (1, look_back + 1, n_features)."""
    features = sorted(["A", "B"])
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(70)), unit="s"),
            "A": np.ones(70),
            "B": np.ones(70),
            "y": np.ones(70),
        })
        for _ in range(1)
    ]
    builder = Builder.memory_builder_factory(
        features, look_back=6, look_forward=0, batch_size=8, batch_seconds=1, validation_split=0.5
    )
    builder.generate_and_save_batches(frames)

    val_gen = BatchGenerator(builder.storage, is_validation=True, batch_split=8)
    X, y = val_gen[0]
    tools.eq_(X.shape, (1, 7, 2))
    tools.eq_(y.shape, (1,))
    # Iterating the generator yields every split window.
    tools.eq_(len([pair for pair in val_gen]), 32)
    # Normalized constant features come out as zeros.
    assert np.array_equal(X, np.zeros(X.shape))
def test_generator():
    """The training generator should emit batches shaped
    (batch_size, look_back + look_forward + 1, n_features)."""
    features = sorted(["A", "B"])
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(32)), unit="s"),
            "A": np.ones(32),
            "B": np.ones(32),
            "y": np.ones(32),
        })
        for _ in range(1)
    ]
    builder = Builder.memory_builder_factory(features, look_back=2, look_forward=2, batch_size=16, batch_seconds=1)
    builder.generate_and_save_batches(frames)

    train_gen = BatchGenerator(builder.storage, is_validation=False, seed=42)
    X, y = train_gen[0]
    tools.eq_(X.shape, (16, 5, 2))
    # Normalized constant features come out as zeros.
    assert np.array_equal(X, np.zeros(X.shape))
def test_save_and_load_meta():
    """Round-trip the builder/translate configuration through save_meta /
    load_meta and verify the reloaded settings match the originals."""
    feature_df_list = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(160)), unit="s"),
            "A": range(160),
            "B": range(160),
            "y": np.ones(160),
        })
        for _ in range(1)
    ]
    meta = StorageMeta()
    storage = BatchStorageMemory(meta)
    original_translate = Translate(features=["A", "B"], look_back=0, look_forward=0, n_seconds=1, normalize=False)
    batch_generator = Builder(storage, original_translate, batch_size=16, pseudo_stratify=False)
    batch_generator.generate_and_save_batches(feature_df_list)
    batch_generator.save_meta()

    # Second builder starts with deliberately different settings; load_meta
    # should restore the saved configuration.
    other_translate = Translate(features=["A", "B"], look_back=99, look_forward=99, n_seconds=99, normalize=True)
    batch_generator_reload = Builder(storage, other_translate, batch_size=99, pseudo_stratify=False)
    batch_generator_reload.load_meta()

    tools.eq_(batch_generator.batch_size, batch_generator_reload.batch_size)
    # BUG FIX: the original compared `translate` attributes against themselves
    # (the variable had been rebound), so these assertions were tautologies.
    # Compare the reloaded builder's translate against the saved originals.
    reloaded = batch_generator_reload.translate
    tools.eq_(reloaded._features, original_translate._features)
    tools.eq_(reloaded._look_forward, original_translate._look_forward)
    tools.eq_(reloaded._look_back, original_translate._look_back)
    tools.eq_(reloaded._n_seconds, original_translate._n_seconds)
    tools.eq_(reloaded._normalize, original_translate._normalize)
def test_builder_storage_meta_validation():
    """35 rows with a (look_back=2, look_forward=1) window and a 50% split
    should leave exactly one train batch and one validation batch."""
    columns = sorted(["A", "B"])
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(35)), unit="s"),
            "A": np.ones(35),
            "B": np.ones(35),
            "y": np.ones(35),
        })
        for _ in range(1)
    ]
    meta = StorageMeta(validation_split=0.5)
    storage = BatchStorageMemory(meta)
    translate = Translate(features=columns, look_back=2, look_forward=1, n_seconds=1)
    builder = Builder(storage, translate, batch_size=16)
    builder.generate_and_save_batches(frames)

    tools.eq_(len(meta.train.ids), 1)
    tools.eq_(len(meta.validation.ids), 1)
def test_translate_config():
    """memory_builder_factory should pass feature/window settings straight
    through to the Translate it constructs."""
    features = sorted(["A", "B"])
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(32)), unit="s"),
            "A": np.ones(32),
            "B": np.ones(32),
            "y": np.ones(32),
        })
        for _ in range(1)
    ]
    builder = Builder.memory_builder_factory(features, look_back=3, look_forward=2, batch_size=16, batch_seconds=1)
    builder.generate_and_save_batches(frames)

    translate = builder.translate
    tools.eq_(translate._features, list(features))
    tools.eq_(translate.look_forward, 2)
    tools.eq_(translate.look_back, 3)
    tools.eq_(translate._n_seconds, 1)
"look_forward": 30, # sequence model / RNN timesteps looking forward (total window = look_back + look_forward + 1) "batch_size": 1024, # size of training/val batches "stride": 2, "batch_seconds": timesteps_seconds, # timestep size in seconds "validation_split": 0.5, # train/test split "pseudo_stratify": True, # stratify batches (done streaming so pseudo-stratification) "stratify_nbatch_groupings": 10, # number of batches to look at for stratification ratios "n_workers": None, # n_workers for ProcessPoolExecutor. None means ProcessPoolExecutor(n_workers=None) / default "seed": 42, # random seed for repeatability "normalize": True, # use StandardScaler to normalize features "session_norm_filter": session_filter, "verbose": True # debug logs } # Create builder for saving to files batch_generator = Builder.file_builder_factory(**file_batch_config) # Generate batches start = time.perf_counter() batch_generator.generate_and_save_batches(dataset) logger.info(f"Total Duration: {time.perf_counter() - start}") # Train and validation generators that can be passed to tf/keras fit_generator train_generator = BatchGenerator(batch_generator.storage, is_validation=False) val_generator = BatchGenerator(batch_generator.storage, is_validation=True) # Consume in sample code for stats train_batches = list(train_generator) val_batches = list(val_generator) logger.info(f"num training batches: {len(train_batches)}, num validation batches: {len(val_batches)}")
def test_no_dataset():
    """Generating batches from an empty dataset should be a silent no-op,
    not an error."""
    empty_builder = Builder.memory_builder_factory([], 0, 0, 1)
    empty_builder.generate_and_save_batches([])