예제 #1
0
def test_normalize_on():
    """With normalize=True the scaler must learn the per-column means and the
    saved batches must keep the strictly increasing order of the range data.
    """
    feature_df_list = reduce(add, [[pd.DataFrame({"time": pd.to_datetime(list(range(50)), unit="s"),
                                                  "A": range(1, 51),
                                                  "B": range(101, 151),
                                                  "y": np.ones(50)}),
                                    pd.DataFrame({"time": pd.to_datetime(list(range(50)), unit="s"),
                                                  "A": range(51, 101),
                                                  "B": range(151, 201),
                                                  "y": np.ones(50)})]
                                   for _ in range(5)], [])

    meta = StorageMeta()
    storage = BatchStorageMemory(meta)
    translate = Translate(features=["A", "B"], look_back=0, look_forward=0, n_seconds=1, normalize=True, verbose=True)
    batch_generator = Builder(storage,
                              translate,
                              batch_size=10,
                              pseudo_stratify=False)

    batch_generator.generate_and_save_batches(feature_df_list)

    # A spans 1..100 (mean ~50) and B spans 101..200 (mean ~150) across frames.
    tools.assert_almost_equal(translate.scaler.mean_[0], 50, delta=1)
    tools.assert_almost_equal(translate.scaler.mean_[1], 150, delta=1)

    for batch in storage._data.values():
        # BUG FIX: `np.diff(...).all()` only asserts the diffs are non-zero, so
        # a strictly *decreasing* batch would also pass. Assert the increase
        # explicitly (StandardScaler is order-preserving, so this still holds
        # for normalized features).
        assert (np.diff(batch["features"][:, 0, 0]) > 0).all()  # feature A
        assert (np.diff(batch["features"][:, 0, 1]) > 0).all()  # feature B
예제 #2
0
def test_normalize_off():
    """With normalize=False the raw range values must survive batching in
    strictly increasing order.
    """
    feature_df_list = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(160)), unit="s"),
            "A": range(160),
            "B": range(160),
            "y": np.ones(160)
        }) for _ in range(1)
    ]

    meta = StorageMeta()
    storage = BatchStorageMemory(meta)
    translate = Translate(features=["A", "B"],
                          look_back=0,
                          look_forward=0,
                          n_seconds=1,
                          normalize=False)
    batch_generator = Builder(storage,
                              translate,
                              batch_size=16,
                              pseudo_stratify=False)

    batch_generator.generate_and_save_batches(feature_df_list)

    for batch in storage._data.values():
        # BUG FIX: `np.diff(...).all()` only asserts non-zero diffs; it would
        # also pass for a decreasing sequence. Assert the increase explicitly —
        # the source data is range(160), so diffs must all be positive.
        assert (np.diff(batch["features"][:, 0, 0]) > 0).all()  # feature A
        assert (np.diff(batch["features"][:, 0, 1]) > 0).all()  # feature B
예제 #3
0
def test_builder_stratify():
    """Pseudo-stratified building with a 50/50 split should register five
    train batch ids and five validation batch ids in the storage meta.
    """
    features = sorted(["A", "B"])

    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(160)), unit="s"),
            "A": np.ones(160),
            "B": np.ones(160),
            "y": np.ones(160),
        })
        for _ in range(1)
    ]

    meta = StorageMeta(validation_split=0.5)
    storage = BatchStorageMemory(meta)
    translate = Translate(features=features,
                          look_back=0,
                          look_forward=0,
                          n_seconds=1)
    builder = Builder(storage,
                      translate,
                      batch_size=16,
                      stratify_nbatch_groupings=3,
                      pseudo_stratify=True)

    builder.generate_and_save_batches(frames)

    assert builder._stratify
    tools.eq_(len(meta.train.ids), 5)
    tools.eq_(len(meta.validation.ids), 5)
예제 #4
0
def test_builder_config():
    """A builder from the S3 factory keeps the requested batch size and
    defaults to non-stratified batching.
    """
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket="test_bucket")

    features = sorted(["A", "B"])

    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(32)), unit="s"),
            "A": np.ones(32),
            "B": np.ones(32),
            "y": np.ones(32),
        })
        for _ in range(1)
    ]

    builder = Builder.s3_builder_factory(conn.Bucket("test_bucket"),
                                         features,
                                         look_back=2,
                                         look_forward=2,
                                         batch_size=16,
                                         batch_seconds=1)
    builder.generate_and_save_batches(frames)

    tools.eq_(builder.batch_size, 16)
    assert not builder._stratify
예제 #5
0
def test_validation_gen_window_split():
    """Splitting stored batches of 8 with batch_split=8 must yield 32
    single-sample windows of shape (1, look_back + 1, n_features).
    """
    features = sorted(["A", "B"])

    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(70)), unit="s"),
            "A": np.ones(70),
            "B": np.ones(70),
            "y": np.ones(70),
        })
        for _ in range(1)
    ]

    builder = Builder.memory_builder_factory(features,
                                             look_back=6,
                                             look_forward=0,
                                             batch_size=8,
                                             batch_seconds=1,
                                             validation_split=0.5)
    builder.generate_and_save_batches(frames)
    val_gen = BatchGenerator(builder.storage,
                             is_validation=True,
                             batch_split=8)

    X, y = val_gen[0]
    tools.eq_(X.shape, (1, 7, 2))
    tools.eq_(y.shape, (1, ))
    # Iterate the whole generator; every item unpacks to an (X, y) pair.
    tools.eq_(len([(x, y) for x, y in val_gen]), 32)
    # Constant features — presumably normalized to all-zeros by the factory
    # defaults; the original test pins exactly that.
    assert np.array_equal(X, np.zeros(X.shape))
예제 #6
0
def test_generator():
    """A seeded training generator yields full batches shaped
    (batch_size, look_back + look_forward + 1, n_features).
    """
    features = sorted(["A", "B"])

    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(32)), unit="s"),
            "A": np.ones(32),
            "B": np.ones(32),
            "y": np.ones(32),
        })
        for _ in range(1)
    ]

    builder = Builder.memory_builder_factory(features,
                                             look_back=2,
                                             look_forward=2,
                                             batch_size=16,
                                             batch_seconds=1)

    builder.generate_and_save_batches(frames)
    train_gen = BatchGenerator(builder.storage,
                               is_validation=False,
                               seed=42)

    X, y = train_gen[0]
    tools.eq_(X.shape, (16, 5, 2))
    # Constant features — presumably normalized to all-zeros by the factory
    # defaults; the original test pins exactly that.
    assert np.array_equal(X, np.zeros(X.shape))
예제 #7
0
def test_save_and_load_meta():
    """Builder meta must round-trip: a second builder constructed with
    deliberately wrong settings should recover the saved configuration via
    load_meta().
    """
    feature_df_list = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(160)), unit="s"),
            "A": range(160),
            "B": range(160),
            "y": np.ones(160)
        }) for _ in range(1)
    ]

    meta = StorageMeta()
    storage = BatchStorageMemory(meta)
    translate = Translate(features=["A", "B"],
                          look_back=0,
                          look_forward=0,
                          n_seconds=1,
                          normalize=False)
    batch_generator = Builder(storage,
                              translate,
                              batch_size=16,
                              pseudo_stratify=False)

    batch_generator.generate_and_save_batches(feature_df_list)
    batch_generator.save_meta()

    # Deliberately wrong values (99 / True): load_meta() must restore the
    # originals. BUG FIX: the original rebound `translate` and then compared
    # its attributes to themselves, so every assertion was a tautology.
    translate_reload = Translate(features=["A", "B"],
                                 look_back=99,
                                 look_forward=99,
                                 n_seconds=99,
                                 normalize=True)
    batch_generator_reload = Builder(storage,
                                     translate_reload,
                                     batch_size=99,
                                     pseudo_stratify=False)
    batch_generator_reload.load_meta()

    tools.eq_(batch_generator.batch_size, batch_generator_reload.batch_size)
    tools.eq_(translate._features, translate_reload._features)
    tools.eq_(translate._look_forward, translate_reload._look_forward)
    tools.eq_(translate._look_back, translate_reload._look_back)
    tools.eq_(translate._n_seconds, translate_reload._n_seconds)
    tools.eq_(translate._normalize, translate_reload._normalize)
예제 #8
0
def test_builder_storage_meta_validation():
    """StorageMeta bookkeeping: with a 0.5 split, the 35-row frame yields
    exactly one train batch id and one validation batch id.
    """
    frames = [
        pd.DataFrame({
            "time": pd.to_datetime(list(range(35)), unit="s"),
            "A": np.ones(35),
            "B": np.ones(35),
            "y": np.ones(35),
        })
        for _ in range(1)
    ]

    meta = StorageMeta(validation_split=0.5)
    storage = BatchStorageMemory(meta)
    translate = Translate(features=sorted(["A", "B"]),
                          look_back=2,
                          look_forward=1,
                          n_seconds=1)
    builder = Builder(storage, translate, batch_size=16)

    builder.generate_and_save_batches(frames)

    tools.eq_(len(meta.train.ids), 1)
    tools.eq_(len(meta.validation.ids), 1)
예제 #9
0
def test_translate_config():
    """The factory must pass the feature list and window settings through to
    the builder's Translate unchanged.
    """
    features = sorted(["A", "B"])

    frames = [pd.DataFrame({"time": pd.to_datetime(list(range(32)), unit="s"),
                            "A": np.ones(32),
                            "B": np.ones(32),
                            "y": np.ones(32)})
              for _ in range(1)]

    builder = Builder.memory_builder_factory(features, look_back=3, look_forward=2, batch_size=16,
                                             batch_seconds=1)
    builder.generate_and_save_batches(frames)

    tools.eq_(builder.translate._features, list(features))
    tools.eq_(builder.translate.look_forward, 2)
    tools.eq_(builder.translate.look_back, 3)
    tools.eq_(builder.translate._n_seconds, 1)
예제 #10
0
    "look_forward": 30,  # sequence model / RNN timesteps looking forward (total window = look_back + look_forward + 1)
    "batch_size": 1024,  # size of training/val batches
    "stride": 2,
    "batch_seconds": timesteps_seconds,  # timestep size in seconds
    "validation_split": 0.5,  # train/test split
    "pseudo_stratify": True,  # stratify batches (done streaming so pseudo-stratification)
    "stratify_nbatch_groupings": 10,  # number of batches to look at for stratification ratios
    "n_workers": None,  # n_workers for ProcessPoolExecutor. None means ProcessPoolExecutor(n_workers=None) / default
    "seed": 42,  # random seed for repeatability
    "normalize": True,  # use StandardScaler to normalize features
    "session_norm_filter": session_filter,
    "verbose": True  # debug logs
}

# Create builder for saving to files; configuration comes from the
# `file_batch_config` dict defined above.
batch_generator = Builder.file_builder_factory(**file_batch_config)

# Generate batches from `dataset`, timing the end-to-end run.
start = time.perf_counter()
batch_generator.generate_and_save_batches(dataset)
logger.info(f"Total Duration: {time.perf_counter() - start}")

# Train and validation generators that can be passed to tf/keras fit_generator
train_generator = BatchGenerator(batch_generator.storage, is_validation=False)
val_generator = BatchGenerator(batch_generator.storage, is_validation=True)

# Consume in sample code for stats. NOTE(review): materializing every batch
# pulls them all into memory at once — fine for a demo, not for large runs.
train_batches = list(train_generator)
val_batches = list(val_generator)

logger.info(f"num training batches: {len(train_batches)}, num validation batches: {len(val_batches)}")
예제 #11
0
def test_no_dataset():
    """Generating batches from an empty dataset must be a no-op, not an error."""
    empty_builder = Builder.memory_builder_factory([], 0, 0, 1)
    empty_builder.generate_and_save_batches([])