Example #1
0
def test_ngram_length_1_tf(synthetic_dataset):
    """Check that an NGram of length 1 (a single timestep) works through the TF tensor API."""
    data_by_index = synthetic_dataset.data
    ngram = NGram(fields={0: [TestSchema.id, TestSchema.id2]},
                  delta_threshold=0.012,
                  timestamp_field=TestSchema.id)
    reader = Reader(synthetic_dataset.url,
                    schema_fields=ngram,
                    shuffle_options=ShuffleOptions(True, 5),
                    reader_pool=DummyPool())
    with tf.Session() as sess:
        for _ in range(10):
            produced = sess.run(tf_tensors(reader))
            # Reconstruct the reference ngram from the id of its earliest step.
            reference = _get_named_tuple_from_ngram(
                ngram, data_by_index, produced[min(produced.keys())].id)
            _assert_equal_ngram(produced, reference)

    reader.stop()
    reader.join()
Example #2
0
def _test_continuous_ngram(ngram_fields, dataset_num_files_1):
    """Verify that ngrams come back consecutively when shuffling is disabled
    and the dataset consists of a single partition."""
    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_num_files_1.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:
        # Each successive ngram must start exactly one id after the previous one.
        for start_id in range(ngram.length):
            produced = next(reader)
            reference = _get_named_tuple_from_ngram(
                ngram, dataset_num_files_1.data, start_id)
            np.testing.assert_equal(produced, reference)
Example #3
0
def test_ngram_validation():
    """Exercise NGram constructor validation on both invalid and valid arguments."""

    fields = {
        0: [
            TestSchema.id, TestSchema.id2, TestSchema.image_png,
            TestSchema.matrix
        ],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }

    invalid_constructions = [
        # delta threshold must be an int
        dict(fields=fields, delta_threshold='abc', timestamp_field=TestSchema.id),
        # timestamp_field must be a field
        dict(fields=fields, delta_threshold=5, timestamp_field=5),
        # Fields must be a dict
        dict(fields=[], delta_threshold=5, timestamp_field=TestSchema.id),
        # Each value in fields must be an array
        dict(fields={0: 'test'}, delta_threshold=5, timestamp_field=TestSchema.id),
        # timestamp_overlap must be bool
        dict(fields=fields, delta_threshold=0.5, timestamp_field=TestSchema.id,
             timestamp_overlap=2),
    ]
    for bad_kwargs in invalid_constructions:
        with pytest.raises(ValueError):
            NGram(**bad_kwargs)

    # Check some positive cases: these must construct without raising.
    NGram(fields=fields, delta_threshold=0.5, timestamp_field=TestSchema.id)
    NGram(fields=fields,
          delta_threshold=Decimal('0.5'),
          timestamp_field=TestSchema.id)
Example #4
0
def test_ngram_shuffle_drop_ratio(synthetic_dataset, reader_factory):
    """Shuffling with a row-drop partition count must reorder the produced
    ngrams while keeping their total count unchanged."""
    fields = {
        -2: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -1: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        0: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        2: [TestSchema.id, TestSchema.id2]
    }
    ngram = NGram(fields=fields, delta_threshold=10, timestamp_field=TestSchema.id)

    def _leading_ids(**reader_kwargs):
        # Collect the id of the first positional step (key 0) of every ngram.
        with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                            **reader_kwargs) as reader:
            return [row[0].id for row in reader]

    ordered = _leading_ids(shuffle_row_groups=False)
    reordered = _leading_ids(shuffle_row_groups=True, shuffle_row_drop_partitions=6)
    assert len(ordered) == len(reordered)
    assert ordered != reordered
Example #5
0
def test_ngram_basic_longer_no_overlap(synthetic_dataset, reader_factory):
    """With timestamp_overlap=False, no timestamp may appear in more than one ngram."""
    fields = {
        -5: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -4: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        -3: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        -2: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        -1: [TestSchema.id, TestSchema.id2]
    }

    ngram = NGram(fields=fields, delta_threshold=10, timestamp_field=TestSchema.id,
                  timestamp_overlap=False)
    seen_timestamps = set()
    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_row_groups=False) as reader:
        for produced in reader:
            reference = _get_named_tuple_from_ngram(
                ngram, synthetic_dataset.data, produced[min(produced.keys())].id)
            np.testing.assert_equal(produced, reference)
            for step in produced.values():
                # Every timestamp (id) must be new -- overlap is disallowed.
                assert step.id not in seen_timestamps
                seen_timestamps.add(step.id)
Example #6
0
def _test_continuous_ngram_returns(ngram_fields, ts_field, dataset_num_files_1,
                                   reader_factory):
    """Verify consecutive ngrams are produced in order (shuffle off, single
    partition), parametrized by timestamp field and reader factory.
    Returns the constructed NGram object."""
    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=ts_field)
    with reader_factory(dataset_num_files_1.url,
                        schema_fields=ngram,
                        shuffle_row_groups=False) as reader:
        # Each ngram should start one id after the previous one, from 0 upward.
        for start_id in range(ngram.length):
            np.testing.assert_equal(
                next(reader),
                _get_named_tuple_from_ngram(ngram, dataset_num_files_1.data,
                                            start_id))

    return ngram
Example #7
0
def test_ngram_delta_threshold(dataset_0_3_8_10_11_20_23):
    """Check delta_threshold enforcement within an ngram and between consecutive
    ngrams: steps whose timestamp_field values differ by more than the threshold
    must not be grouped together."""

    fields = {
        0: [
            TestSchema.id, TestSchema.id2, TestSchema.image_png,
            TestSchema.matrix
        ],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields,
                  delta_threshold=4,
                  timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:
        # NGrams expected: (0, 3), (8, 10), (10, 11)
        for start_index in (0, 3, 5):
            produced = next(reader)
            reference = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, start_index)
            np.testing.assert_equal(produced, reference)

        # Only three valid ngrams exist; the reader must then be exhausted.
        with pytest.raises(StopIteration):
            next(reader)
Example #8
0
def test_ngram_delta_small_threshold():
    """A delta threshold smaller than every gap between ids must yield no ngrams."""

    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        # Ids are spaced 5 apart, so a threshold of 1 can never be satisfied.
        create_test_dataset(tmp_url, range(0, 99, 5))

        fields = {
            0: [
                TestSchema.id, TestSchema.id2, TestSchema.image_png,
                TestSchema.matrix
            ],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields,
                      delta_threshold=1,
                      timestamp_field=TestSchema.id)
        with Reader(schema_fields=ngram,
                    dataset_url=tmp_url,
                    reader_pool=ThreadPool(10)) as reader:
            with pytest.raises(StopIteration):
                next(reader)
Example #9
0
def test_ngram_shuffle_drop_ratio(synthetic_dataset):
    """Shuffling with a drop-partition count must reorder the ngrams without
    changing how many are produced."""
    fields = {
        -2: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -1: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        0: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        2: [TestSchema.id, TestSchema.id2]
    }
    ngram = NGram(fields=fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)

    def _leading_ids(shuffle_options):
        # Collect the id of the first positional step (key 0) of every ngram.
        with Reader(synthetic_dataset.url,
                    schema_fields=ngram,
                    shuffle_options=shuffle_options,
                    reader_pool=DummyPool()) as reader:
            return [row[0].id for row in reader]

    ordered = _leading_ids(ShuffleOptions(False))
    reordered = _leading_ids(ShuffleOptions(True, 6))
    assert len(ordered) == len(reordered)
    assert ordered != reordered
Example #10
0
def test_dataset_on_ngram_not_supported(synthetic_dataset, reader_factory):
    """make_petastorm_dataset does not support NGram readers and must raise."""
    ngram_fields = {0: list(_EXCLUDE_FIELDS), 1: [TestSchema.id]}
    ngram = NGram(fields=ngram_fields, delta_threshold=100,
                  timestamp_field=TestSchema.id)
    with reader_factory(synthetic_dataset.url, schema_fields=ngram) as reader:
        with pytest.raises(NotImplementedError):
            make_petastorm_dataset(reader)