Example 1
def test_gcs_text():

    # set up
    set_up()

    w = BatchWriter(
        inner_writer=GoogleCloudStorageWriter,
        project="testing",
        blob_size=1024,
        format="jsonl",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
    )
    for i in range(250):
        w.append({"index": i + 300})
    w.finalize()

    # read the files we've just written; we should be able to
    # read over both partitions.
    r = Reader(
        inner_reader=GoogleCloudStorageReader,
        project="testing",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
        persistence=STORAGE_CLASS.MEMORY,
    )

    assert r.count() == 250, r
Example 2
def test_index():
    # start from a clean output folder
    shutil.rmtree("_temp/data/tweets", ignore_errors=True)

    r = Reader(inner_reader=DiskReader,
               dataset="tests/data/tweets",
               raw_path=True)
    w = BatchWriter(inner_writer=DiskWriter,
                    dataset="_temp/data/tweets",
                    index_on=["username"])
    for item in r:
        w.append(item)
    w.finalize()
    index = glob.glob("_temp/data/tweets/**/*username.idx", recursive=True)
    assert len(index) == 1, index

    with open(index[0], "rb") as f:
        idx = f.read()

    # test the recently created index outside the reader
    i = Index(io.BytesIO(idx))
    assert i.search("SwiftOnSecurity") == []
    assert i.search("BBCNews") == [1, 2, 4, 24, 25, 44], i.search("BBCNews")

    # test the filter with an index
    ri = Reader(
        inner_reader=DiskReader,
        dataset="_temp/data/tweets",
        filters="username = '******'",
    )
    ri = list(ri)

    assert len(ri) == 6
Example 3
def test_reader_filters_single_filter():
    """ensure the reader filter is working as expected"""
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username == 'NBCNews'",
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 44, r.count()
Example 4
def test_reader_filters_multiple_filter():
    """ensure the reader filter is working as expected"""
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username = '******' and timestamp >= '2020-01-12T07:11:04'",
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 34, r
Example 5
def test_can_read_parquet():
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/formats/parquet",
        raw_path=True,
        persistence=STORAGE_CLASS.MEMORY,
    )

    assert r.count() == 57581, r.count()
    assert isinstance(r.first(), dict)
Example 6
def test_reader_partitions_read_without_referring_to_partition():
    """
    test if we reference a folder with partitions (by_) without referencing the
    partition, we pick a partition and read it like it's not there
    """
    DATA_DATE = datetime.date(2020, 2, 3)
    records = Reader(
        dataset="tests/data/partitioned",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert records.count() == 25, records.count()
Example 7
def test_disk_text():

    try:

        w = BatchWriter(
            inner_writer=DiskWriter,
            blob_size=1024,
            format="jsonl",
            dataset=f"_temp/test/gcs/dataset/text",
        )
        for i in range(250):
            w.append({"index": i + 300})
        w.finalize()

        # read the files we've just written; we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=DiskReader,
            dataset=f"_temp/test/gcs/dataset/text",
        )
        l = list(r)

        assert len(l) == 250, len(l)
    except Exception as e:  # pragma: no cover
        raise e
Example 8
def test_gcs_parquet():

    try:
        # set up the stub
        set_up()

        w = BatchWriter(
            inner_writer=GoogleCloudStorageWriter,
            project="testing",
            format="parquet",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        for i in range(100):
            w.append({"$$": i * 300})
        w.finalize()

        # read the files we've just written; we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=GoogleCloudStorageReader,
            project="testing",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        l = list(r)
        assert isinstance(l[0], dict)
        assert len(l) == 100, len(l)
    except Exception as e:  # pragma: no cover
        raise e
Example 9
def test_reader_writer_format_parquet():
    do_writer_compressed("parquet")
    g = glob.glob("_temp/**/*.parquet", recursive=True)
    assert len(g) > 0, g
    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
Example 10
def test_reader_writer():

    do_writer()

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
Example 11
def test_reader_filters_no_filter():
    """ensure the reader filter is working as expected"""
    r = Reader(inner_reader=DiskReader,
               dataset="tests/data/tweets/",
               raw_path=True)
    for index, item in enumerate(r):
        pass
    assert index == 49, index
Example 12
def test_reader_writer_format_default():
    do_writer_default()
    g = glob.glob("_temp/**/*.zstd", recursive=True)
    assert len(g) > 0, g

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
Example 13
def test_cursor():
    """
    Test that when we break a read in two, we read the right amount of records.
    """

    import json

    test_counter = 0
    number_of_records = get_records()
    lim = number_of_records // 4 * 3
    hashes = []

    reader = Reader(inner_reader=DiskReader,
                    dataset="tests/data/tweets/",
                    partitions=[])

    for row in reader["tweet"].take(lim):
        hashes.append(hash(json.dumps(row)))
        test_counter += 1
    cursor = reader.cursor

    print(cursor)
    assert cursor["location"] == (
        (lim - 1) % 25), f"{cursor['location']}, {lim}, {(lim % 25)}"
    assert cursor["partition"] == 5122091051124077700, cursor["partition"]

    reader = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        partitions=[],
        cursor=cursor,
    )

    for i, row in enumerate(reader["tweet"].take(100)):
        hashes.append(hash(json.dumps(row)))
        test_counter += 1

    # we should have read the number of expected records
    assert (number_of_records == test_counter
            ), f"{number_of_records} - {test_counter}, {i}"
    # we shouldn't have captured any duplicates
    assert len(hashes) == len(
        set(hashes)), f"{len(hashes)} == {len(set(hashes))}"
Example 14
def time_it(dataset, username):
    start = time.perf_counter_ns()
    reader = Reader(
        inner_reader=DiskReader,
        dataset=dataset,
        raw_path=True,
        filters=("user_name", "==", username),
    )
    res = [r for r in reader]
    print(len(res))
    return (time.perf_counter_ns() - start) / 1e9
Example 15
def _inner(p, pt):
    # step back through time
    r = Reader(
        inner_reader=DiskReader,
        dataset=p,
        partitions=pt,
        start_date=datetime.date(2021, 1, 1),
        end_date=datetime.date(2021, 1, 1),
        freshness_limit="30d",
    )
    assert len(list(r)) == 50
Example 16
def _inner(p, pt):
    # step back through time
    r = Reader(
        inner_reader=DiskReader,
        dataset=p,
        partitions=pt,
        start_date=datetime.date(2021, 1, 1),
        end_date=datetime.date(2021, 1, 1),
        freshness_limit="5d",
    )
    with pytest.raises(DataNotFoundError):
        assert len(list(r)) == 0
Example 17
def test_ignore_flag():
    """
    test we ignore invalidated frames
    """
    DATA_DATE = datetime.date(2021, 3, 29)
    records = Reader(
        dataset="tests/data/framed",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
    )
    print(next(records))
    assert next(records).get("test") == 1
Example 18
def test_ignore_flag_step_back_days():
    """
    test that we step back a day if all of the frames have been invalidated
    """
    DATA_DATE = datetime.date(2021, 3, 30)
    records = Reader(
        dataset="tests/data/framed",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
        freshness_limit="24h",
    )
    print(next(records))
Example 19
def test_ignore_non_complete_frames():
    """
    test we ignore frames without the complete flag
    """
    DATA_DATE = datetime.date(2021, 3, 28)
    records = Reader(
        dataset="tests/data/framed",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
    )
    print(next(records))
    assert next(records).get("test") == 1
Example 20
def test_reader_writer_format_jsonl():

    do_writer_compressed("jsonl")

    g = glob.glob("_temp/**/*.jsonl", recursive=True)
    assert len(g) > 0, g

    c = glob.glob("_temp/**/*.complete", recursive=True)
    assert len(c) == 0, c

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
Example 21
def test_dataset_prefix_validator():

    with pytest.raises(AccessDenied):
        reader = Reader(dataset="dataset", valid_dataset_prefixes=["drive/"])

    with pytest.raises(AccessDenied):
        reader = Reader(dataset="dataset",
                        valid_dataset_prefixes=["list", "of", "items"])

    # no whitelist - allow all
    reader = Reader(project="", dataset="dataset")

    # a list of one
    reader = Reader(project="",
                    dataset="dataset",
                    valid_dataset_prefixes=["dataset"])

    # a list of many
    reader = Reader(
        project="",
        dataset="dataset",
        valid_dataset_prefixes=["on", "the", "list", "dataset"],
    )
Example 22
def test_reader_all_good():
    failed = False

    try:
        reader = Reader(
            project="",
            select="a, b",
            dataset="",
            start_date=datetime.datetime.now(),
            end_date=datetime.datetime.now(),
        )
    except InvalidReaderConfigError:
        failed = True

    assert not failed
Example 23
def test_cursor_as_text():

    offsets = [1, 6, 8, 13, 22]

    for offset in offsets:
        reader = Reader(
            inner_reader=DiskReader,
            dataset="tests/data/tweets/",
            partitions=[],
            cursor='{"partition": 5122091051124077700, "location": ' +
            str(offset) + ', "map":"80" }',
        )
        l = list(reader)
        # 24 because we count from zero (the first row in the file is the 0th record)
        assert len(
            l) + offset == 24, f"{len(l) + offset} == {24}, {reader.cursor}"
Example 24
def test_base():

    reader = Reader(inner_reader=DiskReader,
                    dataset="tests/data/tweets/",
                    partitions=[])

    # we read 50 records
    for i in range(50):
        next(reader)
        assert reader.cursor["location"] == (i % 25), reader.cursor

    # we can't read 51
    with pytest.raises(StopIteration):
        next(reader)

    # range 50 actually is 50
    assert len(range(50)) == 50
Example 25
def test_move_to_cursor():

    offsets = [1, 6, 8, 13, 22]

    for offset in offsets:
        reader = Reader(inner_reader=DiskReader,
                        dataset="tests/data/tweets/",
                        partitions=[])
        next(reader)
        steps = 1
        while reader.cursor["location"] < offset:
            steps += 1
            next(reader)

        assert offset == reader.cursor["location"]
        l = len(list(reader))
        # we stepped offset number of records and then read l more, this should be 50
        assert steps + l == 50
Example 26
def test_writer_backout():

    if Path(TEST_FOLDER).exists():  # pragma: no cover
        shutil.rmtree(TEST_FOLDER)

    w = StreamWriter(
        dataset=TEST_FOLDER,
        inner_writer=DiskWriter,
        schema=Schema(SCHEMA),
        idle_timeout_seconds=1,
    )

    for record in DATA_SET:
        w.append(record)

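    # wait past the idle timeout so the stream writer flushes its buffer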
    time.sleep(4)

    r = Reader(dataset=TEST_FOLDER, inner_reader=DiskReader)

    assert len(list(r)) == 8
Example 27
def get_data():
    r = Reader(inner_reader=DiskReader,
               dataset="tests/data/tweets",
               raw_path=True)
    return r
Example 28
def time_it(dataset, username):
    start = time.perf_counter_ns()
    reader = Reader(
        inner_reader=DiskReader,
        dataset=dataset,
        raw_path=True,
        filters=("user_name", "==", username),
    )
    res = [r for r in reader]
    print(len(res))
    return (time.perf_counter_ns() - start) / 1e9


import os, sys
import time

sys.path.insert(1, os.path.join(sys.path[0], "../.."))
from mabel.data import Reader
from mabel.adapters.disk import DiskReader
from mabel.index.index import Index  # assumed import path for Index
from mabel.logging import get_logger

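# level 100 is above CRITICAL, so this silences mabel's log output during the timings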
get_logger().setLevel(100)

user_name = "Verizon Support"

print("indexed\t:", time_it("tests/data/index/is", user_name))
print("not indexed\t:", time_it("tests/data/index/not", user_name))
print("indexed\t:", time_it("tests/data/index/is", user_name + "bb"))
print("not indexed\t:", time_it("tests/data/index/not", user_name + "bb"))


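# rebuild the user_name index for the indexed dataset and save it beside the data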
reader = Reader(
    inner_reader=DiskReader,
    dataset="tests/data/index/is",
    raw_path=True,
)
idx = Index.build_index(reader, "user_name")
idx.dump("tests/data/index/is/tweets.jsonl.user_name.idx")
Example 29
# reconstructed top: read_jsonl is assumed to be a generator that reads the
# file in fixed-size chunks and yields one line at a time
def read_jsonl(file_name, chunk_size=32 * 1024 * 1024, delimiter="\n"):
    with open(file_name, "r", encoding="utf8") as f:
        carry_forward = ""
        chunk = "placeholder"
        while len(chunk) > 0:
            chunk = f.read(chunk_size)
            augmented_chunk = carry_forward + chunk
            lines = augmented_chunk.split(delimiter)
            carry_forward = lines.pop()
            yield from lines
        if carry_forward:
            yield carry_forward


# imports for this fragment; the Schema import path is an assumption
from mabel.data import BatchWriter, Reader
from mabel.adapters.disk import DiskReader, DiskWriter
from mabel.data.validator import Schema

schema = Schema(schema_definition)  # schema_definition is not shown in this fragment
lines = read_jsonl("tests/data/index/not/tweets.jsonl")

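# write the records back out, building a user_name index during the batch write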
writer = BatchWriter(
    inner_writer=DiskWriter,
    dataset="_temp/idx",
    # schema=schema,
    indexes=["user_name"],
)

for record in lines:
    writer.append(record)
writer.finalize()

reader = Reader(inner_reader=DiskReader,
                dataset="_temp/idx",
                filters=("user_name", "==", "Remy"))
i = 0
for i, r in enumerate(reader):
    print(i, r)

print(i)
Example 30
def get_records():
    r = Reader(inner_reader=DiskReader,
               dataset="tests/data/tweets/",
               partitions=[])
    return len(list(r))