def test_gcs_text():
    # set up
    set_up()

    w = BatchWriter(
        inner_writer=GoogleCloudStorageWriter,
        project="testing",
        blob_size=1024,
        format="jsonl",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
    )
    for i in range(250):
        w.append({"index": i + 300})
    w.finalize()

    # read the files we've just written, we should be able to
    # read over both partitions.
    r = Reader(
        inner_reader=GoogleCloudStorageReader,
        project="testing",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 250, r
def test_index():
    # rebuild the dataset in _temp with an index on username
    shutil.rmtree("_temp/data/tweets", ignore_errors=True)
    r = Reader(inner_reader=DiskReader, dataset="tests/data/tweets", raw_path=True)
    w = BatchWriter(inner_writer=DiskWriter, dataset="_temp/data/tweets", index_on=["username"])
    for item in r:
        w.append(item)
    w.finalize()

    index = glob.glob("_temp/data/tweets/**/*username.idx", recursive=True)
    assert len(index) == 1, index

    with open(index[0], "rb") as f:
        idx = f.read()

    # test the recently created index outside the reader
    i = Index(io.BytesIO(idx))
    assert i.search("SwiftOnSecurity") == []
    assert i.search("BBCNews") == [1, 2, 4, 24, 25, 44], i.search("BBCNews")

    # test the filter with an index
    ri = Reader(
        inner_reader=DiskReader,
        dataset="_temp/data/tweets",
        filters="username = '******'",
    )
    ri = list(ri)
    assert len(ri) == 6
def test_reader_filters_single_filter():
    """ensure the reader filter is working as expected"""
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username == 'NBCNews'",
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 44, r.count()
def test_reader_filters_multiple_filter():
    """ensure the reader filter is working as expected"""
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username = '******' and timestamp >= '2020-01-12T07:11:04'",
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 34, r
def test_can_read_parquet():
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/formats/parquet",
        raw_path=True,
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 57581, r.count()
    assert isinstance(r.first(), dict)
def test_reader_partitions_read_without_referring_to_partition():
    """
    test that if we reference a folder with partitions (by_) without referencing
    the partition, we pick a partition and read it as if it wasn't there
    """
    DATA_DATE = datetime.date(2020, 2, 3)
    records = Reader(
        dataset="tests/data/partitioned",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert records.count() == 25, records.count()
def test_disk_text():
    try:
        w = BatchWriter(
            inner_writer=DiskWriter,
            blob_size=1024,
            format="jsonl",
            dataset="_temp/test/gcs/dataset/text",
        )
        for i in range(250):
            w.append({"index": i + 300})
        w.finalize()

        # read the files we've just written, we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=DiskReader,
            dataset="_temp/test/gcs/dataset/text",
        )
        l = list(r)
        assert len(l) == 250, len(l)
    except Exception as e:  # pragma: no cover
        raise e
def test_gcs_parquet():
    try:
        # set up the stub
        set_up()

        w = BatchWriter(
            inner_writer=GoogleCloudStorageWriter,
            project="testing",
            format="parquet",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        for i in range(100):
            w.append({"$$": i * 300})
        w.finalize()

        # read the files we've just written, we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=GoogleCloudStorageReader,
            project="testing",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        l = list(r)
        assert isinstance(l[0], dict)
        assert len(l) == 100, len(l)
    except Exception as e:  # pragma: no cover
        raise e
def test_reader_writer_format_parquet():
    do_writer_compressed("parquet")

    g = glob.glob("_temp/**/*.parquet", recursive=True)
    assert len(g) > 0, g

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
def test_reader_writer():
    do_writer()

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
def test_reader_filters_no_filter():
    """ensure the reader filter is working as expected"""
    r = Reader(inner_reader=DiskReader, dataset="tests/data/tweets/", raw_path=True)
    for index, item in enumerate(r):
        pass
    assert index == 49, index
def test_reader_writer_format_default():
    do_writer_default()

    g = glob.glob("_temp/**/*.zstd", recursive=True)
    assert len(g) > 0, g

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
def test_cursor():
    """
    Test that when we break a read in two, we read the right number of records.
    """
    import json

    test_counter = 0
    number_of_records = get_records()
    lim = number_of_records // 4 * 3

    hashes = []

    reader = Reader(inner_reader=DiskReader, dataset="tests/data/tweets/", partitions=[])
    for row in reader["tweet"].take(lim):
        hashes.append(hash(json.dumps(row)))
        test_counter += 1

    cursor = reader.cursor
    print(cursor)
    assert cursor["location"] == ((lim - 1) % 25), f"{cursor['location']}, {lim}, {(lim % 25)}"
    assert cursor["partition"] == 5122091051124077700, cursor["partition"]

    reader = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        partitions=[],
        cursor=cursor,
    )
    for i, row in enumerate(reader["tweet"].take(100)):
        hashes.append(hash(json.dumps(row)))
        test_counter += 1

    # we should have read the number of expected records
    assert number_of_records == test_counter, f"{number_of_records} - {test_counter}, {i}"
    # we shouldn't have captured any duplicates
    assert len(hashes) == len(set(hashes)), f"{len(hashes)} == {len(set(hashes))}"
def time_it(dataset, username):
    start = time.perf_counter_ns()

    reader = Reader(
        inner_reader=DiskReader,
        dataset=dataset,
        raw_path=True,
        filters=("user_name", "==", username),
    )
    res = [r for r in reader]
    print(len(res))

    return (time.perf_counter_ns() - start) / 1e9
def _inner(p, pt):
    # step back through time
    r = Reader(
        inner_reader=DiskReader,
        dataset=p,
        partitions=pt,
        start_date=datetime.date(2021, 1, 1),
        end_date=datetime.date(2021, 1, 1),
        freshness_limit="30d",
    )
    assert len(list(r)) == 50
def _inner(p, pt):
    # step back through time
    r = Reader(
        inner_reader=DiskReader,
        dataset=p,
        partitions=pt,
        start_date=datetime.date(2021, 1, 1),
        end_date=datetime.date(2021, 1, 1),
        freshness_limit="5d",
    )
    with pytest.raises(DataNotFoundError):
        assert len(list(r)) == 0
def test_ignore_flag():
    """
    test we ignore invalidated frames
    """
    DATA_DATE = datetime.date(2021, 3, 29)
    records = Reader(
        dataset="tests/data/framed",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
    )
    print(next(records))
    assert next(records).get("test") == 1
def test_ignore_flag_step_back_days():
    """
    test that we step back a day if all of the frames have been invalidated
    """
    DATA_DATE = datetime.date(2021, 3, 30)
    records = Reader(
        dataset="tests/data/framed",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
        freshness_limit="24h",
    )
    print(next(records))
def test_ignore_non_complete_frames():
    """
    test we ignore frames without the complete flag
    """
    DATA_DATE = datetime.date(2021, 3, 28)
    records = Reader(
        dataset="tests/data/framed",
        inner_reader=DiskReader,
        start_date=DATA_DATE,
        end_date=DATA_DATE,
    )
    print(next(records))
    assert next(records).get("test") == 1
def test_reader_writer_format_jsonl():
    do_writer_compressed("jsonl")

    g = glob.glob("_temp/**/*.jsonl", recursive=True)
    assert len(g) > 0, g
    c = glob.glob("_temp/**/*.complete", recursive=True)
    assert len(c) == 0, c

    r = Reader(inner_reader=DiskReader, dataset="_temp")
    l = len(list(r))
    shutil.rmtree("_temp", ignore_errors=True)
    assert l == 200000, l
def test_dataset_prefix_validator():
    with pytest.raises(AccessDenied):
        reader = Reader(dataset="dataset", valid_dataset_prefixes=["drive/"])
    with pytest.raises(AccessDenied):
        reader = Reader(dataset="dataset", valid_dataset_prefixes=["list", "of", "items"])
    # no whitelist - allow all
    reader = Reader(project="", dataset="dataset")
    # a list of one
    reader = Reader(project="", dataset="dataset", valid_dataset_prefixes=["dataset"])
    # a list of many
    reader = Reader(
        project="",
        dataset="dataset",
        valid_dataset_prefixes=["on", "the", "list", "dataset"],
    )
def test_reader_all_good():
    failed = False

    try:
        reader = Reader(
            project="",
            select="a, b",
            dataset="",
            start_date=datetime.datetime.now(),
            end_date=datetime.datetime.now(),
        )
    except InvalidReaderConfigError:
        failed = True

    assert not failed
def test_cursor_as_text():
    offsets = [1, 6, 8, 13, 22]
    for offset in offsets:
        reader = Reader(
            inner_reader=DiskReader,
            dataset="tests/data/tweets/",
            partitions=[],
            cursor='{"partition": 5122091051124077700, "location": ' + str(offset) + ', "map":"80" }',
        )
        l = list(reader)
        # 24 because we count from zero (the first row in the file is the 0th record)
        assert len(l) + offset == 24, f"{len(l) + offset} == {24}, {reader.cursor}"
def test_base():
    reader = Reader(inner_reader=DiskReader, dataset="tests/data/tweets/", partitions=[])
    # we read 50 records
    for i in range(50):
        next(reader)
        assert reader.cursor["location"] == (i % 25), reader.cursor
    # we can't read 51
    with pytest.raises(StopIteration):
        next(reader)
    # range 50 actually is 50
    assert len(range(50)) == 50
def test_move_to_cursor():
    offsets = [1, 6, 8, 13, 22]
    for offset in offsets:
        reader = Reader(inner_reader=DiskReader, dataset="tests/data/tweets/", partitions=[])
        next(reader)
        steps = 1
        while reader.cursor["location"] < offset:
            steps += 1
            next(reader)
        assert offset == reader.cursor["location"]
        l = len(list(reader))
        # we stepped offset number of records and then read l more, this should be 50
        assert steps + l == 50
def test_writer_backout():
    if Path(TEST_FOLDER).exists():  # pragma: no cover
        shutil.rmtree(TEST_FOLDER)

    w = StreamWriter(
        dataset=TEST_FOLDER,
        inner_writer=DiskWriter,
        schema=Schema(SCHEMA),
        idle_timeout_seconds=1,
    )
    for record in DATA_SET:
        w.append(record)
    time.sleep(4)

    r = Reader(dataset=TEST_FOLDER, inner_reader=DiskReader)
    assert len(list(r)) == 8
def get_data():
    r = Reader(inner_reader=DiskReader, dataset="tests/data/tweets", raw_path=True)
    return r
import os, sys

sys.path.insert(1, os.path.join(sys.path[0], "../.."))

from mabel.data import Reader
from mabel.adapters.disk import DiskReader
from mabel.logging import get_logger

get_logger().setLevel(100)

user_name = "Verizon Support"

print("indexed\t:", time_it("tests/data/index/is", user_name))
print("not indexed\t:", time_it("tests/data/index/not", user_name))
print("indexed\t:", time_it("tests/data/index/is", user_name + "bb"))
print("not indexed\t:", time_it("tests/data/index/not", user_name + "bb"))

reader = Reader(
    inner_reader=DiskReader,
    dataset="tests/data/index/is",
    raw_path=True,
)
idx = Index.build_index(reader, "user_name")
idx.dump("tests/data/index/is/tweets.jsonl.user_name.idx")
def read_jsonl(path, chunk_size=32 * 1024 * 1024, delimiter="\n"):
    # the head of this generator was missing from the source; the signature and the
    # chunked-read loop are assumptions - the split/carry-forward logic is original
    with open(path, "r", encoding="utf8") as stream:
        carry_forward = ""
        chunk = stream.read(chunk_size)
        while chunk:
            augmented_chunk = carry_forward + chunk
            lines = augmented_chunk.split(delimiter)
            carry_forward = lines.pop()
            yield from lines
            chunk = stream.read(chunk_size)
        if carry_forward:
            yield carry_forward


schema = Schema(schema_definition)
lines = read_jsonl("tests/data/index/not/tweets.jsonl")

writer = BatchWriter(
    inner_writer=DiskWriter,
    dataset="_temp/idx",
    # schema=schema,
    indexes=["user_name"],
)
for record in lines:
    writer.append(record)
writer.finalize()

reader = Reader(inner_reader=DiskReader, dataset="_temp/idx", filters=("user_name", "==", "Remy"))
i = 0
for i, r in enumerate(reader):
    print(i, r)
print(i)
def get_records():
    r = Reader(inner_reader=DiskReader, dataset="tests/data/tweets/", partitions=[])
    return len(list(r))