def test_hourly_partitions():
    nw = BatchWriter(
        inner_writer=NullWriter,
        dataset="bucket/path",
        partitions=["year_{yyyy}/month_{mm}/day_{dd}/by_hour/hour={HH}"],
    )
    for i in range(1):
        nw.append({"@": [" "] * BLOB_SIZE})
    res = nw.finalize()
    assert "by_hour/hour=" in res

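# A minimal sketch, not mabel's implementation: it illustrates how a partition
# template such as "year_{yyyy}/month_{mm}/day_{dd}/by_hour/hour={HH}" could be
# expanded for a given timestamp by simple token substitution, which is what the
# assertion above relies on. `expand_partition` is a hypothetical helper added
# only for illustration.
def expand_partition(template, when):
    # substitute the date/time tokens used in the partition template
    return (
        template.replace("{yyyy}", f"{when.year:04d}")
        .replace("{mm}", f"{when.month:02d}")
        .replace("{dd}", f"{when.day:02d}")
        .replace("{HH}", f"{when.hour:02d}")
    )

# expand_partition("year_{yyyy}/month_{mm}/day_{dd}/by_hour/hour={HH}",
#                  datetime.datetime(2024, 1, 2, 3, 0))
# -> "year_2024/month_01/day_02/by_hour/hour=03"
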
def do_writer_default():
    w = BatchWriter(inner_writer=DiskWriter, dataset="_temp")
    for i in range(int(1e5)):
        w.append({"Barney Stinson": "Lorenzo Von Matterhorn"})
        w.append({"Laszlo Cravensworth": "Jackie Daytona"})
    w.finalize()
    del w

def do_writer_compressed(algo):
    w = BatchWriter(inner_writer=DiskWriter, dataset="_temp", format=algo)
    for i in range(int(1e5)):
        w.append({"test": True})
        w.append({"test": False})
    w.finalize()
    del w

def do_writer_abs():
    w = BatchWriter(
        inner_writer=DiskWriter,
        dataset=os.getcwd() + "/_temp",
        date=datetime.date.today(),
    )
    for i in range(int(1e5)):
        w.append({"Barney Stinson": "Lorenzo Von Matterhorn"})
        w.append({"Laszlo Cravensworth": "Jackie Daytona"})
    w.finalize()

def test_index():
    # step back through time
    shutil.rmtree("_temp/data/tweets", ignore_errors=True)
    r = Reader(inner_reader=DiskReader, dataset="tests/data/tweets", raw_path=True)
    w = BatchWriter(inner_writer=DiskWriter, dataset="_temp/data/tweets", index_on=["username"])
    for item in r:
        w.append(item)
    w.finalize()

    index = glob.glob("_temp/data/tweets/**/*username.idx", recursive=True)
    assert len(index) == 1, index
    with open(index[0], "rb") as f:
        idx = f.read()

    # test the recently created index outside the reader
    i = Index(io.BytesIO(idx))
    assert i.search("SwiftOnSecurity") == []
    assert i.search("BBCNews") == [1, 2, 4, 24, 25, 44], i.search("BBCNews")

    # test the filter with an index
    ri = Reader(
        inner_reader=DiskReader,
        dataset="_temp/data/tweets",
        filters="username = '******'",
    )
    ri = list(ri)
    assert len(ri) == 6

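# A minimal sketch, not mabel's Index format: it shows the idea the assertions
# above rely on -- an index on a column maps each distinct value to the ordinals
# of the rows containing it, so a search returns row positions (or an empty list
# when the value is absent). `build_index` is a hypothetical helper for
# illustration only.
from collections import defaultdict

def build_index(rows, column):
    # map each value of `column` to the list of row ordinals it appears in
    index = defaultdict(list)
    for ordinal, row in enumerate(rows):
        index[row.get(column)].append(ordinal)
    return index

# build_index([{"username": "BBCNews"}, {"username": "other"}], "username")["BBCNews"]
# -> [0]
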
def test_disk_text():
    try:
        w = BatchWriter(
            inner_writer=DiskWriter,
            blob_size=1024,
            format="jsonl",
            dataset="_temp/test/gcs/dataset/text",
        )
        for i in range(250):
            w.append({"index": i + 300})
        w.finalize()

        # read the files we've just written, we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=DiskReader,
            dataset="_temp/test/gcs/dataset/text",
        )
        l = list(r)
        assert len(l) == 250, len(l)
    except Exception as e:  # pragma: no cover
        raise e

def do_writer():
    w = BatchWriter(inner_writer=FileWriter, dataset="tests/data/framed", date=datetime.date.today())
    for i in range(int(1e5)):
        w.append({"test": 2})
    w.finalize()

def test_gcs_text():
    # set up
    set_up()

    w = BatchWriter(
        inner_writer=GoogleCloudStorageWriter,
        project="testing",
        blob_size=1024,
        format="jsonl",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
    )
    for i in range(250):
        w.append({"index": i + 300})
    w.finalize()

    # read the files we've just written, we should be able to
    # read over both partitions.
    r = Reader(
        inner_reader=GoogleCloudStorageReader,
        project="testing",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
        persistence=STORAGE_CLASS.MEMORY,
    )
    assert r.count() == 250, r

def test_gcs_parquet():
    try:
        # set up the stub
        set_up()

        w = BatchWriter(
            inner_writer=GoogleCloudStorageWriter,
            project="testing",
            format="parquet",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        for i in range(100):
            w.append({"$$": i * 300})
        w.finalize()

        # read the files we've just written, we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=GoogleCloudStorageReader,
            project="testing",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        l = list(r)
        assert isinstance(l[0], dict)
        assert len(l) == 100, len(l)
    except Exception as e:  # pragma: no cover
        raise e

def execute_test(compress, schema, reader):
    # reader = read_jsonl('tweets.jsonl')
    res = []
    for i in range(10):
        writer = BatchWriter(
            inner_writer=NullWriter,
            dataset="_tests/{datefolders}",
            format=compress,
            schema=schema,
            metadata={"test_data": True},
        )
        start = time.perf_counter_ns()
        for record in reader:
            writer.append(record)
        writer.finalize()
        res.append((time.perf_counter_ns() - start) / 1e9)
    return statistics.mean(res)

def test_using_batch_writer():
    errored = False
    # try:
    if True:
        _create_bucket()
        w = BatchWriter(
            inner_writer=MinIoWriter,
            end_point=os.getenv("MINIO_END_POINT"),
            access_key=os.getenv("MINIO_ACCESS_KEY"),
            secret_key=os.getenv("MINIO_SECRET_KEY"),
            secure=False,
            dataset=f"{BUCKET_NAME}/test_writer",
        )
        for member in VAMPIRIC_COUNCIL:
            w.append(member)
        w.finalize()
    # except Exception as a:
    #     print(a)
    #     errored = True

    assert not errored

    while len(chunk) > 0:
        chunk = f.read(chunk_size)
        augmented_chunk = carry_forward + chunk
        lines = augmented_chunk.split(delimiter)
        carry_forward = lines.pop()
        yield from lines
    if carry_forward:
        yield carry_forward


schema = Schema(schema_definition)

lines = read_jsonl("tests/data/index/not/tweets.jsonl")

writer = BatchWriter(
    inner_writer=DiskWriter,
    dataset="_temp/idx",
    # schema=schema,
    indexes=["user_name"],
)
for record in lines:
    writer.append(record)
writer.finalize()

reader = Reader(
    inner_reader=DiskReader,
    dataset="_temp/idx",
    filters=("user_name", "==", "Remy"),
)
i = 0
for i, r in enumerate(reader):
    print(i, r)
print(i)

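# A self-contained sketch of the carry-forward chunked-read technique that the
# read_jsonl fragment above belongs to. The name `chunked_jsonl_reader`, the
# (path, chunk_size, delimiter) signature, and the json.loads parsing step are
# assumptions for illustration only -- the project's own helper may differ.
import json

def chunked_jsonl_reader(path, chunk_size=64 * 1024, delimiter=b"\n"):
    with open(path, "rb") as f:
        carry_forward = b""
        chunk = b" "  # non-empty so the loop is entered
        while len(chunk) > 0:
            chunk = f.read(chunk_size)
            augmented_chunk = carry_forward + chunk
            lines = augmented_chunk.split(delimiter)
            # the last element may be a partial line; hold it for the next chunk
            carry_forward = lines.pop()
            for line in lines:
                if line:
                    yield json.loads(line)
        # anything left over after the final read is a complete last record
        if carry_forward:
            yield json.loads(carry_forward)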