def test_gcs_binary():
    """Write 200 records to GCS through BatchWriter and read them all back."""
    # prepare the storage environment before writing anything
    set_up()

    dataset_path = f"{BUCKET_NAME}/test/gcs/dataset/binary"

    # NOTE(review): the small blob_size is presumably chosen so the records
    # spill across more than one blob - confirm against BatchWriter.
    writer = BatchWriter(
        inner_writer=GoogleCloudStorageWriter,
        project="testing",
        blob_size=1024,
        dataset=dataset_path,
    )
    for index in range(300, 500):
        writer.append({"index": index})
    writer.finalize()

    # read back what was just written; every record should be visible
    # regardless of how the writer split the output.
    reader = Reader(
        inner_reader=GoogleCloudStorageReader,
        project="testing",
        dataset=dataset_path,
        persistence=STORAGE_CLASS.MEMORY,
    )

    record_count = reader.count()
    assert record_count == 200, record_count
def test_reader_filters_single_filter():
    """A single equality filter on username should leave 44 tweets."""
    reader = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username == 'NBCNews'",
        persistence=STORAGE_CLASS.MEMORY,
    )

    matched = reader.count()
    assert matched == 44, matched
def test_can_read_xml():
    """The XML sample dataset should yield exactly one record, as a dict."""
    reader = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/formats/xml",
        raw_path=True,
        persistence=STORAGE_CLASS.MEMORY,
    )

    record_count = reader.count()
    assert record_count == 1, record_count

    first_record = reader.first()
    assert isinstance(first_record, dict), first_record
def test_reader_partitions_read_without_referring_to_partition():
    """
    Read a dataset folder that contains partitions (by_) without naming a
    partition; the reader is expected to pick one and read it as though the
    partitioning were not there.
    """
    target_day = datetime.date(2020, 2, 3)

    reader = Reader(
        dataset="tests/data/partitioned",
        inner_reader=DiskReader,
        start_date=target_day,
        end_date=target_day,
        persistence=STORAGE_CLASS.MEMORY,
    )

    record_count = reader.count()
    assert record_count == 25, record_count
def test_reader_filters_multiple_filter():
    """Two filter conditions joined with `and` should leave 34 tweets."""
    reader = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username = '******' and timestamp >= '2020-01-12T07:11:04'",
        persistence=STORAGE_CLASS.MEMORY,
    )

    matched = reader.count()
    # the failure message is the reader itself, not the count
    assert matched == 34, reader
def test_reader_partitions_read_referring_to_specific_partition():
    """
    Read the partitioned dataset with an explicit partition layout and a
    partition_filter, checking the record count for each filter value; a
    filter value that matches nothing is expected to raise DataNotFoundError.
    """
    target_day = datetime.date(2020, 2, 3)

    def read_filtered(partition_filter):
        # build a fresh reader for the fixed day with the given filter
        return Reader(
            dataset="tests/data/partitioned",
            partitions=["year_{yyyy}/month_{mm}/day_{dd}"],
            partition_filter=partition_filter,
            inner_reader=DiskReader,
            start_date=target_day,
            end_date=target_day,
            persistence=STORAGE_CLASS.MEMORY,
        )

    by_userid = read_filtered(("userid", "=", "14173315"))
    userid_count = by_userid.count()
    assert userid_count == 5, userid_count

    by_username = read_filtered(("username", "=", "BBCNews"))
    username_count = by_username.count()
    assert username_count == 4, username_count

    # NOTE(review): the error may come from building the reader or from
    # counting it, so both stay inside the raises block.
    with pytest.raises(DataNotFoundError):
        no_match = read_filtered(("username", "=", "CNNNews"))
        assert no_match.count() == 0, no_match.count()