Esempio n. 1
0
def test_gcs_binary():
    """Write 200 records through the GCS writer, read them back and check the count."""
    # run the shared set_up() helper before touching the bucket
    set_up()

    dataset_path = f"{BUCKET_NAME}/test/gcs/dataset/binary"

    writer = BatchWriter(
        inner_writer=GoogleCloudStorageWriter,
        project="testing",
        blob_size=1024,
        dataset=dataset_path,
    )
    # index values run from 300 to 499 inclusive
    for index in range(300, 500):
        writer.append({"index": index})
    writer.finalize()

    # Read back what was just written; the reader should be able to
    # read across both partitions.
    reader = Reader(
        inner_reader=GoogleCloudStorageReader,
        project="testing",
        dataset=dataset_path,
        persistence=STORAGE_CLASS.MEMORY,
    )

    assert reader.count() == 200, reader.count()
Esempio n. 2
0
def test_reader_filters_single_filter():
    """Check the record count when a single equality filter is applied."""
    reader_options = {
        "inner_reader": DiskReader,
        "dataset": "tests/data/tweets/",
        "raw_path": True,
        "filters": "username == 'NBCNews'",
        "persistence": STORAGE_CLASS.MEMORY,
    }
    tweets = Reader(**reader_options)
    assert tweets.count() == 44, tweets.count()
def test_can_read_xml():
    """Read the XML fixture folder and check it gives back one dict record."""
    xml_reader = Reader(
        dataset="tests/data/formats/xml",
        inner_reader=DiskReader,
        persistence=STORAGE_CLASS.MEMORY,
        raw_path=True,
    )

    # exactly one record is expected from the fixture folder
    assert xml_reader.count() == 1, xml_reader.count()
    # and first() must hand that record back as a dict
    assert isinstance(xml_reader.first(), dict), xml_reader.first()
Esempio n. 4
0
def test_reader_partitions_read_without_referring_to_partition():
    """
    When a dataset folder contains partitions (by_) and the reader is not
    told about them, a partition is picked and read as if the partitioning
    were not there.
    """
    single_day = datetime.date(2020, 2, 3)
    reader_options = {
        "dataset": "tests/data/partitioned",
        "inner_reader": DiskReader,
        # start and end are the same date, so only one day is requested
        "start_date": single_day,
        "end_date": single_day,
        "persistence": STORAGE_CLASS.MEMORY,
    }
    partitioned = Reader(**reader_options)
    assert partitioned.count() == 25, partitioned.count()
Esempio n. 5
0
def test_reader_filters_multiple_filter():
    """Check the record count when two filter conditions are joined with `and`."""
    # NOTE(review): the username literal reads '******', which looks like it
    # may have been masked - confirm it against the fixture data.
    r = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets/",
        raw_path=True,
        filters="username = '******' and timestamp >= '2020-01-12T07:11:04'",
        persistence=STORAGE_CLASS.MEMORY,
    )
    # Capture the count once so the failure message shows the exact value
    # that was compared, rather than the Reader object itself.
    count = r.count()
    assert count == 34, count
Esempio n. 6
0
def test_reader_partitions_read_referring_to_specific_partition():
    """
    Read a partitioned dataset while naming the partition scheme explicitly
    and supplying a partition_filter, then check the record count for each
    filter. The last filter is expected to raise DataNotFoundError.
    """
    data_date = datetime.date(2020, 2, 3)

    def read_partition(field, value):
        # Every case shares the dataset, date range and partition scheme;
        # only the partition_filter differs.
        return Reader(
            dataset="tests/data/partitioned",
            partitions=["year_{yyyy}/month_{mm}/day_{dd}"],
            partition_filter=(field, "=", value),
            inner_reader=DiskReader,
            start_date=data_date,
            end_date=data_date,
            persistence=STORAGE_CLASS.MEMORY,
        )

    count = read_partition("userid", "14173315").count()
    assert count == 5, count

    count = read_partition("username", "BBCNews").count()
    assert count == 4, count

    # Construction and count() both stay inside the block so the error is
    # caught whichever of them raises it. No count assertion is made here:
    # reaching a comparison would mean the expected error never happened,
    # and pytest.raises already fails the test in that case.
    with pytest.raises(DataNotFoundError):
        read_partition("username", "CNNNews").count()