Beispiel #1
0
def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
    """Check that chunked ORC writes round-trip for a statistics setting.

    A fixed subset of columns is read from the reference file with pyarrow,
    written twice through a single ``ORCWriter`` created with
    ``statistics=stats_freq``, and the written file is read back with pyarrow
    and compared against the input frame concatenated with itself.

    The test is skipped when opening the reference file raises
    ``pa.ArrowIOError``; any other exception from opening it propagates.
    """
    reference_file = "TestOrcFile.test1.orc"
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("chunked_gdf.orc")

    # Only an I/O failure means the reference data is unavailable. Any other
    # exception is a real error and must surface here; swallowing it would
    # leave ``orcfile`` unbound and fail later with a misleading NameError.
    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except pa.ArrowIOError:
        pytest.skip(".orc file is not found")

    columns = [
        "boolean1",
        "byte1",
        "short1",
        "int1",
        "long1",
        "float1",
        "double1",
    ]
    pdf = orcfile.read(columns=columns).to_pandas()
    gdf = cudf.from_pandas(pdf)
    # Two identical chunks are written, so the expected result is the input
    # stacked on itself with a fresh 0..n-1 index.
    expect = pd.concat([pdf, pdf]).reset_index(drop=True)

    writer = ORCWriter(gdf_fname, statistics=stats_freq)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(gdf_fname).read().to_pandas()

    assert_eq(expect, got)
Beispiel #2
0
def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns,
                            compression):
    """Check that chunked ORC writes round-trip for a compression setting.

    The requested ``columns`` of ``reference_file`` are read with pyarrow,
    written twice through a single ``ORCWriter`` created with
    ``compression=compression``, and the same columns are read back from the
    written file with pyarrow and compared against the input frame
    concatenated with itself.

    The test is skipped when opening the reference file raises
    ``pa.ArrowIOError``; any other exception from opening it propagates.
    """
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("chunked_gdf.orc")

    # Only an I/O failure means the reference data is unavailable. Any other
    # exception is a real error and must surface here; swallowing it would
    # leave ``orcfile`` unbound and fail later with a misleading NameError.
    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except pa.ArrowIOError:
        pytest.skip(".orc file is not found")

    pdf = orcfile.read(columns=columns).to_pandas()
    gdf = cudf.from_pandas(pdf)
    # Two identical chunks are written, so the expected result is the input
    # stacked on itself with a fresh 0..n-1 index.
    expect = pd.concat([pdf, pdf]).reset_index(drop=True)

    writer = ORCWriter(gdf_fname, compression=compression)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()

    assert_eq(expect, got)
Beispiel #3
0
def test_chunked_orc_writer_strings(tmpdir, dtypes):
    """Round-trip a random-data frame through two chunked ORC writes.

    A 10-row frame is generated by ``cudf.datasets.randomdata`` with the
    given ``dtypes`` and a fixed seed, written twice through one
    ``ORCWriter``, and the file is read back with pyarrow and compared
    against the pandas copy of the frame concatenated with itself.
    """
    out_path = tmpdir.join("chunked_gdf_strings.orc")

    source_gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
    source_pdf = source_gdf.to_pandas()

    # Expected content: the same frame stacked twice with a fresh index.
    stacked = pd.concat([source_pdf, source_pdf])
    expected = stacked.reset_index(drop=True)

    orc_writer = ORCWriter(out_path)
    for _ in range(2):
        orc_writer.write_table(source_gdf)
    orc_writer.close()

    written = pa.orc.ORCFile(out_path)
    actual = written.read().to_pandas()

    assert_eq(expected, actual)
Beispiel #4
0
def test_chunked_orc_writer_lists():
    """Round-trip list columns through two chunked writes to a BytesIO.

    The input frame has column ``ls`` (two-element lists of strings) and
    column ``ld`` (five-element lists built from ``dec(i / 2)``). It is
    written twice through one ``ORCWriter`` into an in-memory buffer, which
    is then read back with pyarrow and compared against the input frame
    concatenated with itself.
    """
    row_count = 12345
    ls_values = [[str(i), str(2 * i)] for i in range(row_count)]
    ld_values = [[dec(i / 2)] * 5 for i in range(row_count)]
    source_pdf = pd.DataFrame({"ls": ls_values, "ld": ld_values})

    source_gdf = cudf.from_pandas(source_pdf)

    # Expected content: the same frame stacked twice with a fresh index.
    stacked = pd.concat([source_pdf, source_pdf])
    expected = stacked.reset_index(drop=True)

    sink = BytesIO()
    orc_writer = ORCWriter(sink)
    for _ in range(2):
        orc_writer.write_table(source_gdf)
    orc_writer.close()

    written = pa.orc.ORCFile(sink)
    actual = written.read().to_pandas()
    assert_eq(expected, actual)