# Example 1 (score: 0)
def test_no_data(tmpdir):
    """An avro file with a schema but zero records must read back as an
    empty frame that still carries the schema's column names and dtypes."""
    filepath = tmpdir + "no_data.avro"
    weather_schema = {
        "name": "Weather",
        "type": "record",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    parsed = fa.parse_schema(weather_schema)
    with open(filepath, "wb") as out:
        fa.writer(out, parsed, [])

    df = cudf.read_avro(filepath)

    # fastavro returns an empty dataframe, so the shape, dtypes and
    # column names have to be verified manually.
    assert_eq(df.shape, (0, 3))
    assert_eq(
        df.dtypes.values.tolist(),
        [np.dtype("O"), np.dtype("int64"), np.dtype("int32")],
    )
    assert_eq(df.columns.tolist(), ["station", "time", "temp"])
# Example 2 (score: 0)
def cudf_from_avro_util(schema, records):
    """Serialize *records* under *schema* with fastavro into an in-memory
    buffer, then read it back through ``cudf.read_avro``.

    A ``None`` schema is passed through to the writer as an empty list,
    matching fastavro's "no schema" convention used by these tests.
    """
    parsed = fastavro.parse_schema(schema) if schema is not None else []
    out = io.BytesIO()
    fastavro.writer(out, parsed, records)
    out.seek(0)
    return cudf.read_avro(out)
# Example 3 (score: 0)
def test_empty_dataframe(tmpdir):
    """A completely schemaless, recordless avro file reads back as an
    empty cudf DataFrame."""
    filepath = tmpdir + "empty.avro"
    # Write a file with neither schema nor records.
    with open(filepath, "wb") as out:
        fa.writer(out, [], [])

    result = cudf.read_avro(filepath)
    assert_eq(result, cudf.DataFrame())
# Example 4 (score: 0)
def avro_reader_test(input_tuple, columns, skiprows, num_rows):
    """Check ``cudf.read_avro`` against a pandas reference frame, applying
    the same ``skiprows``/``num_rows`` window to the expected result."""
    pdf, avro_buffer = input_tuple

    expected = pdf[skiprows:]
    if num_rows is not None:
        expected = expected.head(num_rows)
    # Any row trimming invalidates the original index, so rebuild it.
    if not (skiprows is None and num_rows is None):
        expected = expected.reset_index(drop=True)

    actual = cudf.read_avro(
        avro_buffer, columns=columns, skiprows=skiprows, num_rows=num_rows
    )
    compare_dataframe(expected, actual)
def test_avro_compression(rows, codec):
    """Round-trip a random int/string dataframe through a compressed avro
    buffer and verify cudf decodes it back to the original data."""
    root_schema = {
        "name": "root",
        "type": "record",
        "fields": [
            {"name": "0", "type": "int"},
            {"name": "1", "type": "string"},
        ],
    }

    source = rand_dataframe(
        [
            {"dtype": "int32", "null_frequency": 0, "cardinality": 1000},
            {
                "dtype": "str",
                "null_frequency": 0,
                "cardinality": 100,
                "max_string_length": 10,
            },
        ],
        rows,
    )
    expected_df = cudf.DataFrame.from_arrow(source)

    # fastavro consumes plain dicts, one per row.
    records = source.to_pandas().to_dict(orient="records")

    buffer = io.BytesIO()
    fastavro.writer(buffer, root_schema, records, codec=codec)
    buffer.seek(0)

    got_df = cudf.read_avro(buffer)
    assert_eq(expected_df, got_df)
# Example 6 (score: 0)
def test_read_avro(datadir, hdfs, test_url):
    """Upload an avro sample to HDFS and verify cudf can read it back via
    an hdfs:// URL (with or without explicit host:port)."""
    fname = datadir / "avro" / "example.avro"
    # Load the local sample into memory and push it to HDFS.
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    hdfs.upload(basedir + "/file.avro", buffer)

    # assumes `host`, `port` and `basedir` are module-level fixtures/globals
    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro"
    else:
        hd_fpath = f"hdfs://{basedir}/file.avro"

    got = cudf.read_avro(hd_fpath)

    # Build the pandas reference from the same local file.
    with open(fname, mode="rb") as f:
        expect = pd.DataFrame.from_records(fa.reader(f))

    # Align dtypes before comparing (fastavro widens avro ints to int64).
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)
    assert_eq(expect, got)
# Example 7 (score: 0)
def test_avro_reader_basic(datadir, inputfile, columns, engine):
    """Compare cudf's avro reader against a fastavro/pandas reference.

    Fixes over the previous version:
    - the file handle passed to ``fa.reader`` was never closed (leak);
    - the exception type was matched by class-name string compare;
    - any unexpected exception was merely printed, after which the
      undefined ``reader`` variable raised a NameError that masked the
      real failure — unexpected errors now propagate naturally.
    """
    path = datadir / inputfile
    try:
        # fa.reader is lazy, so materialize the records while the file
        # handle is still open.
        with open(path, "rb") as f:
            expect = pd.DataFrame.from_records(fa.reader(f))
    except FileNotFoundError:
        pytest.skip(".avro file is not found")

    got = cudf.read_avro(path, engine=engine, columns=columns)

    # PANDAS uses NaN to represent invalid data, which forces float dtype
    # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
    # FASTAVRO produces int64 columns from avro int32 dtype, so convert
    # it back to int32 here
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got, check_categorical=False)