import gc

import cudf
from cudf.io.parquet import ParquetWriter
from tqdm import tqdm


def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists):
    # _convert_lists is a sibling helper defined in the same module
    out = []
    i = 0
    # output_dir is assumed to end with a path separator
    w = ParquetWriter(output_dir + file.split("/")[-1].split(".")[0] +
                      ".parquet",
                      index=False)
    for tfrecord in tqdm(tfrecords):
        # Materialize each TFRecord example as a plain dict of numpy values
        row = {key: val.numpy() for key, val in tfrecord.items()}
        out.append(row)
        i += 1
        if i == chunks:
            # Flush the accumulated rows as one row group
            df = cudf.DataFrame(out)
            if convert_lists:
                df = _convert_lists(df)
            w.write_table(df)
            i = 0
            out = []
            del df
            gc.collect()
    # Write any remaining rows
    if len(out) > 0:
        df = cudf.DataFrame(out)
        if convert_lists:
            df = _convert_lists(df)
        w.write_table(df)
        del df
        gc.collect()
    w.close()
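The helper above expects an iterable of parsed TFRecord examples whose values expose `.numpy()`. A minimal usage sketch, assuming TensorFlow; the feature spec, file paths, and chunk size below are hypothetical placeholders, not part of the example itself:

import tensorflow as tf

# Hypothetical feature spec -- adjust to the actual TFRecord contents.
features = {
    "user_id": tf.io.FixedLenFeature([], tf.int64),
    "label": tf.io.FixedLenFeature([], tf.float32),
}

def _parse(proto):
    return tf.io.parse_single_example(proto, features)

file = "data/train.tfrecords"
tfrecords = tf.data.TFRecordDataset(file).map(_parse)

# output_dir must end with a separator, since _to_parquet concatenates it
# directly with the derived file name.
_to_parquet(tfrecords, file, "output/", chunks=100_000, convert_lists=False)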
Example 2
def test_parquet_writer_bytes_io(simple_gdf):
    output = BytesIO()

    writer = ParquetWriter(output)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    writer.close()

    assert_eq(cudf.read_parquet(output), cudf.concat([simple_gdf, simple_gdf]))
Example 3
def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf):
    gdf_fname = tmpdir.join("gdf.parquet")

    writer = ParquetWriter(gdf_fname)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    writer.close()

    assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf]))
Example 4
import os

import pyarrow as pa
import pyarrow.parquet as pq
from cudf.io.parquet import ParquetWriter


def _write_gb_stats(dfs, base_path, col_group, on_host, concat_groups,
                    name_sep):
    # _make_name is a module-level helper that joins column names with name_sep
    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]
    if isinstance(col_group, str):
        col_group = [col_group]

    rel_path = "cat_stats.%s.parquet" % (_make_name(*col_group, sep=name_sep))
    path = os.path.join(base_path, rel_path)
    pwriter = None
    pa_schema = None
    if not on_host:
        # GPU path: open a cuDF ParquetWriter up front
        pwriter = ParquetWriter(path, compression=None)

    # Loop over dfs and append to file
    # TODO: For high-cardinality columns, should support
    #       Dask-based to_parquet call here (but would need to
    #       support directory reading within dependent ops)
    n_writes = 0
    for df in dfs:
        if len(df):
            if on_host:
                # Use pyarrow
                pa_table = pa.Table.from_pandas(df,
                                                schema=pa_schema,
                                                preserve_index=False)
                if pwriter is None:
                    # Lazily create the writer from the first table's schema
                    pa_schema = pa_table.schema
                    pwriter = pq.ParquetWriter(path,
                                               pa_schema,
                                               compression=None)
                pwriter.write_table(pa_table)
            else:
                # Use CuDF
                df.reset_index(drop=True, inplace=True)
                pwriter.write_table(df)
            n_writes += 1

    # No data to write
    if n_writes == 0:
        raise RuntimeError("GroupbyStatistics result is empty.")

    # Close writer and return path
    pwriter.close()

    return path
Example 5
def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf):
    gdf_fname = tmpdir.join("gdf.parquet")
    test_path = "test/path"

    writer = ParquetWriter(gdf_fname)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    meta_byte_array = writer.close(metadata_file_path=test_path)
    fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata

    assert fmd.num_rows == 2 * len(simple_gdf)
    assert fmd.num_row_groups == 2

    for r in range(fmd.num_row_groups):
        for c in range(fmd.num_columns):
            assert fmd.row_group(r).column(c).file_path == test_path
Example 6

import nvtabular as nvt
from cudf.io.parquet import ParquetWriter


def _convert_file(path, name, out_dir, gpu_mem_frac, fs, cols, dtypes):
    fn = f"{name}.parquet"
    out_path = fs.sep.join([out_dir, fn])
    writer = ParquetWriter(out_path, compression=None)
    # Stream the tab-separated CSV in GPU-memory-sized partitions and
    # append each partition to the same Parquet file
    for gdf in nvt.Dataset(
            path,
            engine="csv",
            names=cols,
            part_memory_fraction=gpu_mem_frac,
            sep='\t',
            dtypes=dtypes,
    ).to_iter():
        writer.write_table(gdf)
        del gdf
    # Close the writer and return the Parquet footer metadata for this file
    md = writer.close(metadata_file_path=fn)
    return md
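Since close(metadata_file_path=fn) returns the footer metadata for the file just written, the per-file blobs can later be combined into a single _metadata file. A sketch, assuming cudf exposes a merge_parquet_filemetadata helper and using hypothetical (path, name) pairs alongside the fs, cols, and dtypes arguments from above:

from cudf.io.parquet import merge_parquet_filemetadata  # assumed helper

# Hypothetical inputs: one (path, name) pair per source CSV.
metadata = [
    _convert_file(path, name, out_dir, 0.1, fs, cols, dtypes)
    for path, name in files
]

merged = merge_parquet_filemetadata(metadata)
with fs.open(fs.sep.join([out_dir, "_metadata"]), "wb") as f:
    f.write(memoryview(merged))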