Example #1
    def get_pq_writer(self, prefix, s, mem):
        # Reuse a cached ParquetWriter per prefix, creating one lazily on first use.
        pw, fil = self.pq_writer_cache.get(prefix, (None, None))
        if pw is None:
            if mem:
                # In-memory mode: write the Parquet stream into a BytesIO buffer.
                fil = BytesIO()
                pw = ParquetWriter(fil, compression=None)
                self.pq_writer_cache[prefix] = (pw, fil)
            else:
                # On-disk mode: write to a uniquely named file derived from the prefix.
                outfile_id = guid() + ".parquet"
                full_path = ".".join([prefix, outfile_id])
                pw = ParquetWriter(full_path, compression=None)
                self.pq_writer_cache[prefix] = (pw, full_path)
        return pw
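Example #1 only opens and caches writers; the cache has to be flushed separately so that each Parquet footer actually gets written. A minimal sketch of such a close-out step, assuming the same pq_writer_cache dict as above (the helper name is hypothetical, not part of the original project):

def close_pq_writers(pq_writer_cache):
    # Close every cached ParquetWriter so its footer is written, then reset the cache.
    # For in-memory entries, the BytesIO buffer still holds the finished Parquet bytes.
    for pw, target in pq_writer_cache.values():
        pw.close()
    pq_writer_cache.clear()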
Example #2
def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists):
    out = []
    i = 0
    # One Parquet output file per input TFRecord file, named after its basename.
    w = ParquetWriter(output_dir + file.split("/")[-1].split(".")[0] +
                      ".parquet",
                      index=False)
    for tfrecord in tqdm(tfrecords):
        row = {key: val.numpy() for key, val in tfrecord.items()}
        out.append(row)
        i += 1
        if i == chunks:
            df = cudf.DataFrame(out)
            if convert_lists:
                df = _convert_lists(df)
            w.write_table(df)
            i = 0
            out = []
            del df
            gc.collect()
    if len(out) > 0:
        df = cudf.DataFrame(out)
        if convert_lists:
            df = _convert_lists(df)
        w.write_table(df)
        del df
        gc.collect()
    w.close()
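A hypothetical call site for _to_parquet above, only to show the expected shape of its arguments; the feature spec, paths, and chunk size are made up for illustration and assume TensorFlow is running eagerly so .numpy() works on the parsed tensors:

import tensorflow as tf

feature_spec = {
    "user_id": tf.io.FixedLenFeature([], tf.int64),
    "item_id": tf.io.FixedLenFeature([], tf.int64),
    "label": tf.io.FixedLenFeature([], tf.float32),
}

file = "/data/tfrecords/train.tfrecord"
parsed = tf.data.TFRecordDataset(file).map(
    lambda rec: tf.io.parse_single_example(rec, feature_spec)
)
# Each parsed record is a dict of eager tensors, matching what _to_parquet iterates over.
_to_parquet(parsed, file, "/data/parquet/", chunks=100_000, convert_lists=False)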
Example #3
def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf):
    gdf_fname = tmpdir.join("gdf.parquet")
    test_path = "test/path"

    writer = ParquetWriter(gdf_fname)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    meta_byte_array = writer.close(metadata_file_path=test_path)
    fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata

    assert fmd.num_rows == 2 * len(simple_gdf)
    assert fmd.num_row_groups == 2

    for r in range(fmd.num_row_groups):
        for c in range(fmd.num_columns):
            assert fmd.row_group(r).column(c).file_path == test_path
Example #4
def _convert_file(path, name, out_dir, gpu_mem_frac, fs, cols, dtypes):
    fn = f"{name}.parquet"
    out_path = fs.sep.join([out_dir, fn])
    # Stream the CSV through GPU memory in chunks, appending each chunk to one Parquet file.
    writer = ParquetWriter(out_path, compression=None)
    for gdf in nvt.Dataset(
            path,
            engine="csv",
            names=cols,
            part_memory_fraction=gpu_mem_frac,
            sep='\t',
            dtypes=dtypes,
    ).to_iter():
        writer.write_table(gdf)
        del gdf
    md = writer.close(metadata_file_path=fn)
    return md
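The metadata blob returned by writer.close(metadata_file_path=...) in Examples #3 and #4 can be combined across output files into a dataset-level _metadata file. A self-contained sketch of that pattern, assuming cudf exposes merge_parquet_filemetadata (as dask_cudf uses); the data and file names are arbitrary:

import cudf
from cudf.io.parquet import ParquetWriter

df = cudf.DataFrame({"a": [1, 2, 3]})
md_list = []
for i in range(2):
    fn = f"part.{i}.parquet"
    w = ParquetWriter(fn, compression=None)
    w.write_table(df)
    # close() returns the footer metadata with file_path set to fn
    md_list.append(w.close(metadata_file_path=fn))

merged = cudf.io.merge_parquet_filemetadata(md_list)
with open("_metadata", "wb") as f:
    f.write(memoryview(merged))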
Example #5
def test_parquet_writer_bytes_io(simple_gdf):
    output = BytesIO()

    writer = ParquetWriter(output)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    writer.close()

    assert_eq(cudf.read_parquet(output), cudf.concat([simple_gdf, simple_gdf]))
Example #6
def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf):
    gdf_fname = tmpdir.join("gdf.parquet")

    writer = ParquetWriter(gdf_fname)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    writer.close()

    assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf]))
Example #7
    def __init__(self, out_dir, num_out_files=30, num_threads=4):
        self.queue = queue.Queue(num_threads)
        self.write_locks = [threading.Lock() for _ in range(num_out_files)]
        self.writer_files = [os.path.join(out_dir, f"{i}.parquet") for i in range(num_out_files)]
        self.writers = [ParquetWriter(f, compression=None) for f in self.writer_files]
        self.b_idxs = np.arange(num_out_files)
        self.num_threads = num_threads
        self.num_out_files = num_out_files

        # sentinel signaling end-of-data: a worker thread that receives it should shut down
        self._eod = object()

        for _ in range(num_threads):
            write_thread = threading.Thread(target=self._write_thread, daemon=True)
            write_thread.start()
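The _write_thread target referenced in the constructor is not part of this snippet; a hedged sketch of what such a worker could look like, assuming the queue carries (writer_index, DataFrame) pairs plus the _eod sentinel:

    def _write_thread(self):
        while True:
            item = self.queue.get()
            try:
                if item is self._eod:
                    # Sentinel received: stop consuming.
                    break
                idx, df = item
                # Serialize writes to the same output file across threads.
                with self.write_locks[idx]:
                    self.writers[idx].write_table(df)
            finally:
                self.queue.task_done()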
Example #8
def _write_gb_stats(dfs, base_path, col_group, on_host, concat_groups,
                    name_sep):
    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]
    if isinstance(col_group, str):
        col_group = [col_group]

    rel_path = "cat_stats.%s.parquet" % (_make_name(*col_group, sep=name_sep))
    path = os.path.join(base_path, rel_path)
    pwriter = None
    pa_schema = None
    if not on_host:
        pwriter = ParquetWriter(path, compression=None)

    # Loop over dfs and append to file
    # TODO: For high-cardinality columns, should support
    #       Dask-based to_parquet call here (but would need to
    #       support directory reading within dependent ops)
    n_writes = 0
    for df in dfs:
        if len(df):
            if on_host:
                # Use pyarrow
                pa_table = pa.Table.from_pandas(df,
                                                schema=pa_schema,
                                                preserve_index=False)
                if pwriter is None:
                    pa_schema = pa_table.schema
                    pwriter = pq.ParquetWriter(path,
                                               pa_schema,
                                               compression=None)
                pwriter.write_table(pa_table)
            else:
                # Use CuDF
                df.reset_index(drop=True, inplace=True)
                pwriter.write_table(df)
            n_writes += 1

    # No data to write
    if n_writes == 0:
        raise RuntimeError("GroupbyStatistics result is empty.")

    # Close writer and return path
    pwriter.close()

    return path
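The on_host branch in Example #8 captures pa_schema from the first non-empty chunk because pyarrow's ParquetWriter must be opened with a schema and every subsequent table has to match it. A small CPU-only illustration of that pattern (the data and file name are made up):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

chunks = [
    pd.DataFrame({"cat": ["a", "b"], "count": [1, 2]}),
    pd.DataFrame({"cat": ["c"], "count": [3]}),
]

writer = None
schema = None
for chunk in chunks:
    # Reuse the schema captured from the first chunk for all later chunks.
    table = pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)
    if writer is None:
        schema = table.schema
        writer = pq.ParquetWriter("cat_stats.cat.parquet", schema, compression=None)
    writer.write_table(table)
writer.close()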