def get_pq_writer(self, prefix, s, mem):
    # Reuse a cached writer for this prefix; otherwise open a new one,
    # either in memory (BytesIO) or as a uniquely named file on disk.
    pw, fil = self.pq_writer_cache.get(prefix, (None, None))
    if pw is None:
        if mem:
            fil = BytesIO()
            pw = ParquetWriter(fil, compression=None)
            self.pq_writer_cache[prefix] = (pw, fil)
        else:
            outfile_id = guid() + ".parquet"
            full_path = ".".join([prefix, outfile_id])
            pw = ParquetWriter(full_path, compression=None)
            self.pq_writer_cache[prefix] = (pw, full_path)
    return pw
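# Hedged usage sketch (not from the original source): `append_chunk` and
# `close_writers` are hypothetical helpers showing how the cached writers
# above might be used. It assumes `self.pq_writer_cache` maps a key prefix
# to (ParquetWriter, buffer-or-path), as populated by get_pq_writer.
def append_chunk(self, prefix, gdf, mem=False):
    # Fetch (or create) the writer for this prefix and append one chunk.
    pw = self.get_pq_writer(prefix, None, mem=mem)
    pw.write_table(gdf)

def close_writers(self):
    # Finalize every cached parquet footer; in-memory buffers stay readable.
    for pw, _ in self.pq_writer_cache.values():
        pw.close()
    self.pq_writer_cache.clear()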
def _to_parquet(tfrecords, file, output_dir, chunks, convert_lists):
    out = []
    i = 0
    w = ParquetWriter(
        output_dir + file.split("/")[-1].split(".")[0] + ".parquet", index=False
    )
    for tfrecord in tqdm(tfrecords):
        row = {key: val.numpy() for key, val in tfrecord.items()}
        out.append(row)
        i += 1
        if i == chunks:
            # Flush a full chunk of rows to the parquet file.
            df = cudf.DataFrame(out)
            if convert_lists:
                df = _convert_lists(df)
            w.write_table(df)
            i = 0
            out = []
            del df
            gc.collect()
    if len(out) > 0:
        # Flush any remaining rows.
        df = cudf.DataFrame(out)
        if convert_lists:
            df = _convert_lists(df)
        w.write_table(df)
        del df
        gc.collect()
    w.close()
def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf):
    gdf_fname = tmpdir.join("gdf.parquet")
    test_path = "test/path"
    writer = ParquetWriter(gdf_fname)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    meta_byte_array = writer.close(metadata_file_path=test_path)
    fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata

    assert fmd.num_rows == 2 * len(simple_gdf)
    assert fmd.num_row_groups == 2
    for r in range(fmd.num_row_groups):
        for c in range(fmd.num_columns):
            assert fmd.row_group(r).column(c).file_path == test_path
def _convert_file(path, name, out_dir, gpu_mem_frac, fs, cols, dtypes):
    fn = f"{name}.parquet"
    out_path = fs.sep.join([out_dir, fn])
    writer = ParquetWriter(out_path, compression=None)
    for gdf in nvt.Dataset(
        path,
        engine="csv",
        names=cols,
        part_memory_fraction=gpu_mem_frac,
        sep="\t",
        dtypes=dtypes,
    ).to_iter():
        writer.write_table(gdf)
        del gdf
    md = writer.close(metadata_file_path=fn)
    return md
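# Hedged sketch (an assumption, not part of the original source): the bytes
# blob returned by writer.close(metadata_file_path=...) is thrift-serialized
# parquet metadata that pyarrow can parse, so the per-file results from
# _convert_file could be merged into a single "_metadata" file. The helper
# name `write_global_metadata` and its arguments are hypothetical.
def write_global_metadata(md_list, out_path):
    fmd = None
    for md in md_list:
        piece = pq.ParquetFile(BytesIO(md)).metadata
        if fmd is None:
            fmd = piece
        else:
            # append_row_groups mutates fmd in place
            fmd.append_row_groups(piece)
    # out_path would typically be "<out_dir>/_metadata"
    fmd.write_metadata_file(out_path)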
def test_parquet_writer_bytes_io(simple_gdf):
    output = BytesIO()
    writer = ParquetWriter(output)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    writer.close()
    assert_eq(cudf.read_parquet(output), cudf.concat([simple_gdf, simple_gdf]))
def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf):
    gdf_fname = tmpdir.join("gdf.parquet")
    writer = ParquetWriter(gdf_fname)
    writer.write_table(simple_gdf)
    writer.write_table(simple_gdf)
    writer.close()
    assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf]))
def __init__(self, out_dir, num_out_files=30, num_threads=4):
    self.queue = queue.Queue(num_threads)
    self.write_locks = [threading.Lock() for _ in range(num_out_files)]
    self.writer_files = [os.path.join(out_dir, f"{i}.parquet") for i in range(num_out_files)]
    self.writers = [ParquetWriter(f, compression=None) for f in self.writer_files]
    self.b_idxs = np.arange(num_out_files)
    self.num_threads = num_threads
    self.num_out_files = num_out_files
    # sentinel signifying end-of-data, telling a worker thread to shut down
    self._eod = object()

    for _ in range(num_threads):
        write_thread = threading.Thread(target=self._write_thread, daemon=True)
        write_thread.start()
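# Hedged sketch (an assumption, not the original implementation): the
# constructor above starts threads with target=self._write_thread, which is
# not shown here. Assuming the queue carries (file_index, dataframe) tuples,
# a worker loop consistent with the fields initialized above might look like:
def _write_thread(self):
    while True:
        item = self.queue.get()
        try:
            if item is self._eod:
                # end-of-data sentinel: stop this worker
                break
            idx, df = item
            # serialize writes to each output file across threads
            with self.write_locks[idx]:
                self.writers[idx].write_table(df)
        finally:
            self.queue.task_done()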
def _write_gb_stats(dfs, base_path, col_group, on_host, concat_groups, name_sep):
    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]
    if isinstance(col_group, str):
        col_group = [col_group]
    rel_path = "cat_stats.%s.parquet" % (_make_name(*col_group, sep=name_sep))
    path = os.path.join(base_path, rel_path)
    pwriter = None
    pa_schema = None
    if not on_host:
        pwriter = ParquetWriter(path, compression=None)

    # Loop over dfs and append to file
    # TODO: For high-cardinality columns, should support
    #       Dask-based to_parquet call here (but would need to
    #       support directory reading within dependent ops)
    n_writes = 0
    for df in dfs:
        if len(df):
            if on_host:
                # Use pyarrow
                pa_table = pa.Table.from_pandas(df, schema=pa_schema, preserve_index=False)
                if pwriter is None:
                    pa_schema = pa_table.schema
                    pwriter = pq.ParquetWriter(path, pa_schema, compression=None)
                pwriter.write_table(pa_table)
            else:
                # Use cuDF
                df.reset_index(drop=True, inplace=True)
                pwriter.write_table(df)
            n_writes += 1

    # No data to write
    if n_writes == 0:
        raise RuntimeError("GroupbyStatistics result is empty.")

    # Close writer and return path
    pwriter.close()
    return path