# NOTE: these imports mirror what the surrounding cuDF module is expected to
# provide; exact import paths may differ between cuDF versions.
from contextlib import ExitStack

from pyarrow import parquet as pq

from cudf._lib import parquet as libparquet
from cudf.api.types import is_list_like
from cudf.utils import ioutils


def _write_parquet(
    df,
    paths,
    compression="snappy",
    index=None,
    statistics="ROWGROUP",
    metadata_file_path=None,
    int96_timestamps=False,
    row_group_size_bytes=None,
    row_group_size_rows=None,
    partitions_info=None,
    **kwargs,
):
    # Multiple output paths must be accompanied by a matching list of
    # per-partition metadata, and vice versa.
    if is_list_like(paths) and len(paths) > 1:
        if partitions_info is None:
            raise ValueError("partition info is required for multiple paths")
        elif not is_list_like(partitions_info):
            raise ValueError(
                "partition info must be list-like for multiple paths"
            )
        elif not len(paths) == len(partitions_info):
            raise ValueError("partitions_info and paths must be of same size")
    if is_list_like(partitions_info) and len(partitions_info) > 1:
        if not is_list_like(paths):
            raise ValueError(
                "paths must be list-like when partitions_info provided"
            )

    paths_or_bufs = [
        ioutils.get_writer_filepath_or_buffer(path, mode="wb", **kwargs)
        for path in paths
    ]
    common_args = {
        "index": index,
        "compression": compression,
        "statistics": statistics,
        "metadata_file_path": metadata_file_path,
        "int96_timestamps": int96_timestamps,
        "row_group_size_bytes": row_group_size_bytes,
        "row_group_size_rows": row_group_size_rows,
        "partitions_info": partitions_info,
    }

    if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
        # All targets are fsspec open-file objects: enter them together so
        # every file is flushed and closed once the write completes.
        with ExitStack() as stack:
            fsspec_objs = [stack.enter_context(file) for file in paths_or_bufs]
            file_objs = [
                ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs
            ]
            write_parquet_res = libparquet.write_parquet(
                df, filepaths_or_buffers=file_objs, **common_args
            )
    else:
        write_parquet_res = libparquet.write_parquet(
            df, filepaths_or_buffers=paths_or_bufs, **common_args
        )

    return write_parquet_res
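# ---------------------------------------------------------------------------
# Illustrative sketch (not part of cuDF): `_demo_write_partitioned` is a
# hypothetical helper showing the contract `_write_parquet` enforces above --
# when several paths are given, `partitions_info` must be a list of the same
# length, assumed here to describe each partition as a (start_row, num_rows)
# slice of `df`. Normal callers should go through `to_parquet` below instead.
def _demo_write_partitioned(df, paths, offsets):
    """Write contiguous row slices of ``df`` (bounded by ``offsets``) to ``paths``."""
    # offsets are row boundaries, e.g. [0, 2, 5]; turn them into
    # (start, size) pairs, one per output path.
    partitions_info = [
        (start, end - start) for start, end in zip(offsets[:-1], offsets[1:])
    ]
    return _write_parquet(df, paths=paths, partitions_info=partitions_info)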
def to_parquet(
    df,
    path,
    engine="cudf",
    compression="snappy",
    index=None,
    partition_cols=None,
    statistics="ROWGROUP",
    metadata_file_path=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf":
        if partition_cols:
            write_to_dataset(
                df,
                root_path=path,
                partition_cols=partition_cols,
                preserve_index=index,
                **kwargs,
            )
            return

        # Ensure that no column dtype is 'category'
        for col in df.columns:
            if df[col].dtype.name == "category":
                raise ValueError(
                    "'category' column dtypes are currently not "
                    "supported by the gpu accelerated parquet writer"
                )

        path_or_buf = ioutils.get_writer_filepath_or_buffer(
            path, mode="wb", **kwargs
        )
        if ioutils.is_fsspec_open_file(path_or_buf):
            with path_or_buf as file_obj:
                file_obj = ioutils.get_IOBase_writer(file_obj)
                write_parquet_res = libparquet.write_parquet(
                    df,
                    path=file_obj,
                    index=index,
                    compression=compression,
                    statistics=statistics,
                    metadata_file_path=metadata_file_path,
                )
        else:
            write_parquet_res = libparquet.write_parquet(
                df,
                path=path_or_buf,
                index=index,
                compression=compression,
                statistics=statistics,
                metadata_file_path=metadata_file_path,
            )

        return write_parquet_res
    else:
        # If index is not specified, fall back to the default of True
        if index is None:
            index = True
        pa_table = df.to_arrow(preserve_index=index)
        return pq.write_to_dataset(
            pa_table,
            root_path=path,
            partition_cols=partition_cols,
            *args,
            **kwargs,
        )
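# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module). In practice this
# function is reached through cudf.DataFrame.to_parquet; the file names and
# data below are made up, and running it requires cuDF and a GPU.
if __name__ == "__main__":
    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "x"]})

    # Single-file write through the GPU-accelerated writer.
    to_parquet(df, "out.parquet", compression="snappy")

    # Hive-partitioned dataset: one directory per distinct value of "b",
    # dispatched to write_to_dataset (defined elsewhere in this module).
    to_parquet(df, "out_dataset", partition_cols=["b"])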