def _to_parquet(
    df: pd.DataFrame,
    schema: pa.Schema,
    index: bool,
    compression: Optional[str],
    compression_ext: str,
    cpus: int,
    dtype: Dict[str, str],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    max_rows_by_file: Optional[int] = 0,
) -> List[str]:
    """Write a DataFrame to one Parquet object, or to several chunked objects when
    max_rows_by_file is set, and return the list of written S3 paths."""
    # Exactly one of `path` (explicit object key) or `path_root` (dataset prefix) must be given.
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}{compression_ext}.parquet"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    _logger.debug("file_path: %s", file_path)
    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
    # Cast any columns with explicit Athena type overrides requested in `dtype`.
    for col_name, col_type in dtype.items():
        if col_name in table.column_names:
            col_index = table.column_names.index(col_name)
            pyarrow_dtype = _data_types.athena2pyarrow(col_type)
            field = pa.field(name=col_name, type=pyarrow_dtype)
            table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype))
            _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype)
    if max_rows_by_file is not None and max_rows_by_file > 0:
        # Split the table into multiple objects of at most `max_rows_by_file` rows each.
        paths: List[str] = _to_parquet_chunked(
            file_path=file_path,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            compression=compression,
            table=table,
            max_rows_by_file=max_rows_by_file,
            num_of_rows=df.shape[0],
            cpus=cpus,
        )
    else:
        # Write the whole table as a single Parquet object.
        with _new_writer(
            file_path=file_path,
            compression=compression,
            schema=table.schema,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            use_threads=use_threads,
        ) as writer:
            writer.write_table(table)
        paths = [file_path]
    return paths
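# Hypothetical usage sketch for _to_parquet above: writes a small DataFrame to a
# dataset prefix and lets the function generate the object key. The bucket/prefix,
# the "snappy"/".snappy" compression pair, and the helper name _example_write_dataset
# are assumptions for illustration only; AWS credentials and write access are required.
def _example_write_dataset() -> List[str]:
    import boto3
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    return _to_parquet(
        df=df,
        schema=pa.Schema.from_pandas(df, preserve_index=False),
        index=False,
        compression="snappy",
        compression_ext=".snappy",
        cpus=1,
        dtype={},  # no Athena-style column type overrides
        boto3_session=boto3.Session(),
        s3_additional_kwargs=None,
        use_threads=False,
        path_root="s3://my-bucket/my-prefix/",  # hypothetical prefix, must end with "/"
        max_rows_by_file=0,  # 0/None -> single object via _new_writer
    )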
def _to_parquet_file(
    df: pd.DataFrame,
    path: str,
    schema: pa.Schema,
    index: bool,
    compression: Optional[str],
    cpus: int,
    fs: s3fs.S3FileSystem,
    dtype: Dict[str, str],
) -> str:
    """Write a DataFrame as a single Parquet file through the given s3fs filesystem
    and return the written path."""
    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
    # Cast any columns with explicit Athena type overrides requested in `dtype`.
    for col_name, col_type in dtype.items():
        if col_name in table.column_names:
            col_index = table.column_names.index(col_name)
            pyarrow_dtype = _data_types.athena2pyarrow(col_type)
            field = pa.field(name=col_name, type=pyarrow_dtype)
            table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype))
            _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype)
    pyarrow.parquet.write_table(
        table=table,
        where=path,
        write_statistics=True,
        use_dictionary=True,
        filesystem=fs,
        coerce_timestamps="ms",
        compression=compression,
        flavor="spark",
    )
    return path
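# Hypothetical usage sketch for the s3fs-based _to_parquet_file above (it assumes this
# variant is the definition in scope; a later variant with the same name follows below).
# The bucket, key, and helper name are illustrative assumptions and require AWS
# credentials with write access.
def _example_write_single_file_s3fs() -> str:
    import pandas as pd
    import pyarrow as pa
    import s3fs

    df = pd.DataFrame({"id": [1, 2, 3]})
    return _to_parquet_file(
        df=df,
        path="s3://my-bucket/data/part-0.snappy.parquet",  # hypothetical target key
        schema=pa.Schema.from_pandas(df, preserve_index=False),
        index=False,
        compression="snappy",
        cpus=1,
        fs=s3fs.S3FileSystem(),  # uses the default AWS credential chain
        dtype={},
    )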
def _to_parquet_file(
    df: pd.DataFrame,
    schema: pa.Schema,
    index: bool,
    compression: Optional[str],
    compression_ext: str,
    cpus: int,
    dtype: Dict[str, str],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
) -> str:
    """Write a DataFrame as a single Parquet object on S3 using a boto3 session
    and return the written path."""
    # Exactly one of `path` (explicit object key) or `path_root` (dataset prefix) must be given.
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}{compression_ext}.parquet"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    _logger.debug("file_path: %s", file_path)
    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
    # Cast any columns with explicit Athena type overrides requested in `dtype`.
    for col_name, col_type in dtype.items():
        if col_name in table.column_names:
            col_index = table.column_names.index(col_name)
            pyarrow_dtype = _data_types.athena2pyarrow(col_type)
            field = pa.field(name=col_name, type=pyarrow_dtype)
            table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype))
            _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype)
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=33_554_432,  # 32 MiB (32 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with pyarrow.parquet.ParquetWriter(
        where=file_path,
        write_statistics=True,
        use_dictionary=True,
        filesystem=fs,
        coerce_timestamps="ms",
        compression=compression,
        flavor="spark",
        schema=table.schema,
    ) as writer:
        writer.write_table(table)
    return file_path
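# Hypothetical usage sketch for the boto3-session-based _to_parquet_file above: writes a
# single object to an explicit path and casts one column to an Athena type. The bucket,
# key, column override, and helper name are assumptions for illustration only.
def _example_write_single_file_session() -> str:
    import boto3
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"id": [1, 2, 3], "amount": [1.5, 2.0, 3.25]})
    return _to_parquet_file(
        df=df,
        schema=pa.Schema.from_pandas(df, preserve_index=False),
        index=False,
        compression="snappy",
        compression_ext=".snappy",
        cpus=1,
        dtype={"amount": "double"},  # Athena type resolved via _data_types.athena2pyarrow
        boto3_session=boto3.Session(),
        s3_additional_kwargs=None,
        path="s3://my-bucket/data/part-0.snappy.parquet",  # hypothetical target key
    )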