Example #1
def _to_parquet(
    df: pd.DataFrame,
    schema: pa.Schema,
    index: bool,
    compression: Optional[str],
    compression_ext: str,
    cpus: int,
    dtype: Dict[str, str],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    max_rows_by_file: Optional[int] = 0,
) -> List[str]:
    # Either derive a unique file name under path_root or use the explicit path.
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}{compression_ext}.parquet"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("Either path or path_root must be provided, but not both.")
    _logger.debug("file_path: %s", file_path)
    table: pa.Table = pyarrow.Table.from_pandas(df=df,
                                                schema=schema,
                                                nthreads=cpus,
                                                preserve_index=index,
                                                safe=True)
    # Cast any columns listed in dtype to their requested Athena types.
    for col_name, col_type in dtype.items():
        if col_name in table.column_names:
            col_index = table.column_names.index(col_name)
            pyarrow_dtype = _data_types.athena2pyarrow(col_type)
            field = pa.field(name=col_name, type=pyarrow_dtype)
            table = table.set_column(
                col_index, field,
                table.column(col_name).cast(pyarrow_dtype))
            _logger.debug("Casting column %s (%s) to %s (%s)", col_name,
                          col_index, col_type, pyarrow_dtype)
    # A positive row cap splits the table across multiple files.
    if max_rows_by_file is not None and max_rows_by_file > 0:
        paths: List[str] = _to_parquet_chunked(
            file_path=file_path,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            compression=compression,
            table=table,
            max_rows_by_file=max_rows_by_file,
            num_of_rows=df.shape[0],
            cpus=cpus,
        )
    else:
        # No row cap: write the whole table as a single object.
        with _new_writer(
                file_path=file_path,
                compression=compression,
                schema=table.schema,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads,
        ) as writer:
            writer.write_table(table)
        paths = [file_path]
    return paths
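
The dtype loop above is how these helpers override inferred Arrow types: each requested column is looked up by name, the Athena type string is converted to a pyarrow type, and `set_column` swaps in the cast column. A minimal self-contained sketch of that pattern, using a literal pyarrow type as a stand-in for the internal `_data_types.athena2pyarrow` helper:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"id": [1, 2, 3], "price": [1.0, 2.5, 3.9]})
table = pa.Table.from_pandas(df=df, preserve_index=False)

# Cast "id" from the inferred int64 down to int32, mirroring the loop above.
pyarrow_dtype = pa.int32()  # stand-in for _data_types.athena2pyarrow("int")
col_index = table.column_names.index("id")
field = pa.field(name="id", type=pyarrow_dtype)
table = table.set_column(col_index, field, table.column("id").cast(pyarrow_dtype))
print(table.schema)  # id: int32, price: double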
Example #2
def _to_parquet_file(
    df: pd.DataFrame,
    path: str,
    schema: pa.Schema,
    index: bool,
    compression: Optional[str],
    cpus: int,
    fs: s3fs.S3FileSystem,
    dtype: Dict[str, str],
) -> str:
    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
    for col_name, col_type in dtype.items():
        if col_name in table.column_names:
            col_index = table.column_names.index(col_name)
            pyarrow_dtype = _data_types.athena2pyarrow(col_type)
            field = pa.field(name=col_name, type=pyarrow_dtype)
            table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype))
            _logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype)
    pyarrow.parquet.write_table(
        table=table,
        where=path,
        write_statistics=True,
        use_dictionary=True,
        filesystem=fs,
        coerce_timestamps="ms",
        compression=compression,
        flavor="spark",
    )
    return path
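
For reference, the same `pyarrow.parquet.write_table` call also works against a local path when the s3fs `filesystem` argument is omitted; a minimal sketch with a placeholder output path:

import pandas as pd
import pyarrow
import pyarrow.parquet

df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
table = pyarrow.Table.from_pandas(df=df, preserve_index=False)
pyarrow.parquet.write_table(
    table=table,
    where="/tmp/example.snappy.parquet",  # placeholder local path
    write_statistics=True,
    use_dictionary=True,
    coerce_timestamps="ms",
    compression="snappy",
    flavor="spark",
)

Here `flavor="spark"` applies Spark compatibility adjustments to the schema, which is why it is paired with `coerce_timestamps="ms"`.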
Example #3
def _to_parquet_file(
    df: pd.DataFrame,
    schema: pa.Schema,
    index: bool,
    compression: Optional[str],
    compression_ext: str,
    cpus: int,
    dtype: Dict[str, str],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
) -> str:
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}{compression_ext}.parquet"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("Either path or path_root must be provided, but not both.")
    _logger.debug("file_path: %s", file_path)
    table: pa.Table = pyarrow.Table.from_pandas(df=df,
                                                schema=schema,
                                                nthreads=cpus,
                                                preserve_index=index,
                                                safe=True)
    for col_name, col_type in dtype.items():
        if col_name in table.column_names:
            col_index = table.column_names.index(col_name)
            pyarrow_dtype = _data_types.athena2pyarrow(col_type)
            field = pa.field(name=col_name, type=pyarrow_dtype)
            table = table.set_column(
                col_index, field,
                table.column(col_name).cast(pyarrow_dtype))
            _logger.debug("Casting column %s (%s) to %s (%s)", col_name,
                          col_index, col_type, pyarrow_dtype)
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=33_554_432,  # 32 MB (32 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with pyarrow.parquet.ParquetWriter(
            where=file_path,
            write_statistics=True,
            use_dictionary=True,
            filesystem=fs,
            coerce_timestamps="ms",
            compression=compression,
            flavor="spark",
            schema=table.schema,
    ) as writer:
        writer.write_table(table)
    return file_path
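
Unlike the one-shot `write_table` call in Example #2, `pyarrow.parquet.ParquetWriter` keeps the target file open so several tables can be appended to a single Parquet file, which is presumably what the internal `_new_writer` helper in Example #1 wraps. A minimal local sketch of that pattern with placeholder data and path:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet

schema = pa.schema([pa.field("id", pa.int64()), pa.field("value", pa.string())])
with pyarrow.parquet.ParquetWriter(
        where="/tmp/chunked.snappy.parquet",  # placeholder local path
        schema=schema,
        compression="snappy",
        flavor="spark",
) as writer:
    # Each write_table call appends a row group to the same file.
    for chunk in (
        pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}),
        pd.DataFrame({"id": [3, 4], "value": ["c", "d"]}),
    ):
        writer.write_table(pa.Table.from_pandas(chunk, schema=schema, preserve_index=False))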