def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> pd.DataFrame:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    with _utils.open_file(fs=fs, path=path, mode=mode, encoding=encoding, newline=newline) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root)
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pa.Table:
    boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        num_row_groups: int = pq_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, num_row_groups)
        return pq_file.read_row_group(i=row_group, columns=columns, use_threads=False, use_pandas_metadata=False)
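
# Illustrative sketch (not part of the original module): _read_parquet_row_group takes
# picklable boto3 *primitives* rather than a live session so that each call can be
# dispatched to a separate worker. A hypothetical fan-out with concurrent.futures could
# look like the function below; the name and the use of ThreadPoolExecutor are
# assumptions for illustration only.
def _example_read_row_groups_concurrently(
    path: str,
    num_row_groups: int,
    boto3_primitives: _utils.Boto3PrimitivesType,
) -> pa.Table:
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor() as executor:
        tables = list(
            executor.map(
                lambda i: _read_parquet_row_group(
                    row_group=i,
                    path=path,
                    columns=None,
                    categories=None,
                    boto3_primitives=boto3_primitives,
                    s3_additional_kwargs=None,
                ),
                range(num_row_groups),
            )
        )
    # Concatenate the per-row-group tables back into a single Arrow table.
    return pa.concat_tables(tables)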
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> Iterator[pd.DataFrame]:
    for path in paths:
        _logger.debug("path: %s", path)
        fs: s3fs.S3FileSystem = _utils.get_fs(
            s3fs_block_size=8_388_608,  # 8 MB (8 * 2**20)
            session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
        with _utils.open_file(fs=fs, path=path, mode=mode, encoding=encoding, newline=newline) as f:
            reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_kwargs)
            for df in reader:
                yield _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root)
def _to_text(
    file_format: str,
    df: pd.DataFrame,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    **pandas_kwargs,
) -> str:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}.{file_format}"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=33_554_432,  # 32 MB (32 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    encoding: Optional[str] = pandas_kwargs.get("encoding", None)
    newline: Optional[str] = pandas_kwargs.get("line_terminator", None)
    with _utils.open_file(fs=fs, path=file_path, mode="w", encoding=encoding, newline=newline) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        if file_format == "csv":
            df.to_csv(f, **pandas_kwargs)
        elif file_format == "json":
            df.to_json(f, **pandas_kwargs)
    return file_path
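
# Illustrative sketch (not part of the original module): writing a DataFrame to an
# explicit S3 key as CSV through _to_text. The function name, bucket/key, and the
# extra keyword arguments below are hypothetical; anything passed via **pandas_kwargs
# is forwarded to DataFrame.to_csv / DataFrame.to_json.
def _example_to_text_usage() -> str:
    df = pd.DataFrame({"col0": [1, 2], "col1": ["a", "b"]})
    return _to_text(
        file_format="csv",
        df=df,
        boto3_session=boto3.Session(),  # assumes credentials are configured
        s3_additional_kwargs=None,
        path="s3://my-bucket/data/file0.csv",  # hypothetical target key
        index=False,  # forwarded to DataFrame.to_csv
    )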
def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Dict[str, str]:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=4_194_304,  # 4 MB (4 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
        return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
def _count_row_groups(
    path: str,
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> int:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=4_194_304,  # 4 MB (4 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        return pq_file.num_row_groups
def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pa.Table:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=8_388_608,  # 8 MB (8 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with _utils.open_file(fs=fs, path=path, mode="rb") as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None
            )[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"At least 2 different schemas were detected:\n"
                        f" - {last_path} -> {last_schema}\n"
                        f" - {path} -> {schema}"
                    )
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(i=i, columns=columns, use_threads=use_threads, use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = pd.concat(objs=[next_slice, df], ignore_index=True, sort=False, copy=False)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    if next_slice is not None:
        yield next_slice
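
# Illustrative sketch (not part of the original module): when ``chunked`` is an integer,
# _read_parquet_chunked re-batches the per-row-group DataFrames into fixed-size chunks,
# carrying any remainder over in ``next_slice``. The same re-batching logic in isolation
# (function name is hypothetical):
def _example_rebatch(frames: Iterator[pd.DataFrame], chunked: int) -> Iterator[pd.DataFrame]:
    leftover: Optional[pd.DataFrame] = None
    for frame in frames:
        # Prepend whatever was left over from the previous frame.
        if leftover is not None:
            frame = pd.concat(objs=[leftover, frame], ignore_index=True, sort=False, copy=False)
        # Emit full chunks of exactly ``chunked`` rows.
        while len(frame.index) >= chunked:
            yield frame.iloc[:chunked]
            frame = frame.iloc[chunked:]
        leftover = None if frame.empty else frame
    # Flush the final partial chunk, if any.
    if leftover is not None:
        yield leftover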