def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert a PyArrow Table to a mutable pandas DataFrame.

    Partition columns are injected from ``path``/``path_root``; if the Arrow
    schema carries serialized pandas metadata, the index and timezone
    adjustments recorded there are re-applied as well.
    """
    # Pull the pandas metadata blob (if any) off the Arrow schema up front.
    pandas_meta: Dict[str, Any] = {}
    schema_meta = table.schema.metadata
    if schema_meta is not None and b"pandas" in schema_meta:
        pandas_meta = json.loads(schema_meta[b"pandas"])

    frame = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        strings_to_categorical=False,
        safe=safe,
        categories=categories,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    frame = _apply_partitions(df=frame, dataset=dataset, path=path, path_root=path_root)
    frame = _utils.ensure_df_is_mutable(df=frame)
    if pandas_meta:
        _logger.debug("metadata: %s", pandas_meta)
        frame = _apply_index(df=frame, metadata=pandas_meta)
        frame = _apply_timezone(df=frame, metadata=pandas_meta)
    return frame
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Materialize a PyArrow Table as a mutable pandas DataFrame with partition columns applied."""
    frame = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        categories=categories,
        safe=safe,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    partitioned = _apply_partitions(df=frame, dataset=dataset, path=path, path_root=path_root)
    # Copy-on-write safety: hand the caller a frame it can mutate freely.
    return _utils.ensure_df_is_mutable(df=partitioned)
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    """Stream each S3 text object as ``chunksize``-row DataFrames.

    Every chunk is yielded with its partition columns already attached.
    """
    for current_path in paths:
        _logger.debug("path: %s", current_path)
        mode, encoding, newline = _get_read_details(path=current_path, pandas_kwargs=pandas_kwargs)
        with open_s3_object(
            path=current_path,
            mode=mode,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            encoding=encoding,
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
        ) as handle:
            chunk_reader: pandas.io.parsers.TextFileReader = parser_func(
                handle, chunksize=chunksize, **pandas_kwargs
            )
            for chunk in chunk_reader:
                yield _apply_partitions(
                    df=chunk, dataset=dataset, path=current_path, path_root=path_root
                )
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> pd.DataFrame:
    """Parse a single S3 text object into a DataFrame and attach partition columns."""
    file_system: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    with _utils.open_file(fs=file_system, path=path, mode=mode, encoding=encoding, newline=newline) as handle:
        frame: pd.DataFrame = parser_func(handle, **pandas_kwargs)
    return _apply_partitions(df=frame, dataset=dataset, path=path, path_root=path_root)
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> Iterator[pd.DataFrame]:
    """Yield partition-augmented DataFrame chunks for every path in ``paths``."""
    for current_path in paths:
        _logger.debug("path: %s", current_path)
        file_system: s3fs.S3FileSystem = _utils.get_fs(
            s3fs_block_size=8_388_608,  # 8 MB (8 * 2**20)
            session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        mode, encoding, newline = _get_read_details(path=current_path, pandas_kwargs=pandas_kwargs)
        with _utils.open_file(
            fs=file_system, path=current_path, mode=mode, encoding=encoding, newline=newline
        ) as handle:
            chunk_reader: pandas.io.parsers.TextFileReader = parser_func(
                handle, chunksize=chunksize, **pandas_kwargs
            )
            for chunk in chunk_reader:
                yield _apply_partitions(
                    df=chunk, dataset=dataset, path=current_path, path_root=path_root
                )
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    """Download one S3 text object in a single shot, parse it, and attach partition columns."""
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    with open_s3_object(
        path=path,
        mode=mode,
        use_threads=use_threads,
        s3_block_size=-1,  # One shot download
        encoding=encoding,
        s3_additional_kwargs=s3_additional_kwargs,
        newline=newline,
        boto3_session=boto3_session,
    ) as handle:
        frame: pd.DataFrame = parser_func(handle, **pandas_kwargs)
    return _apply_partitions(df=frame, dataset=dataset, path=path, path_root=path_root)
def _read_text_file(
    path: str,
    version_id: Optional[str],
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, _utils.Boto3PrimitivesType],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: Union[bool, int],
) -> pd.DataFrame:
    """Download one (optionally versioned) S3 text object, parse it, and attach partition columns.

    Raises
    ------
    exceptions.NoFilesFound
        When S3 answers 404 for ``path``.
    """
    session = _utils.ensure_session(boto3_session)
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    try:
        with open_s3_object(
            path=path,
            version_id=version_id,
            mode=mode,
            use_threads=use_threads,
            s3_block_size=-1,  # One shot download
            encoding=encoding,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=session,
        ) as handle:
            frame: pd.DataFrame = parser_func(handle, **pandas_kwargs)
    except botocore.exceptions.ClientError as e:
        # Translate a missing object into the library's own exception type.
        if e.response["Error"]["Code"] == "404":
            raise exceptions.NoFilesFound(f"No files Found on: {path}.")
        raise e
    return _apply_partitions(df=frame, dataset=dataset, path=path, path_root=path_root)
) -> pd.DataFrame: mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs) with open_s3_object( path=path, mode=mode, use_threads=use_threads, s3_block_size=134_217_728, # 128 MB (128 * 2**20) encoding=encoding, s3_additional_kwargs=s3_additional_kwargs, newline=newline, boto3_session=boto3_session, ) as f: df: pd.DataFrame = parser_func(f, **pandas_kwargs) return _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root) def _read_text( parser_func: Callable[..., pd.DataFrame], path: Union[str, List[str]], path_suffix: Union[str, List[str], None], path_ignore_suffix: Union[str, List[str], None], use_threads: bool, last_modified_begin: Optional[datetime.datetime], last_modified_end: Optional[datetime.datetime], boto3_session: Optional[boto3.Session], s3_additional_kwargs: Optional[Dict[str, str]], chunksize: Optional[int], dataset: bool,