Example #1
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    metadata: Dict[str, Any] = {}
    if table.schema.metadata is not None and b"pandas" in table.schema.metadata:
        metadata = json.loads(table.schema.metadata[b"pandas"])
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            strings_to_categorical=False,
            safe=safe,
            categories=categories,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    df = _utils.ensure_df_is_mutable(df=df)
    if metadata:
        _logger.debug("metadata: %s", metadata)
        df = _apply_index(df=df, metadata=metadata)
        df = _apply_timezone(df=df, metadata=metadata)
    return df
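Example #1 restores the original pandas index and timezones from the b"pandas" entry that pyarrow stores in the table's schema metadata. A minimal standalone sketch of that metadata round-trip (plain pandas/pyarrow, not awswrangler code):

import json
import pandas as pd
import pyarrow as pa

pdf = pd.DataFrame({"value": [1, 2, 3]}, index=pd.Index(["a", "b", "c"], name="key"))
table = pa.Table.from_pandas(pdf)  # serializes pandas metadata into the schema
meta = json.loads(table.schema.metadata[b"pandas"])  # same key Example #1 reads
print(meta["index_columns"])  # ['key'] -- what _apply_index would restore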
Example #2
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            categories=categories,
            safe=safe,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    return _utils.ensure_df_is_mutable(df=df)
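This variant drops the metadata handling (no index or timezone restoration) but keeps the types_mapper hook. A standalone sketch of how such a mapper works, using a plain nullable-integer mapping as a stand-in for _data_types.pyarrow2pandas_extension:

import pandas as pd
import pyarrow as pa

def int64_to_nullable(pa_type: pa.DataType):
    # Return a pandas ExtensionDtype for the Arrow type, or None for defaults.
    return pd.Int64Dtype() if pa.types.is_int64(pa_type) else None

table = pa.table({"n": pa.array([1, None, 3], type=pa.int64())})
df = table.to_pandas(types_mapper=int64_to_nullable)
print(df["n"].dtype)  # Int64 (nullable) instead of float64 with NaN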
Example #3
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    for path in paths:
        _logger.debug("path: %s", path)
        mode, encoding, newline = _get_read_details(
            path=path, pandas_kwargs=pandas_kwargs)
        with open_s3_object(
                path=path,
                mode=mode,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                encoding=encoding,
                use_threads=use_threads,
                s3_additional_kwargs=s3_additional_kwargs,
                newline=newline,
                boto3_session=boto3_session,
        ) as f:
            reader: pandas.io.parsers.TextFileReader = parser_func(
                f, chunksize=chunksize, **pandas_kwargs)
            for df in reader:
                yield _apply_partitions(df=df,
                                        dataset=dataset,
                                        path=path,
                                        path_root=path_root)
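The key pattern here is passing chunksize to the pandas parser, which returns a TextFileReader (an iterator of DataFrames) instead of a single frame. A self-contained sketch of that behavior:

import io
import pandas as pd

buf = io.StringIO("a,b\n1,x\n2,y\n3,z\n")
reader = pd.read_csv(buf, chunksize=2)  # iterator of DataFrames, not a DataFrame
for chunk in reader:
    print(len(chunk))  # 2, then 1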
Example #4
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> pd.DataFrame:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=134_217_728,  # 128 MB (128 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    with _utils.open_file(fs=fs,
                          path=path,
                          mode=mode,
                          encoding=encoding,
                          newline=newline) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
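For comparison, roughly the same full-file read expressed against the public s3fs API (the bucket and key are placeholders; assumes AWS credentials are configured):

import pandas as pd
import s3fs

fs = s3fs.S3FileSystem(default_block_size=134_217_728)  # 128 MB read-ahead blocks
with fs.open("s3://example-bucket/data.csv", mode="r", encoding="utf-8") as f:
    df = pd.read_csv(f)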
Example #5
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
) -> Iterator[pd.DataFrame]:
    for path in paths:
        _logger.debug("path: %s", path)
        fs: s3fs.S3FileSystem = _utils.get_fs(
            s3fs_block_size=8_388_608,  # 8 MB (8 * 2**20)
            session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        mode, encoding, newline = _get_read_details(
            path=path, pandas_kwargs=pandas_kwargs)
        with _utils.open_file(fs=fs,
                              path=path,
                              mode=mode,
                              encoding=encoding,
                              newline=newline) as f:
            reader: pandas.io.parsers.TextFileReader = parser_func(
                f, chunksize=chunksize, **pandas_kwargs)
            for df in reader:
                yield _apply_partitions(df=df,
                                        dataset=dataset,
                                        path=path,
                                        path_root=path_root)
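Note the 8 MB block size here versus 128 MB in Example #4: when chunks are parsed incrementally there is little to gain from large read-ahead buffers. Callers can consume the generator without materializing every chunk, e.g.:

import pandas as pd

def chunked():  # stand-in for _read_text_chunked(...)
    yield pd.DataFrame({"a": [1, 2]})
    yield pd.DataFrame({"a": [3]})

total_rows = sum(len(df) for df in chunked())  # streams one chunk at a time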
Example #6
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    with open_s3_object(
            path=path,
            mode=mode,
            use_threads=use_threads,
            s3_block_size=-1,  # One shot download
            encoding=encoding,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
    ) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
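_get_read_details is not shown on this page; a plausible reconstruction, based only on how its results are used above (mode, encoding, and newline derived from pandas_kwargs), might look like the following. Treat the exact logic as an assumption, not the library's code; it also leans on pandas' internal infer_compression helper, which is an implementation detail:

from typing import Any, Dict, Optional, Tuple
from pandas.io.common import infer_compression

def get_read_details(path: str, pandas_kwargs: Dict[str, Any]) -> Tuple[str, Optional[str], Optional[str]]:
    # Hypothetical: binary mode for compressed objects, text mode otherwise.
    if pandas_kwargs.get("compression", "infer") == "infer":
        pandas_kwargs["compression"] = infer_compression(path, compression="infer")
    mode: str = "r" if pandas_kwargs.get("compression") is None else "rb"
    encoding: Optional[str] = pandas_kwargs.get("encoding", "utf-8")
    newline: Optional[str] = pandas_kwargs.get("lineterminator", None)
    return mode, encoding, newline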
Example #7
def _read_text_file(
    path: str,
    version_id: Optional[str],
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, _utils.Boto3PrimitivesType],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: Union[bool, int],
) -> pd.DataFrame:
    boto3_session = _utils.ensure_session(boto3_session)
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    try:
        with open_s3_object(
                path=path,
                version_id=version_id,
                mode=mode,
                use_threads=use_threads,
                s3_block_size=-1,  # One shot download
                encoding=encoding,
                s3_additional_kwargs=s3_additional_kwargs,
                newline=newline,
                boto3_session=boto3_session,
        ) as f:
            df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            raise exceptions.NoFilesFound(f"No files Found on: {path}.")
        raise e
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
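Compared with Example #6, this version adds version_id support and translates botocore's 404 ClientError into the library's NoFilesFound. The error-translation pattern in isolation (using the built-in FileNotFoundError as a stand-in for the library exception):

import botocore.exceptions

def translate_not_found(e: botocore.exceptions.ClientError, path: str) -> None:
    # botocore surfaces the HTTP status under response["Error"]["Code"].
    if e.response["Error"]["Code"] == "404":
        raise FileNotFoundError(f"No files found on: {path}.") from e
    raise e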
Example #8
# NOTE: this listing was truncated at the top of the function; the
# signature below is reconstructed from the parameters used in the body
# and matches Example #6 -- treat it as an editorial assumption.
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    with open_s3_object(
            path=path,
            mode=mode,
            use_threads=use_threads,
            s3_block_size=134_217_728,  # 128 MB (128 * 2**20)
            encoding=encoding,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
    ) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
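# Aside: three s3_block_size strategies appear across these examples:
#   -1           one-shot download of the whole object (Examples #6 and #7)
#   10_485_760   10 MB blocks while streaming chunks (Example #3)
#   134_217_728  128 MB blocks for a full read (this example)
# Smaller blocks suit incremental chunked parsing; -1 skips block
# bookkeeping when the parser consumes the file in one pass.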


def _read_text(
    parser_func: Callable[..., pd.DataFrame],
    path: Union[str, List[str]],
    path_suffix: Union[str, List[str], None],
    path_ignore_suffix: Union[str, List[str], None],
    use_threads: bool,
    last_modified_begin: Optional[datetime.datetime],
    last_modified_end: Optional[datetime.datetime],
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    chunksize: Optional[int],
    dataset: bool,