Example #1
0
def test_file_parser_glob():
    """Check the fields FileParser exposes for URIs ending in a glob pattern.

    For both a local path and an ``s3`` URI, the parsed ``uri`` is expected to
    be the parent folder, while ``path`` keeps the glob and ``glob_pattern``
    holds the trailing ``*.parquet`` segment.
    """
    # (raw input, expected uri, expected scheme, expected path)
    cases = [
        ("/a/b/*.parquet", "/a/b", "", "/a/b/*.parquet"),
        ("s3://a/b/*.parquet", "s3://a/b", "s3", "/b/*.parquet"),
    ]
    for raw, expected_uri, expected_scheme, expected_path in cases:
        parsed = FileParser(raw)
        assert expected_uri == parsed.uri
        assert expected_scheme == parsed.scheme
        assert expected_path == parsed.path
        # Suffix, format and glob pattern are the same for both inputs.
        assert ".parquet" == parsed.suffix
        assert "parquet" == parsed.file_format
        assert "*.parquet" == parsed.glob_pattern
Example #2
0
def save_df(
    df: DaskDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    """Write ``df`` to ``uri`` using the saver registered for its file format.

    :param df: the dataframe to persist
    :param uri: destination; it is parsed with ``FileParser`` and must not
        contain a glob pattern (``assert_no_glob`` is called on the parser)
    :param format_hint: forwarded to ``FileParser`` together with ``uri``
    :param mode: ``"overwrite"`` or ``"error"``; any other value is rejected
        through ``assert_or_throw`` with a ``NotImplementedError``
    :param fs: file system used for the existence check and removal; a new
        ``FileSystem()`` is created when omitted
    :param kwargs: forwarded unchanged to the format-specific saver

    If ``uri`` already exists and ``mode`` is not ``"overwrite"``, a
    ``FileExistsError`` is passed to ``assert_or_throw``.
    """
    assert_or_throw(
        mode in ("overwrite", "error"),
        lambda: NotImplementedError(f"{mode} is not supported"),
    )
    parser = FileParser(uri, format_hint).assert_no_glob()
    filesystem = FileSystem() if fs is None else fs
    if filesystem.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
        # Best-effort cleanup: try removing as a file first, then as a
        # directory tree; a failure of the second attempt is ignored.
        try:
            filesystem.remove(uri)
        except Exception:
            try:
                filesystem.removetree(uri)
            except Exception:  # pragma: no cover
                pass
    saver = _FORMAT_SAVE[parser.file_format]
    saver(df, parser, **kwargs)
Example #3
0
def load_df(
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> DaskDataFrame:
    """Load one or more URIs and concatenate them into a ``DaskDataFrame``.

    :param uri: a single URI or a list of URIs
    :param format_hint: forwarded to ``FileParser`` for every URI
    :param columns: forwarded to the format-specific loader
    :param fs: forwarded to ``_get_single_files`` together with the parsers
    :param kwargs: forwarded unchanged to the format-specific loader
    :return: a ``DaskDataFrame`` wrapping ``dd.concat`` of every loaded frame,
        with the schema returned by the last loader call
    """
    sources = [uri] if isinstance(uri, str) else uri
    parsers = [FileParser(source, format_hint) for source in sources]

    frames: List[dd.DataFrame] = []
    schema: Any = None
    for single in _get_single_files(parsers, fs):
        loader = _FORMAT_LOAD[single.file_format]
        # schema is overwritten on each iteration; the last one is kept.
        frame, schema = loader(single, columns, **kwargs)
        frames.append(frame)
    return DaskDataFrame(dd.concat(frames), schema)
Example #4
0
 def load_df(
     self,
     uri: Union[str, List[str]],
     format_hint: Optional[str] = None,
     columns: Any = None,
     **kwargs: Any,
 ) -> DataFrame:
     """Load one or more URIs that all share a single file format.

     :param uri: a single URI or a list of URIs
     :param format_hint: forwarded to ``FileParser`` for every URI
     :param columns: forwarded to the format-specific loader
     :param kwargs: forwarded unchanged to the format-specific loader
     :return: whatever the loader registered in ``self._loads`` for the
         detected format returns

     A ``NotImplementedError`` is passed to ``assert_or_throw`` unless the
     parsed URIs resolve to exactly one distinct format.
     """
     sources = [uri] if isinstance(uri, str) else uri
     parsers = [FileParser(source, format_hint) for source in sources]
     formats = {parser.file_format for parser in parsers}
     assert_or_throw(
         len(formats) == 1, NotImplementedError("can't support multiple formats")
     )
     fmt = list(formats)[0]
     loader = self._loads[fmt]
     # The loader receives the parsed URIs, not the raw input strings.
     return loader([parser.uri for parser in parsers], columns, **kwargs)
Example #5
0
def test_file_parser():
    """Check FileParser fields for plain (non-glob) URIs and invalid inputs.

    Covers a local path, every suffix in ``_FORMAT_MAP``, a multi-dot suffix,
    explicit format hints, and the combinations expected to raise
    ``NotImplementedError``.
    """

    def check(parsed, uri, scheme, path, suffix, file_format):
        # Shared assertions for the five fields verified in every case.
        assert uri == parsed.uri
        assert scheme == parsed.scheme
        assert path == parsed.path
        assert suffix == parsed.suffix
        assert file_format == parsed.file_format

    local = FileParser("/a/b/c.parquet")
    check(local, "/a/b/c.parquet", "", "/a/b/c.parquet", ".parquet", "parquet")
    assert "" == local.glob_pattern

    # Every registered suffix must map to its registered format.
    for suffix, file_format in _FORMAT_MAP.items():
        target = f"s3://a/b/c{suffix}"
        check(FileParser(target), target, "s3", f"/b/c{suffix}", suffix, file_format)

    check(
        FileParser("s3://a/b/c.test.parquet"),
        "s3://a/b/c.test.parquet",
        "s3",
        "/b/c.test.parquet",
        ".test.parquet",
        "parquet",
    )

    # With an explicit hint, the format comes from the hint, not the suffix.
    check(
        FileParser("s3://a/b/c.ppp.gz", "csv"),
        "s3://a/b/c.ppp.gz",
        "s3",
        "/b/c.ppp.gz",
        ".ppp.gz",
        "csv",
    )
    check(FileParser("s3://a/b/c", "csv"), "s3://a/b/c", "s3", "/b/c", "", "csv")

    # Unknown suffix, unknown hint, and no suffix without a hint all fail.
    invalid_args = [
        ("s3://a/b/c.ppp",),
        ("s3://a/b/c.parquet", "csvv"),
        ("s3://a/b/c",),
    ]
    for args in invalid_args:
        raises(NotImplementedError, lambda: FileParser(*args))
Example #6
0
 def save_df(
     self,
     df: SparkDataFrame,
     uri: str,
     format_hint: Optional[str] = None,
     partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
     mode: str = "overwrite",
     force_single: bool = False,
     **kwargs: Any,
 ) -> None:
     """Save ``df`` to ``uri``, through a writer or as a local dataframe.

     :param df: the dataframe to persist
     :param uri: destination URI
     :param format_hint: forwarded to ``FileParser`` (writer path) or to the
         module-level ``save_df`` (``force_single`` path)
     :param partition_spec: passed to ``self._get_writer`` on the writer
         path; not used when ``force_single`` is true
     :param mode: save mode, forwarded to the writer or to ``save_df``
     :param force_single: when true, convert ``df`` with ``as_local()`` and
         delegate to the module-level ``save_df`` using ``self._fs``
     :param kwargs: passed as writer options or forwarded to ``save_df``
     """
     if force_single:
         local_df = df.as_local()
         save_df(
             local_df,
             uri,
             format_hint=format_hint,
             mode=mode,
             fs=self._fs,
             **kwargs,
         )
         return

     parser = FileParser(uri, format_hint)
     writer = self._get_writer(df.native, partition_spec)
     # The configured result is discarded and ``save`` is called on the
     # original writer object, so this relies on the calls configuring it
     # in place.
     with_format = writer.format(parser.file_format)
     with_options = with_format.options(**kwargs)
     with_options.mode(mode)
     writer.save(uri)