def test_file_parser_glob():
    f = FileParser("/a/b/*.parquet")
    assert "/a/b" == f.uri
    assert "" == f.scheme
    assert "/a/b/*.parquet" == f.path
    assert ".parquet" == f.suffix
    assert "parquet" == f.file_format
    assert "*.parquet" == f.glob_pattern

    f = FileParser("s3://a/b/*.parquet")
    assert "s3://a/b" == f.uri
    assert "s3" == f.scheme
    assert "/b/*.parquet" == f.path
    assert ".parquet" == f.suffix
    assert "parquet" == f.file_format
    assert "*.parquet" == f.glob_pattern
def save_df(
    df: DaskDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    # only "overwrite" and "error" modes are supported
    assert_or_throw(
        mode in ["overwrite", "error"],
        lambda: NotImplementedError(f"{mode} is not supported"),
    )
    # saving to a glob pattern is not allowed
    p = FileParser(uri, format_hint).assert_no_glob()
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
        # in overwrite mode, remove the existing file or directory first
        try:
            fs.remove(uri)
        except Exception:
            try:
                fs.removetree(uri)
            except Exception:  # pragma: no cover
                pass
    # dispatch to the format-specific writer
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
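# Hedged usage sketch (not part of the source): how the module-level save_df
# above might be called once a DaskDataFrame `ddf` exists. The /tmp paths are
# hypothetical; the format comes from the suffix unless format_hint is given.
def _save_df_usage_sketch(ddf: DaskDataFrame) -> None:
    save_df(ddf, "/tmp/out.parquet")                # format inferred from ".parquet", overwrite by default
    save_df(ddf, "/tmp/out", format_hint="csv")     # no suffix, so the hint decides the format
    save_df(ddf, "/tmp/out.parquet", mode="error")  # with mode="error", an existing target raises FileExistsError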
def load_df(
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> DaskDataFrame:
    # accept a single URI or a list of URIs
    if isinstance(uri, str):
        fp = [FileParser(uri, format_hint)]
    else:
        fp = [FileParser(u, format_hint) for u in uri]
    dfs: List[dd.DataFrame] = []
    schema: Any = None
    # expand the parsed URIs into individual files and load each one
    for f in _get_single_files(fp, fs):
        df, schema = _FORMAT_LOAD[f.file_format](f, columns, **kwargs)
        dfs.append(df)
    # concatenate all loaded parts into one DaskDataFrame
    return DaskDataFrame(dd.concat(dfs), schema)
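# Hedged usage sketch (not part of the source): load_df above accepts a single
# URI or a list of URIs and concatenates what it loads. The paths and the
# column list are hypothetical.
def _load_df_usage_sketch() -> DaskDataFrame:
    single = load_df("/tmp/out.parquet")                     # format inferred from the suffix
    subset = load_df("/tmp/out.parquet", columns=["a", "b"])  # load only selected columns
    merged = load_df(["/tmp/p1.csv", "/tmp/p2.csv"], format_hint="csv")
    return merged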
def load_df(
    self,
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    **kwargs: Any,
) -> DataFrame:
    if isinstance(uri, str):
        fp = [FileParser(uri, format_hint)]
    else:
        fp = [FileParser(u, format_hint) for u in uri]
    # all inputs must resolve to a single file format
    fmts = list(set(f.file_format for f in fp))  # noqa: C401
    assert_or_throw(
        len(fmts) == 1, NotImplementedError("can't support multiple formats")
    )
    fmt = fmts[0]
    files = [f.uri for f in fp]
    # dispatch to the format-specific loader
    return self._loads[fmt](files, columns, **kwargs)
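# Hedged usage sketch (not part of the source): the load_df method above lives
# on an engine IO helper class; `io` below stands for an instance of that class
# and the bucket URIs are hypothetical. All inputs must share one format,
# otherwise NotImplementedError is raised.
def _engine_load_usage_sketch(io: Any) -> DataFrame:
    return io.load_df(
        ["s3://bucket/p1.parquet", "s3://bucket/p2.parquet"], columns=["a", "b"]
    )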
def test_file_parser():
    f = FileParser("/a/b/c.parquet")
    assert "/a/b/c.parquet" == f.uri
    assert "" == f.scheme
    assert "/a/b/c.parquet" == f.path
    assert ".parquet" == f.suffix
    assert "parquet" == f.file_format
    assert "" == f.glob_pattern

    for k, v in _FORMAT_MAP.items():
        f = FileParser(f"s3://a/b/c{k}")
        assert f"s3://a/b/c{k}" == f.uri
        assert "s3" == f.scheme
        assert f"/b/c{k}" == f.path
        assert k == f.suffix
        assert v == f.file_format

    f = FileParser("s3://a/b/c.test.parquet")
    assert "s3://a/b/c.test.parquet" == f.uri
    assert "s3" == f.scheme
    assert "/b/c.test.parquet" == f.path
    assert ".test.parquet" == f.suffix
    assert "parquet" == f.file_format

    f = FileParser("s3://a/b/c.ppp.gz", "csv")
    assert "s3://a/b/c.ppp.gz" == f.uri
    assert "s3" == f.scheme
    assert "/b/c.ppp.gz" == f.path
    assert ".ppp.gz" == f.suffix
    assert "csv" == f.file_format

    f = FileParser("s3://a/b/c", "csv")
    assert "s3://a/b/c" == f.uri
    assert "s3" == f.scheme
    assert "/b/c" == f.path
    assert "" == f.suffix
    assert "csv" == f.file_format

    raises(NotImplementedError, lambda: FileParser("s3://a/b/c.ppp"))
    raises(NotImplementedError, lambda: FileParser("s3://a/b/c.parquet", "csvv"))
    raises(NotImplementedError, lambda: FileParser("s3://a/b/c"))
def save_df(
    self,
    df: SparkDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    mode: str = "overwrite",
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    if not force_single:
        # distributed write through the engine's writer
        p = FileParser(uri, format_hint)
        writer = self._get_writer(df.native, partition_spec)
        writer.format(p.file_format).options(**kwargs).mode(mode)
        writer.save(uri)
    else:
        # collect to a local dataframe and write it as a single file
        ldf = df.as_local()
        save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
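# Hedged usage sketch (not part of the source): `io` stands for an instance of
# the class defining save_df above and `sdf` for a SparkDataFrame; the bucket
# URIs are hypothetical. With force_single=True the frame is collected locally
# and delegated to a module-level save_df, producing a single output file.
def _engine_save_usage_sketch(io: Any, sdf: SparkDataFrame) -> None:
    io.save_df(sdf, "s3://bucket/out.parquet")  # distributed write via the engine's writer
    io.save_df(sdf, "s3://bucket/out.csv", format_hint="csv", force_single=True)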