def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path, columns=["a", "b", "c"], header=False, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError, lambda: load_df(path, columns="b:str,x:double", header=True))
    raises(
        NotImplementedError,
        lambda: load_df(path, columns="b:str,x:double", header=2),
    )
def _safe_load_json(path: str, **kwargs: Any) -> pd.DataFrame:
    kw = {"orient": "records", "lines": True, **kwargs}
    try:
        return pd.read_json(path, **kw)
    except (IsADirectoryError, PermissionError):
        fs = FileSystem()
        return pd.concat(
            [
                pd.read_json(pfs.path.join(path, os.path.basename(x.path)), **kw)
                for x in fs.opendir(path).glob("*.json")
            ]
        )
def __init__(self, conf: Any = None):
    p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
    p.update(ParamDict(conf))
    super().__init__(p)
    self._fs = FileSystem()
    self._log = logging.getLogger()
    self._default_sql_engine = QPDDaskEngine(self)
def test_load_csv_folder(self):
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
    b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.csv"), format_hint="csv", header=True)
    native.save_df(b, os.path.join(path, "b.csv"), format_hint="csv", header=True)
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(
        path,
        format_hint="csv",
        header=True,
        infer_schema=True,
        columns=["a", "c"],
    )
    df_eq(c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]], "a:double,c:double", throw=True)
def save_df(
    df: DaskDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    assert_or_throw(
        mode in ["overwrite", "error"],
        lambda: NotImplementedError(f"{mode} is not supported"),
    )
    p = FileParser(uri, format_hint).assert_no_glob()
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
        try:
            fs.remove(uri)
        except Exception:
            try:
                fs.removetree(uri)
            except Exception:  # pragma: no cover
                pass
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
def test_csv_io(tmpdir, spark_session):
    fs = FileSystem()
    si = SparkIO(spark_session, fs)
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    si.save_df(df1, path)
    raises(InvalidOperationError, lambda: si.load_df(path, header=False))
    actual = si.load_df(path, columns=["a", "b", "c"], header=False)
    assert [["1", "2", "3"]] == actual.as_array()
    assert actual.schema == "a:str,b:str,c:str"
    actual = si.load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    si.save_df(df1, path, header=True)
    actual = si.load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = si.load_df(path, columns=["b", "a"], header=True)
    assert [["2", "1"]] == actual.as_array()
    actual = si.load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(Exception, lambda: si.load_df(path, columns="b:str,x:double", header=True))
    raises(
        NotImplementedError,
        lambda: si.load_df(path, columns="b:str,x:double", header=2),
    )
def __init__(self, conf: Any = None):
    p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
    p.update(ParamDict(conf))
    super().__init__(p)
    self._fs = FileSystem()
    self._log = logging.getLogger()
    self._native = NativeExecutionEngine(conf=conf)
def test_load_parquet_folder(self):
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6, 1]], "c:int,a:long")
    b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.parquet"))
    native.save_df(b, os.path.join(path, "b.parquet"))
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
def test_load_avro_folder(self):
    # TODO: switch to c:int,a:long when we can preserve schema to avro
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6, 1]], "c:long,a:long")
    b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.avro"))
    native.save_df(b, os.path.join(path, "b.avro"))
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(path, format_hint="avro", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
def _compute(
    self,
    df: Iterable[Dict[str, Any]],
    entrypoint: Callable[[str, Dict[str, Any]], Any],
) -> Iterable[Dict[str, Any]]:
    ck_fs = FileSystem().makedirs(self._checkpoint_path, recreate=True)
    for row in df:
        for trial in get_trials_from_row(row):
            rjudge = RemoteTrialJudge(entrypoint)
            self._objective.copy().run(trial, rjudge, ck_fs)
            if rjudge.report is not None:
                yield rjudge.report.fill_dict(dict(row))
def test_json(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
def test_json_io(tmpdir, spark_session):
    fs = FileSystem()
    si = SparkIO(spark_session, fs)
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    si.save_df(df1, path)
    actual = si.load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = si.load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
    from fastavro import reader

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]
    fs = FileSystem()
    with fs.openbin(path) as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records in memory
        if process_record:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)
    # Populate pandas.DataFrame with records
    return pd.DataFrame.from_records(records)
def _get_single_files(
    fp: Iterable[FileParser], fs: Optional[FileSystem]
) -> Iterable[FileParser]:
    if fs is None:
        fs = FileSystem()
    for f in fp:
        if f.glob_pattern != "":
            files = [
                FileParser(pfs.path.join(f.uri, os.path.basename(x.path)))
                for x in fs.opendir(f.uri).glob(f.glob_pattern)
            ]
            yield from _get_single_files(files, fs)
        else:
            yield f
def test_save_with_partition(tmpdir, spark_session):
    si = SparkIO(spark_session, FileSystem())
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.parquet")
    si.save_df(df1, path, partition_spec=PartitionSpec(num=2))
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    si.save_df(df1, path, partition_spec=PartitionSpec(by=["a"]))
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    si.save_df(df1, path, partition_spec=PartitionSpec(by=["a"], num=2))
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
def _load_avro(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    path = p.uri
    try:
        pdf = _load_single_avro(path, **kwargs)
    except (IsADirectoryError, PermissionError, FileExpected):
        fs = FileSystem()
        pdf = pd.concat(
            [
                _load_single_avro(
                    pfs.path.join(path, os.path.basename(x.path)), **kwargs
                )
                for x in fs.opendir(path).glob("*.avro")
            ]
        )
    if columns is None:
        return pdf, None
    if isinstance(columns, list):  # column names
        return pdf[columns], None
    schema = Schema(columns)
    # Return created DataFrame
    return pdf[schema.names], schema
def save_df(
    df: LocalDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    assert_or_throw(
        mode in ["overwrite", "error"],
        NotImplementedError(f"{mode} is not supported"),
    )
    p = FileParser(uri, format_hint)
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
def test_parquet_io(tmpdir, spark_session):
    si = SparkIO(spark_session, FileSystem())
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = _df([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = _df([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        si.save_df(df, path)
        actual = si.load_df(path)
        df_eq(df, actual, throw=True)

    si.save_df(df1, path)
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = si.load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    si.save_df(df1, f1, force_single=True)
    si.save_df(df1, f2, force_single=True)
    assert fs.isfile(f1)
    actual = si.load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = si.load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    actual = si.load_df([f1, f2], "parquet", columns="b:str,a:str")
    df_eq(actual, [["2", "1"], ["2", "1"]], "b:str,a:str")

    # overwrite = False
    raises(
        (FileExistsError, AnalysisException),
        lambda: si.save_df(df1, f1, mode="error"),
    )
    # wrong mode
    raises(Exception, lambda: si.save_df(df1, f1, mode="dummy"))
def validate_iterative_objective(
    func: IterativeObjectiveFunc,
    trial: Trial,
    budgets: List[float],
    validator: Callable[[List[TrialReport]], None],
    continuous: bool = False,
    checkpoint_path: str = "",
    monitor: Optional[Monitor] = None,
) -> None:
    path = checkpoint_path if checkpoint_path != "" else tempfile.gettempdir()
    basefs = FileSystem().makedirs(os.path.join(path, str(uuid4())), recreate=True)
    j = _Validator(monitor, budgets, continuous=continuous)
    if continuous:
        f = pickle.loads(pickle.dumps(func)).copy()
        f.run(trial, j, checkpoint_basedir_fs=basefs)
    else:
        for _ in budgets:
            f = pickle.loads(pickle.dumps(func)).copy()
            f.run(trial, j, checkpoint_basedir_fs=basefs)
    validator(j.reports)
def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = None):
    if spark_session is None:
        spark_session = SparkSession.builder.getOrCreate()
    self._spark_session = spark_session
    cf = dict(FUGUE_SPARK_DEFAULT_CONF)
    cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()})
    cf.update(ParamDict(conf))
    super().__init__(cf)
    self._fs = FileSystem()
    self._log = logging.getLogger()
    self._default_sql_engine = SparkSQLEngine(self)
    self._broadcast_func = RunOnce(
        self._broadcast, lambda *args, **kwargs: id(args[0])
    )
    self._persist_func = RunOnce(
        self._persist, lambda *args, **kwargs: id(args[0])
    )
    self._register_func = RunOnce(
        self._register, lambda *args, **kwargs: id(args[0])
    )
    self._io = SparkIO(self.spark_session, self.fs)
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
def load_dir() -> pd.DataFrame:
    fs = FileSystem()
    return pd.concat(
        [
            pd.read_csv(pfs.path.join(path, os.path.basename(x.path)), **kwargs)
            for x in fs.opendir(path).glob("*.csv")
        ]
    )
def __init__(self, conf: Any = None):
    super().__init__(conf)
    self._fs = FileSystem()
    self._log = logging.getLogger()
def __init__(self, conf: Any = None):
    super().__init__(conf)
    self._fs = FileSystem()
    self._log = logging.getLogger()
    self._default_sql_engine = SqliteEngine(self)