def _test_nested():
    # TODO: nested type doesn't work in dask
    # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    # df = DaskDataFrame(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = DaskDataFrame(data, "a:[{a:str,b:[int]}]")
    a = df.as_array(type_safe=True)
    assert [[[dict(a=None, b=[30, 40])]]] == a
def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> DaskDataFrame:
    """Convert a data structure to :class:`~fugue_dask.dataframe.DaskDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`dask:dask.dataframe.DataFrame`, pandas DataFrame or
      list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already a :class:`~fugue_dask.dataframe.DaskDataFrame`,
      it returns itself
    * for a list or iterable of arrays, ``schema`` must be specified
    * when ``schema`` is not None, a type cast may happen to ensure
      the dataframe matches that schema
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    default_partitions = self.conf.get_or_throw(
        FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, int
    )
    if isinstance(df, DataFrame):
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, DaskDataFrame):
            return df
        if isinstance(df, PandasDataFrame):
            return DaskDataFrame(
                df.native, df.schema, df.metadata, num_partitions=default_partitions
            )
        return DaskDataFrame(
            df.as_array(type_safe=True),
            df.schema,
            df.metadata,
            num_partitions=default_partitions,
        )
    return DaskDataFrame(df, schema, metadata, num_partitions=default_partitions)
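# A minimal usage sketch of to_df (an illustration, assuming DaskExecutionEngine
# and DaskDataFrame are exposed by the fugue_dask package; adjust the imports if
# the package layout differs):
def _example_to_df():
    from fugue_dask import DaskDataFrame, DaskExecutionEngine

    engine = DaskExecutionEngine()
    # a list of arrays requires an explicit schema
    df = engine.to_df([["a", 1], ["b", 2]], "x:str,y:int")
    assert isinstance(df, DaskDataFrame)
    # an existing DaskDataFrame passes through unchanged (no copy)
    assert engine.to_df(df) is df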
def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    # assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path, columns=["a", "b", "c"], header=False, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    # assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError, lambda: load_df(path, columns="b:str,x:double", header=True))
    raises(
        NotImplementedError,
        lambda: load_df(path, columns="b:str,x:double", header=2),
    )
def distinct(
    self,
    df: DataFrame,
    metadata: Any = None,
) -> DataFrame:
    d = self.pl_utils.drop_duplicates(self.to_df(df).native)
    return DaskDataFrame(d, df.schema, metadata)
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    dask_dfs = {
        k: self.execution_engine.to_df(v).native  # type: ignore
        for k, v in dfs.items()
    }
    df = run_sql_on_dask(statement, dask_dfs)
    return DaskDataFrame(df)
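# A minimal sketch of select, assuming the method above belongs to the Dask SQL
# engine and that it is reachable through the execution engine's
# default_sql_engine attribute (from fugue's ExecutionEngine interface):
def _example_select():
    from fugue import DataFrames
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    a = engine.to_df([["x", 1], ["y", 2]], "k:str,v:int")
    res = engine.default_sql_engine.select(
        DataFrames(a=a), "SELECT * FROM a WHERE v > 1"
    )
    # res.as_array() -> [["y", 2]]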
def fillna(
    self,
    df: DataFrame,
    value: Any,
    subset: List[str] = None,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        (not isinstance(value, list)) and (value is not None),
        ValueError("fillna value can not be a list or None"),
    )
    if isinstance(value, dict):
        assert_or_throw(
            (None not in value.values()) and (any(value.values())),
            ValueError(
                "fillna dict can not contain None and needs at least one value"
            ),
        )
        mapping = value
    else:
        # if subset is None, apply the fill to all columns
        subset = subset or df.schema.names
        mapping = {col: value for col in subset}
    d = self.to_df(df).native.fillna(mapping)
    return DaskDataFrame(d, df.schema, metadata)
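# A minimal sketch of fillna, assuming a DaskExecutionEngine instance: a dict
# value fills per column, while a scalar plus subset limits the fill to the
# listed columns.
def _example_fillna():
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    df = engine.to_df([["a", None], [None, 1.0]], "x:str,y:double")
    per_column = engine.fillna(df, value={"x": "?", "y": 0.0})
    # per_column.as_array() -> [["a", 0.0], ["?", 1.0]]
    x_only = engine.fillna(df, value="?", subset=["x"])
    # x_only.as_array() -> [["a", nan], ["?", 1.0]]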
def dropna(
    self,
    df: DataFrame,
    metadata: Any = None,
    how: str = "any",
    thresh: int = None,
    subset: List[str] = None,
) -> DataFrame:
    d = self.to_df(df).native.dropna(how=how, thresh=thresh, subset=subset)
    return DaskDataFrame(d, df.schema, metadata)
def test_json(tmpdir):
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    d = self.to_df(df).native
    meta = [(d[x].name, d[x].dtype) for x in d.columns]

    if presort:
        presort = parse_presort_exp(presort)
    # use presort over partition_spec.presort if possible
    _presort: IndexedOrderedDict = presort or partition_spec.presort

    def _partition_take(partition, n, presort):
        if len(presort.keys()) > 0:
            partition = partition.sort_values(
                list(presort.keys()),
                ascending=list(presort.values()),
                na_position=na_position,
            )
        return partition.head(n)

    if len(partition_spec.partition_by) == 0:
        if len(_presort.keys()) == 0:
            d = d.head(n)
        else:
            # use the default partitioning
            d = (
                d.map_partitions(_partition_take, n, _presort, meta=meta)
                .reset_index(drop=True)
                .compute()
            )
            # compute() brings the data to pandas, so pandas operations
            # can be used from here on
            d = d.sort_values(
                list(_presort.keys()),
                ascending=list(_presort.values()),
                na_position=na_position,
            ).head(n)
    else:
        d = (
            d.groupby(partition_spec.partition_by, dropna=False)
            .apply(_partition_take, n=n, presort=_presort, meta=meta)
            .reset_index(drop=True)
        )

    return DaskDataFrame(d, df.schema, metadata)
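# A minimal sketch of take, assuming a DaskExecutionEngine instance; presort
# strings use fugue's "col [asc|desc]" syntax handled by parse_presort_exp.
def _example_take():
    from fugue import PartitionSpec
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    df = engine.to_df([["a", 3], ["a", 1], ["b", 2]], "k:str,v:int")
    top = engine.take(df, n=1, presort="v desc")
    # top.as_array() -> [["a", 3]]
    per_key = engine.take(df, n=1, presort="v", partition_spec=PartitionSpec(by=["k"]))
    # per_key.as_array() -> one row per key: [["a", 1], ["b", 2]]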
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = DaskDataFrame(data, s)
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
def test_simple_methods():
    df = DaskDataFrame([], "a:str,b:int")
    assert df.empty
    assert 0 == df.count()
    assert not df.is_local

    df = DaskDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert not df.empty
    assert 2 == df.count()
    assert ["a", 1.0] == df.peek_array()
    assert dict(x="a", y=1.0) == df.peek_dict()

    df_eq(
        PandasDataFrame(df.as_pandas()),
        [["a", 1.0], ["b", 2.0]],
        "x:str,y:double",
        throw=True,
    )
def union(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        df1.schema == df2.schema, ValueError(f"{df1.schema} != {df2.schema}")
    )
    d = self.pl_utils.union(
        self.to_df(df1).native, self.to_df(df2).native, unique=distinct
    )
    return DaskDataFrame(d, df1.schema, metadata)
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _map(pdf: Any) -> pd.DataFrame:
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    if len(partition_spec.partition_by) == 0:
        pdf = self.repartition(df, partition_spec)
        result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
    else:
        df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
        result = self.pl_utils.safe_groupby_apply(
            df.native,
            partition_spec.partition_by,
            _map,
            meta=output_schema.pandas_dtype,
        )
    return DaskDataFrame(result, output_schema, metadata)
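# A minimal sketch of map, assuming a DaskExecutionEngine instance: the callback
# receives each partition as a LocalDataFrame plus a cursor holding the
# partition keys, and must return a dataframe matching the declared output
# schema.
def _example_map():
    from fugue import ArrayDataFrame, PartitionSpec
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    df = engine.to_df([["a", 1], ["a", 2], ["b", 3]], "k:str,v:int")

    def count_rows(cursor, pdf):
        # one output row per partition: the key value and the row count
        return ArrayDataFrame(
            [[cursor.key_value_array[0], pdf.count()]], "k:str,n:long"
        )

    res = engine.map(df, count_rows, "k:str,n:long", PartitionSpec(by=["k"]))
    # res.as_array() -> [["a", 2], ["b", 1]] (row order not guaranteed)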
def test_nan_none():
    # TODO: on dask, these tests can't pass
    # df = ArrayDataFrame([[None, None]], "b:str,c:double")
    # assert df.as_pandas().iloc[0, 0] is None
    # arr = DaskDataFrame(df.as_pandas(), df.schema).as_array()[0]
    # assert arr[0] is None
    # assert math.isnan(arr[1])

    # df = ArrayDataFrame([[None, None]], "b:int,c:bool")
    # arr = DaskDataFrame(df.as_pandas(), df.schema).as_array(type_safe=True)[0]
    # assert np.isnan(arr[0])  # TODO: this will cause inconsistent behavior across engines
    # assert np.isnan(arr[1])  # TODO: this will cause inconsistent behavior across engines

    df = ArrayDataFrame([["a", 1.1], [None, None]], "b:str,c:double")
    arr = DaskDataFrame(df.as_pandas(), df.schema).as_array()[1]
    assert arr[0] is None
    assert math.isnan(arr[1])

    arr = DaskDataFrame(df.as_array(), df.schema).as_array()[1]
    assert arr[0] is None
    assert math.isnan(arr[1])

    arr = DaskDataFrame(df.as_pandas()["b"], "b:str").as_array()[1]
    assert arr[0] is None
def intersect(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        distinct, NotImplementedError("INTERSECT ALL for DaskExecutionEngine")
    )
    assert_or_throw(
        df1.schema == df2.schema, ValueError(f"{df1.schema} != {df2.schema}")
    )
    d = self.pl_utils.intersect(
        self.to_df(df1).native, self.to_df(df2).native, unique=distinct
    )
    return DaskDataFrame(d, df1.schema, metadata)
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    d = self.pl_utils.join(
        self.to_df(df1).native,
        self.to_df(df2).native,
        join_type=how,
        on=key_schema.names,
    )
    return DaskDataFrame(d, output_schema, metadata)
def load_df(
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> DaskDataFrame:
    if isinstance(uri, str):
        fp = [FileParser(uri, format_hint)]
    else:
        fp = [FileParser(u, format_hint) for u in uri]
    dfs: List[dd.DataFrame] = []
    schema: Any = None
    for f in _get_single_files(fp, fs):
        df, schema = _FORMAT_LOAD[f.file_format](f, columns, **kwargs)
        dfs.append(df)
    return DaskDataFrame(dd.concat(dfs), schema)
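# A minimal round-trip sketch of load_df, assuming the same save_df/load_df
# helpers exercised by the tests in this section; the format is inferred from
# the file extension unless format_hint is passed.
def _example_load_df(tmpdir):
    df = DaskDataFrame([["1", 2]], "a:str,b:int")
    path = os.path.join(tmpdir, "x.parquet")
    save_df(df, path)
    loaded = load_df(path, columns=["a"])  # project a subset of columns
    # loaded.as_array() -> [["1"]] with schema a:str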
def subtract(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        distinct, NotImplementedError("EXCEPT ALL for DaskExecutionEngine")
    )
    assert_or_throw(
        df1.schema == df2.schema,
        lambda: ValueError(f"{df1.schema} != {df2.schema}"),
    )
    d = self.pl_utils.except_df(
        self.to_df(df1).native, self.to_df(df2).native, unique=distinct
    )
    return DaskDataFrame(d, df1.schema, metadata)
def sample(
    self,
    df: DataFrame,
    n: Optional[int] = None,
    frac: Optional[float] = None,
    replace: bool = False,
    seed: Optional[int] = None,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        (n is None and frac is not None) or (n is not None and frac is None),
        ValueError("one and only one of n and frac should be set"),
    )
    # TODO: dask does not support sampling by number of rows
    d = self.to_df(df).native.sample(
        n=n, frac=frac, replace=replace, random_state=seed
    )
    return DaskDataFrame(d, df.schema, metadata)
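# A minimal sketch of sample, assuming a DaskExecutionEngine instance; because
# dask cannot sample an exact number of rows, frac is the dependable option.
def _example_sample():
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    df = engine.to_df([[i] for i in range(100)], "a:int")
    s = engine.sample(df, frac=0.1, seed=0)
    # s.count() is close to 10 but not guaranteed to be exact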
def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DaskDataFrame:
    df = self.to_df(df)
    if partition_spec.empty:
        return df
    if len(partition_spec.partition_by) > 0:
        return df
    p = partition_spec.get_num_partitions(
        **{
            KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
            KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard coded value
        }
    )
    if p > 0:
        return DaskDataFrame(
            df.native.repartition(npartitions=p),
            schema=df.schema,
            metadata=df.metadata,
            type_safe=False,
        )
    return df
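# A minimal sketch of repartition, assuming a DaskExecutionEngine instance;
# an explicit partition number maps directly to dask's npartitions.
def _example_repartition():
    from fugue import PartitionSpec
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    df = engine.to_df([[i] for i in range(10)], "a:int")
    r = engine.repartition(df, PartitionSpec(num=2))
    assert r.native.npartitions == 2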
def test_as_dict_iterable():
    df = DaskDataFrame([["2020-01-01", 1.1]], "a:datetime,b:int")
    assert [dict(a=datetime(2020, 1, 1), b=1)] == list(df.as_dict_iterable())
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    how = how.lower().replace("_", "").replace(" ", "")
    if how == "cross":
        d1 = self.to_df(df1).native
        d2 = self.to_df(df2).native
        d1["__cross_join_index__"] = 1
        d2["__cross_join_index__"] = 1
        d = d1.merge(d2, on=("__cross_join_index__")).drop(
            "__cross_join_index__", axis=1
        )
        return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
    if how in ["semi", "leftsemi"]:
        d1 = self.to_df(df1).native
        d2 = self.to_df(df2).native[key_schema.names]
        d = d1.merge(d2, on=key_schema.names, how="inner")
        return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
    if how in ["anti", "leftanti"]:
        d1 = self.to_df(df1).native
        d2 = self.to_df(df2).native[key_schema.names]
        if DASK_UTILS.empty(d1) or DASK_UTILS.empty(d2):
            return df1
        d2["__anti_join_dummy__"] = 1.0
        d = d1.merge(d2, on=key_schema.names, how="left")
        d = d[d["__anti_join_dummy__"].isnull()]
        return DaskDataFrame(
            d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
            output_schema,
            metadata,
        )
    fix_left, fix_right = False, False
    if how in ["leftouter"]:
        how = "left"
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_right = True
    if how in ["rightouter"]:
        how = "right"
        self._validate_outer_joinable(df1.schema, key_schema)
        fix_left = True
    if how in ["fullouter"]:
        how = "outer"
        self._validate_outer_joinable(df1.schema, key_schema)
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_left, fix_right = True, True
    d1 = self.to_df(df1).native
    d2 = self.to_df(df2).native
    d = d1.merge(d2, on=key_schema.names, how=how)
    if fix_left:
        d = self._fix_nan(
            d, output_schema, df1.schema.exclude(list(df2.schema.keys())).keys()
        )
    if fix_right:
        d = self._fix_nan(
            d, output_schema, df2.schema.exclude(list(df1.schema.keys())).keys()
        )
    return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
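# A minimal sketch of the semi/anti paths above, assuming a DaskExecutionEngine
# instance; for both join types the output schema comes from df1.
def _example_semi_anti_join():
    from fugue_dask import DaskExecutionEngine

    engine = DaskExecutionEngine()
    a = engine.to_df([["x", 1], ["y", 2]], "k:str,v:int")
    b = engine.to_df([["x"]], "k:str")
    semi = engine.join(a, b, how="semi", on=["k"])
    # semi.as_array() -> [["x", 1]]
    anti = engine.join(a, b, how="anti", on=["k"])
    # anti.as_array() -> [["y", 2]]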
def df(
    self, data: Any = None, schema: Any = None, metadata: Any = None
) -> DaskDataFrame:
    return DaskDataFrame(data, schema, metadata)
def test_avro_io(tmpdir):
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.avro")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path, columns=["a", "b"])
    df_eq(actual, [["1", 2]], "a:str,b:long")
    actual = load_df(path, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(path, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # providing both schema and columns should throw an error
    raises(
        Exception,
        lambda: load_df(
            path,
            columns="a:str,b:int,c:long",
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
        ),
    )

    # providing schema while infer_schema is True should throw an error
    raises(
        Exception,
        lambda: load_df(
            path,
            columns=None,
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
            infer_schema=True,
        ),
    )
def test_parquet_io(tmpdir):
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = DaskDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} would become {a:long} because of a pyarrow issue
    df3 = DaskDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    for name in ["folder.parquet", "folder"]:
        folder = os.path.join(tmpdir, name)
        fs.makedirs(folder)
        f0 = os.path.join(folder, "_SUCCESS")
        f1 = os.path.join(folder, "1.parquet")
        f2 = os.path.join(folder, "3.parquet")
        fs.touch(f0)
        pd_save_df(df1, f1)
        pd_save_df(df1, f2)

    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load folder
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load pattern
    actual = load_df(os.path.join(tmpdir, "folder", "*.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite folder with a single file
    save_df(actual, os.path.join(tmpdir, "folder.parquet"), mode="overwrite")
    actual = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    raises(
        FileExistsError,
        lambda: save_df(df1, os.path.join(tmpdir, "folder.parquet"), mode="error"),
    )

    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
def test_init():
    df = DaskDataFrame(schema="a:str,b:int")
    assert df.is_bounded
    assert df.count() == 0
    assert df.schema == "a:str,b:int"

    pdf = pandas.DataFrame([["a", 1], ["b", 2]])
    raises(FugueDataFrameInitError, lambda: DaskDataFrame(pdf))
    df = DaskDataFrame(pdf, "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_pandas().values.tolist()
    df = DaskDataFrame(pdf, "a:str,b:int")
    assert [["a", 1], ["b", 2]] == df.as_pandas().values.tolist()
    df = DaskDataFrame(pdf, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_pandas().values.tolist()

    pdf = DaskDataFrame([["a", 1], ["b", 2]], "a:str,b:int").native["b"]
    assert isinstance(pdf, pd.Series)
    df = DaskDataFrame(pdf, "b:str")
    assert [["1"], ["2"]] == df.as_pandas().values.tolist()
    df = DaskDataFrame(pdf, "b:double")
    assert [[1.0], [2.0]] == df.as_pandas().values.tolist()

    pdf = DaskDataFrame([["a", 1], ["b", 2]], "x:str,y:long").native
    df = DaskDataFrame(pdf)
    assert df.schema == "x:str,y:long"
    df = DaskDataFrame(pdf, "y:str,x:str")
    assert [["1", "a"], ["2", "b"]] == df.as_pandas().values.tolist()
    ddf = DaskDataFrame(df)
    assert [["1", "a"], ["2", "b"]] == ddf.as_pandas().values.tolist()
    assert df.native is ddf.native  # no real copy happened

    df = DaskDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_pandas().values.tolist()

    df = DaskDataFrame([], "x:str,y:double")
    assert [] == df.as_pandas().values.tolist()

    raises(FugueDataFrameInitError, lambda: DaskDataFrame(123))
def test_as_array():
    df = DaskDataFrame([], "a:str,b:int")
    assert [] == df.as_array()
    assert [] == df.as_array(type_safe=True)
    assert [] == list(df.as_array_iterable())
    assert [] == list(df.as_array_iterable(type_safe=True))

    df = DaskDataFrame([["a", 1]], "a:str,b:int")
    assert [["a", 1]] == df.as_array()
    assert [["a", 1]] == df.as_array(["a", "b"])
    assert [[1, "a"]] == df.as_array(["b", "a"])

    # prevent pandas auto type casting
    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)
    assert [[1.0, 1]] == df.as_array(["a", "b"])
    assert [[1, 1.0]] == df.as_array(["b", "a"])

    df = DaskDataFrame([[np.float64(1.0), 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.Timestamp("2020-01-01"), 1.1]], "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert [[datetime(2020, 1, 1), 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.NaT, 1.1]], "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array(type_safe=True)
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)