Example #1
def test_get_join_schemas():
    a = ArrayDataFrame([], "a:int,b:int")
    b = ArrayDataFrame([], "c:int")
    c = ArrayDataFrame([], "d:str,a:int")
    i, u = get_join_schemas(a, b, how="cross", on=[])
    assert i == ""
    assert u == "a:int,b:int,c:int"
    raises(NoneArgumentError, lambda: get_join_schemas(a, b, how=None, on=[]))
    raises(ValueError, lambda: get_join_schemas(a, b, how="x", on=[]))
    raises(SchemaError, lambda: get_join_schemas(a, b, how="CROSS", on=["a"]))
    raises(SchemaError, lambda: get_join_schemas(a, c, how="CROSS", on=["a"]))
    raises(SchemaError, lambda: get_join_schemas(a, c, how="CROSS", on=[]))
    raises(SchemaError, lambda: get_join_schemas(a, b, how="inner", on=["a"]))
    raises(ValueError, lambda: get_join_schemas(a, c, how="outer", on=["a"]))
    i, u = get_join_schemas(a, c, how="inner", on=["a"])
    assert i == "a:int"
    assert u == "a:int,b:int,d:str"
    i, u = get_join_schemas(a, c, how="inner", on=[])  # infer
    assert i == "a:int"
    assert u == "a:int,b:int,d:str"
    a = ArrayDataFrame([], "a:int,b:int,c:int")
    b = ArrayDataFrame([], "c:int,b:int,x:int")
    raises(SchemaError, lambda: get_join_schemas(a, b, how="inner", on=["a"]))
    i, u = get_join_schemas(a, b, how="inner", on=["c", "b"])
    assert i == "b:int,c:int"
    assert u == "a:int,b:int,c:int,x:int"
    for how in ["SEMI", "LEFT_Semi", "Anti", "left_Anti"]:
        i, u = get_join_schemas(c, a, how=how, on=["a"])
        assert i == "a:int"
        assert u == "d:str,a:int"
Example #2
 def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
     self.transformer._cursor = cursor  # type: ignore
     try:
         # run the transform for its side effects only: to_local_bounded_df
         # forces any lazy output to be fully computed, then the result is
         # discarded and an empty frame with the declared schema is returned
         to_local_bounded_df(self.transformer.transform(dfs))
         return ArrayDataFrame([], self.transformer.output_schema)
     except self.ignore_errors:  # type: ignore
         return ArrayDataFrame([], self.transformer.output_schema)
Example #3
def _generate_comap_empty_dfs(schemas: Any, named: bool) -> DataFrames:
    if named:
        return DataFrames(
            {k: ArrayDataFrame([], v)
             for k, v in schemas.items()})
    else:
        return DataFrames([ArrayDataFrame([], v) for v in schemas.values()])
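A quick usage sketch (an assumption-labeled illustration; it relies on fugue's
DataFrames behaving as a dict-like collection, as the other examples suggest):
named=True keys the result by schema name, named=False keeps positional order.

schemas = {"left": "a:int", "right": "b:str"}
named = _generate_comap_empty_dfs(schemas, named=True)
assert set(named.keys()) == {"left", "right"}
positional = _generate_comap_empty_dfs(schemas, named=False)
assert len(positional) == 2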
Example #4
 def test_subtract(self):
     with self.dag() as dag:
         a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
         b = dag.df([[2, None], [2, 20]], "x:long,y:double")
         c = dag.df([[1, 10], [2, 20]], "x:long,y:double")
         a.subtract(b).assert_eq(
             ArrayDataFrame(
                 [[1, 10]],
                 "x:long,y:double",
             ))
         a.subtract(c).assert_eq(
             ArrayDataFrame(
                 [[2, None]],
                 "x:long,y:double",
             ))
         # # TODO: EXCEPT ALL is not implemented (QPD issue)
         # a.subtract(c, distinct=False).assert_eq(
         #     ArrayDataFrame(
         #         [[2, None], [2, None]],
         #         "x:long,y:double",
         #     )
         # )
         a.subtract(b,
                    c).assert_eq(ArrayDataFrame(
                        [],
                        "x:long,y:double",
                    ))
Example #5
 def test_union(self):
     with self.dag() as dag:
         a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
         b = dag.df([[2, None], [2, 20]], "x:long,y:double")
         c = dag.df([[1, 10], [2, 20]], "x:long,y:double")
         a.union().assert_eq(a)
         a.union(b, c).assert_eq(
             ArrayDataFrame(
                 [
                     [1, 10],
                     [2, None],
                     [2, 20],
                 ],
                 "x:long,y:double",
             ))
         a.union(b, c, distinct=False).assert_eq(
             ArrayDataFrame(
                 [
                     [1, 10],
                     [2, None],
                     [2, None],
                     [2, None],
                     [2, 20],
                     [1, 10],
                     [2, 20],
                 ],
                 "x:long,y:double",
             ))
Example #6
        def test_col_ops(self):
            with self.dag() as dag:
                a = dag.df([[1, 10], [2, 20]], "x:long,y:long")
                aa = dag.df([[1, 10], [2, 20]], "xx:long,y:long")
                a.rename({"x": "xx"}).assert_eq(aa)
                a[["x"]].assert_eq(ArrayDataFrame([[1], [2]], "x:long"))

                a.drop(["y", "yy"], if_exists=True).assert_eq(
                    ArrayDataFrame([[1], [2]], "x:long"))

                a[["x"]].rename(x="xx").assert_eq(
                    ArrayDataFrame([[1], [2]], "xx:long"))
Example #7
 def test_intersect(self):
     with self.dag() as dag:
         a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
         b = dag.df([[2, None], [2, 20]], "x:long,y:double")
         c = dag.df([[1, 10], [2, 20]], "x:long,y:double")
         # d = dag.df([[1, 10], [2, 20], [2, None]], "x:long,y:double")
         a.intersect(b).assert_eq(
             ArrayDataFrame(
                 [[2, None]],
                 "x:long,y:double",
             ))
         a.intersect(b, c).assert_eq(
             ArrayDataFrame(
                 [],
                 "x:long,y:double",
             ))
Example #8
def f30(e: EmptyAwareIterable[List[Any]],
        a: EmptyAwareIterable[Dict[str, Any]]) -> LocalDataFrame:
    e.peek()
    a.peek()
    e = list(e)
    e += [[x["a"]] for x in a]
    return ArrayDataFrame(e, "a:int")
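A hedged sketch of calling f30 directly, assuming triad's make_empty_aware
helper (the import path is an assumption) to build the EmptyAwareIterable
arguments; peek() fails on empty input, otherwise the rows of e are extended
with the "a" values drawn from a.

from triad.utils.iter import make_empty_aware  # assumed helper location

e = make_empty_aware(iter([[1], [2]]))
a = make_empty_aware(iter([{"a": 3}]))
assert f30(e, a).as_array() == [[1], [2], [3]]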
Example #9
def to_local_bounded_df(df: Any,
                        schema: Any = None,
                        metadata: Any = None) -> LocalBoundedDataFrame:
    """Convert a data structure to
    :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame, or
      a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None; it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` else a converted one

    .. admonition:: Examples

        >>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
        >>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
        >>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))

    .. note::

        Compared to :func:`.to_local_df`, this function also makes sure the dataframe
        is bounded, so :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame`
        will be converted even though it's already local.
    """
    df = to_local_df(df, schema, metadata)
    if isinstance(df, LocalBoundedDataFrame):
        return df
    return ArrayDataFrame(df.as_array(), df.schema, df.metadata)
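A minimal sketch of the boundedness guarantee (consistent with Example #21
below): an IterableDataFrame is local but not bounded, so it is materialized
into an ArrayDataFrame.

idf = IterableDataFrame([[0, 1]], "a:int,b:int")
assert isinstance(to_local_bounded_df(idf), ArrayDataFrame)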
Example #10
def test_use_special_df(tmpdir):
    # external non-workflowdataframe
    arr = ArrayDataFrame([[0], [1]], "a:int")
    fsql(
        """
        b=CREATE[[0], [1]] SCHEMA a: int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        a = SELECT x.* FROM a.x AS x
        OUTPUT a, b USING assert_eq
        c=CREATE [[0,0],[1,1]] SCHEMA a:int,b:int
        d = SELECT x.*,y.a AS b FROM a.x x INNER JOIN a.x y ON x.a=y.a
        OUTPUT c, d USING assert_eq
        """,
        {
            "a.x": arr
        },
    ).run()

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")})
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        """,
            {"a.x": res},
        )
Example #11
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame, or
      a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None; it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
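The dispatch order above matters: a plain list is bounded while a generic
iterable is not, so they map to different local dataframe types. A minimal
sketch mirroring the type checks in the function body:

assert isinstance(to_local_df([[0]], "a:int"), ArrayDataFrame)
assert isinstance(to_local_df(iter([[0]]), "a:int"), IterableDataFrame)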
Example #12
def test_workflow():
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]],
          "a:int,b:int")
Example #13
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")

    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2

    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0], [1]], "a:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
Example #14
 def execute(self, ctx: TaskContext) -> None:
     self._outputter._execution_engine = self._get_execution_engine(ctx)
     if self._input_has_key:
         self._outputter.process(DataFrames(ctx.inputs))
     else:
         self._outputter.process(DataFrames(ctx.inputs.values()))
     # TODO: output dummy to force cache to work, should we fix adagio?
     ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
Example #15
 def transform(self, df):
     if not hasattr(self, "called"):
         self.called = 1
     else:
         self.called += 1
     n = self.params.get("n", 1)
     assert self.called <= n
     return ArrayDataFrame([[len(df.as_array())]], "c:int")
Example #16
 def test_distinct(self):
     with self.dag() as dag:
         a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
         a.distinct().assert_eq(
             ArrayDataFrame(
                 [[1, 10], [2, None]],
                 "x:long,y:double",
             ))
Example #17
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
Example #18
    def alter_columns(self, columns: Any) -> DataFrame:
        if self.empty:
            return ArrayDataFrame([], self.schema).alter_columns(columns)

        def _transform() -> Iterable[DataFrame]:
            for df in self.native:
                yield df.alter_columns(columns)

        return LocalDataFrameIterableDataFrame(_transform())
Example #19
    def _select_cols(self, keys: List[Any]) -> DataFrame:
        if self.empty:
            return ArrayDataFrame([], self.schema)[keys]

        def _transform():
            for df in self.native:
                yield df[keys]

        return LocalDataFrameIterableDataFrame(_transform())
Example #20
    def _drop_cols(self, cols: List[str]) -> DataFrame:
        if self.empty:
            return ArrayDataFrame([], self.schema)._drop_cols(cols)

        def _transform() -> Iterable[DataFrame]:
            for df in self.native:
                yield df._drop_cols(cols)

        return LocalDataFrameIterableDataFrame(_transform())
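Examples #18-#20 (and #23 below) share one pattern: when the iterable
dataframe is empty, delegate to an empty ArrayDataFrame so the operation is
still validated against the schema; otherwise wrap a generator that applies
the operation to each local dataframe lazily. A stand-alone sketch of the
lazy half of that idea (hypothetical names, not the fugue implementation):

from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")

def lazy_apply(parts: Iterable[T], op: Callable[[T], T]) -> Iterator[T]:
    # nothing runs here; op is applied only as the result is consumed
    for part in parts:
        yield op(part)

assert list(lazy_apply([1, 2, 3], lambda x: x * x)) == [1, 4, 9]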
Example #21
def test_to_local_bounded_df():
    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
    idf = IterableDataFrame([[0, 1]], "a:int,b:int", dict(a=1))
    assert to_local_bounded_df(df) is df
    r = to_local_bounded_df(idf)
    assert r is not idf
    assert r.as_array() == [[0, 1]]
    assert r.schema == "a:int,b:int"
    assert r.metadata == dict(a=1)
Example #22
def test_to_local_df():
    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
    pdf = PandasDataFrame(df.as_pandas(), "a:int,b:int")
    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
    assert to_local_df(df) is df
    assert to_local_df(pdf) is pdf
    assert to_local_df(idf) is idf
    assert isinstance(to_local_df(df.native, "a:int,b:int"), ArrayDataFrame)
    assert isinstance(to_local_df(pdf.native, "a:int,b:int"), PandasDataFrame)
    assert isinstance(to_local_df(idf.native, "a:int,b:int"),
                      IterableDataFrame)
    raises(TypeError, lambda: to_local_df(123))

    metadata = dict(a=1)
    assert to_local_df(df.native, df.schema, metadata).metadata == metadata

    raises(NoneArgumentError, lambda: to_local_df(None))
    raises(ValueError, lambda: to_local_df(df, "a:int,b:int", None))
Example #23
    def rename(self, columns: Dict[str, str]) -> DataFrame:
        if self.empty:
            return ArrayDataFrame([], self.schema).rename(columns)

        def _transform() -> Iterable[DataFrame]:
            for df in self.native:
                yield df.rename(columns)

        return LocalDataFrameIterableDataFrame(_transform())
Example #24
def test_nan_none():
    df = ArrayDataFrame([[None, None]], "b:str,c:double")
    assert df.as_pandas().iloc[0, 0] is None
    arr = PandasDataFrame(df.as_pandas(), df.schema).as_array()[0]
    assert arr[0] is None
    assert math.isnan(arr[1])

    df = ArrayDataFrame([[None, None]], "b:int,c:bool")
    arr = PandasDataFrame(df.as_pandas(),
                          df.schema).as_array(type_safe=True)[0]
    assert arr[0] is None
    assert arr[1] is None

    df = ArrayDataFrame([["a", 1.1], [None, None]], "b:str,c:double")
    arr = PandasDataFrame(df.as_pandas(),
                          df.schema).as_array(type_safe=True)[1]
    assert arr[0] is None
    assert arr[1] is None
Example #25
 def _get_dfs(self, row: Any) -> Iterable[Any]:
     # df_idx holds (column index, name, schema) for each serialized input
     for k, name, v in self.df_idx:
         if row[k] is None:
             # this input had no data for the key: substitute an empty
             # dataframe with the recorded schema
             df: DataFrame = ArrayDataFrame([], v)
         else:
             df = deserialize_df(row[k])  # type: ignore
             assert df is not None
         if self.named:
             yield name, df
         else:
             yield df
Example #26
 def test_to_df(self):
     e = self.engine
     o = ArrayDataFrame([[1, 2]],
                        "a:int,b:int",
                        dict(a=1),
                        )
     a = e.to_df(o)
     assert a is not o
     df_eq(a, o, throw=True)
     a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
     df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)
Example #27
 def get(self, key: str):
     if self.dummy:
         return True, False, ArrayDataFrame([[100]], "a:int")
     self.get_called += 1
     if key not in self.tb:
         print("not get", key)
         return False, False, None
     x = self.tb[key]
     print("get", key)
     self.hit += 1
     return True, x[0], x[1]
Example #28
def test_use_df(tmpdir):
    # df generated inside dag
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("""
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """)
        dag.sql_vars["b"].assert_eq(a)

    # external non-workflowdataframe
    arr = ArrayDataFrame([[0], [1]], "a:int")
    with FugueSQLWorkflow() as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            a=arr,
        )
        dag.sql_vars["b"].assert_eq(dag.df([[0], [1]], "a:int"))

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")})
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            a=res,
        )

    # from yield dataframe
    engine = NativeExecutionEngine()
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD DATAFRAME AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
        b=CREATE[[0], [1]] SCHEMA a: int
        OUTPUT a, b USING assert_eq
        """,
            a=res,
        )
Example #29
    def execute(self, ctx: TaskContext) -> None:
        self._outputter._execution_engine = self._get_execution_engine(ctx)
        if self._input_has_key:
            inputs = DataFrames(ctx.inputs)
        else:
            inputs = DataFrames(ctx.inputs.values())

        def exe():
            self._outputter.validate_on_runtime(inputs)
            self._outputter.process(inputs)

        self._execute_with_modified_traceback(exe)
        # TODO: output dummy to force cache to work, should we fix adagio?
        ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
Example #30
 def transform(self, dfs: DataFrames) -> LocalDataFrame:
     assert 1 == self.on_init_called
     assert "test" in self.workflow_conf
     assert 2 == len(dfs)
     if self.params.get("named", False):
         assert dfs.has_key
     else:
         assert not dfs.has_key
     row = self.cursor.key_value_array + [
         dfs[0].count(),
         dfs[1].count(),
         self.params.get("p", 1),
     ]
     return ArrayDataFrame([row], self.output_schema)