Ejemplo n.º 1
0
    def assign(
        self, df: DataFrame, columns: List[ColumnExpr], metadata: Any = None
    ) -> DataFrame:
        """Update existing columns with new values and add new columns

        :param df: the dataframe to set columns
        :param columns: column expressions
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the updated dataframe

        .. tip::

            This can be used to cast data types, alter column values or add new
            columns. But you can't use aggregation in columns.

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                # assume df has schema: a:int,b:str

                # add constant column x
                engine.assign(df, lit(1,"x"))

                # change column b to be a constant integer
                engine.assign(df, lit(1,"b"))

                # add new x to be a+b
                engine.assign(df, (col("a")+col("b")).alias("x"))

                # cast column a data type to double
                engine.assign(df, col("a").cast(float))
        """
        SelectColumns(
            *columns
        ).assert_no_wildcard().assert_all_with_names().assert_no_agg()

        cols = [col(n) for n in df.schema.names]
        for c in columns:
            if c.output_name not in df.schema:
                cols.append(c)
            else:
                cols[df.schema.index_of_key(c.output_name)] = c
        return self.select(df, SelectColumns(*cols), metadata=metadata)
Ejemplo n.º 2
0
    def filter(
        self, df: DataFrame, condition: ColumnExpr, metadata: Any = None
    ) -> DataFrame:
        """Filter rows by the given condition

        :param df: the dataframe to be filtered
        :param condition: (boolean) column expression
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the filtered dataframe

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                import fugue.column.functions as f

                engine.filter(df, (col("a")>1) & (col("b")=="x"))
                engine.filter(df, f.coalesce(col("a"),col("b"))>1)
        """
        return self.select(
            df, cols=SelectColumns(col("*")), where=condition, metadata=metadata
        )
Ejemplo n.º 3
0
    def aggregate(
        self,
        df: DataFrame,
        partition_spec: Optional[PartitionSpec],
        agg_cols: List[ColumnExpr],
        metadata: Any = None,
    ):
        """Aggregate on dataframe

        :param df: the dataframe to aggregate on
        :param partition_spec: PartitionSpec to specify partition keys
        :param agg_cols: aggregation expressions
        :param metadata: dict-like object to add to the result dataframe,
            defaults to None. It's currently not used
        :return: the aggregated result as a dataframe

        .. admonition:: New Since
            :class: hint

            **0.6.0**

        .. seealso::

            Please find more expression examples in :mod:`fugue.column.sql` and
            :mod:`fugue.column.functions`

        .. admonition:: Examples

            .. code-block:: python

                import fugue.column.functions as f

                # SELECT MAX(b) AS b FROM df
                engine.aggregate(
                    df,
                    partition_spec=None,
                    agg_cols=[f.max(col("b"))])

                # SELECT a, MAX(b) AS x FROM df GROUP BY a
                engine.aggregate(
                    df,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[f.max(col("b")).alias("x")])
        """
        assert_or_throw(len(agg_cols) > 0, ValueError("agg_cols can't be empty"))
        assert_or_throw(
            all(is_agg(x) for x in agg_cols),
            ValueError("all agg_cols must be aggregation functions"),
        )
        keys: List[ColumnExpr] = []
        if partition_spec is not None and len(partition_spec.partition_by) > 0:
            keys = [col(y) for y in partition_spec.partition_by]
        cols = SelectColumns(*keys, *agg_cols)
        return self.select(df, cols=cols, metadata=metadata)
Ejemplo n.º 4
0
def test_correct_select_schema():
    schema = Schema("a:double,b:str")
    gen = SQLExpressionGenerator()

    sc = SelectColumns(col("*"), col("c"))
    output = Schema("a:double,b:str,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c is None

    output = Schema("a:int,b:int,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c == "a:double,b:str"

    sc = SelectColumns(f.count(col("*")).alias("t"), col("c").alias("a"))
    output = Schema("t:int,a:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c is None

    sc = SelectColumns((col("a") + col("b")).cast(str).alias("a"), lit(1, "c"))
    output = Schema("a:int,c:str")
    c = gen.correct_select_schema(schema, sc, output)
    assert c == "a:str,c:long"
Ejemplo n.º 5
0
def test_select_columns():
    # not all with names
    cols = SelectColumns(col("a"), lit(1, "b"),
                         col("bb") + col("cc"), f.first(col("c")))
    assert to_uuid(cols) == to_uuid(cols)
    raises(ValueError, lambda: cols.assert_all_with_names())

    # distinct
    cols2 = SelectColumns(
        col("a"),
        lit(1, "b"),
        col("bb") + col("cc"),
        f.first(col("c")),
        arg_distinct=True,
    )
    assert to_uuid(cols) != to_uuid(cols2)

    # duplicated names
    cols = SelectColumns(col("a").alias("b"), lit(1, "b"))
    assert to_uuid(cols) != to_uuid(
        SelectColumns(col("a").alias("b"), lit(1, "c")))
    raises(ValueError, lambda: cols.assert_all_with_names())

    # with *, all cols must have alias
    cols = SelectColumns(col("*"), col("a")).assert_no_agg()
    raises(ValueError, lambda: cols.assert_all_with_names())

    # * can be used at most once
    raises(ValueError, lambda: SelectColumns(col("*"), col("*"),
                                             col("a").alias("p")))

    # * can't be used with aggregation
    raises(ValueError, lambda: SelectColumns(col("*"),
                                             f.first(col("a")).alias("x")))

    cols = SelectColumns(
        col("aa").alias("a").cast(int),
        lit(1, "b"),
        (col("bb") + col("cc")).alias("c"),
        f.first(col("c")).alias("d"),
    ).assert_all_with_names()
    raises(AssertionError, lambda: cols.assert_no_agg())
    assert not cols.simple
    assert 1 == len(cols.simple_cols)
    assert "CAST(aa AS long) AS a" == str(cols.simple_cols[0])
    assert cols.has_literals
    assert 1 == len(cols.literals)
    assert "1 AS b" == str(cols.literals[0])
    assert cols.has_agg
    assert 1 == len(cols.non_agg_funcs)
    assert "+(bb,cc) AS c" == str(cols.non_agg_funcs[0])
    assert 1 == len(cols.agg_funcs)
    assert "FIRST(c) AS d" == str(cols.agg_funcs[0])
    assert 2 == len(cols.group_keys)  # a, c
    assert "aa" == cols.group_keys[0].output_name
    assert "" == cols.group_keys[1].output_name
    assert isinstance(cols.group_keys[1], _BinaryOpExpr)

    cols = SelectColumns(col("a")).assert_no_wildcard()
    assert cols.simple
    assert not cols.has_literals
    assert not cols.has_agg

    cols = SelectColumns(col("x"), col("*"), col("y") + col("z"))
    cols = cols.replace_wildcard(Schema("a:int,b:int"))
    assert "x" == str(cols.all_cols[0])
Ejemplo n.º 6
0
def test_no_cast():
    gen = SQLExpressionGenerator(enable_cast=False)
    cols = SelectColumns(
        f.max(col("c")).cast("long").alias("c"), col("a", "aa"), col("b"))
    assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select(
        cols, "t")
Ejemplo n.º 7
0
def test_select():
    gen = SQLExpressionGenerator()

    # no aggregation
    cols = SelectColumns(col("*"))
    assert "SELECT * FROM x" == gen.select(cols, "x")

    cols = SelectColumns(col("a"),
                         lit(1).alias("b"), (col("b") + col("c")).alias("x"))
    where = (col("a") > 5).alias("aa")
    assert "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5" == gen.select(
        cols, "t", where=where)

    # aggregation without literals
    cols = SelectColumns(f.max(col("c")).alias("c"), col("a", "aa"), col("b"))
    assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select(
        cols, "t")

    where = col("a") < 10
    having = (f.max(col("a")) > 5).alias("aaa")
    assert (
        "SELECT MAX(c) AS c, a AS aa, b FROM t WHERE a<10 GROUP BY a, b HAVING MAX(a)>5"
        == gen.select(cols, "t", where=where, having=having))

    cols = SelectColumns(
        f.min(col("c") + 1).alias("c"),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t" == gen.select(
        cols, "t")

    # aggregation with literals
    cols = SelectColumns(lit(1, "k"),
                         f.max(col("c")).alias("c"), lit(2, "j"),
                         col("a", "aa"), col("b"))
    assert (
        "SELECT 1 AS k, c, 2 AS j, aa, b FROM (SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)"
        == gen.select(cols, "t"))

    cols = SelectColumns(lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j"))
    assert "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)" == gen.select(
        cols, "t")

    cols = SelectColumns(lit(1, "k"), col("a"),
                         f.max(col("c")).alias("c"), lit(2, "j"))
    assert (
        "SELECT 1 AS k, a, c, 2 AS j FROM (SELECT a, MAX(c) AS c FROM t GROUP BY a)"
        == gen.select(cols, "t"))

    # cast
    cols = SelectColumns(
        col("c").cast(float),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    assert (
        "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM t GROUP BY c"
        == gen.select(cols, "t"))

    # infer alias
    cols = SelectColumns(
        (-col("c")).cast(float),
        f.max(col("e")).cast(int),
        f.avg(col("d") + col("e")).cast(int).alias("d"),
    )
    assert ("SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, "
            "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c" == gen.select(
                cols, "t"))
Ejemplo n.º 8
0
        def test_select(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)

            # simple
            b = e.select(
                a, SelectColumns(col("b"),
                                 (col("b") + 1).alias("c").cast(str)))
            df_eq(
                b,
                [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]],
                "b:int,c:str",
                throw=True,
            )

            # with distinct
            b = e.select(
                a,
                SelectColumns(col("b"), (col("b") + 1).alias("c").cast(str),
                              arg_distinct=True),
            )
            df_eq(
                b,
                [[2, "3"], [1, "2"], [4, "5"]],
                "b:int,c:str",
                throw=True,
            )

            # wildcard
            b = e.select(a,
                         SelectColumns(col("*")),
                         where=col("a") + col("b") == 3)
            df_eq(b, [[1, 2]], "a:double,b:int", throw=True)

            # aggregation
            b = e.select(
                a,
                SelectColumns(col("a"),
                              ff.sum(col("b")).cast(float).alias("b")))
            df_eq(b, [[1, 2], [3, 4], [None, 7]],
                  "a:double,b:double",
                  throw=True)

            # having
            # https://github.com/fugue-project/fugue/issues/222
            col_b = ff.sum(col("b"))
            b = e.select(
                a,
                SelectColumns(col("a"),
                              col_b.cast(float).alias("b")),
                having=(col_b >= 7) | (col("a") == 1),
            )
            df_eq(b, [[1, 2], [None, 7]], "a:double,b:double", throw=True)

            # literal + alias inference
            # https://github.com/fugue-project/fugue/issues/222
            col_b = ff.sum(col("b"))
            b = e.select(
                a,
                SelectColumns(col("a"),
                              lit(1, "o").cast(str), col_b.cast(float)),
                having=(col_b >= 7) | (col("a") == 1),
            )
            df_eq(b, [[1, "1", 2], [None, "1", 7]],
                  "a:double,o:str,b:double",
                  throw=True)