Beispiel #1
0
class _UnresolvedSubquery(Value, _Negatable):
    """An exists subquery whose outer leaf table has not been determined yet.

    Notes
    -----
    Take the following ibis expressions as an example

    >>> t = ibis.table(dict(a="string"))
    >>> s = ibis.table(dict(a="string"))
    >>> cond = (t.a == s.a).any()

    As long as the outer query table is unknown, `cond` can become a SQL
    `EXISTS` predicate in two different ways, depending on whether `t` or `s`
    is the table being filtered.

    When `t` is filtered:

    ```sql
    SELECT *
    FROM t
    WHERE EXISTS (SELECT 1 WHERE t.a = s.a)
    ```

    When `s` is filtered:

    ```sql
    SELECT *
    FROM s
    WHERE EXISTS (SELECT 1 WHERE t.a = s.a)
    ```

    Note that the subquery `(SELECT 1 WHERE t.a = s.a)` is not valid on its
    own; it only makes sense together with an outer table.

    `_UnresolvedSubquery` therefore records enough about the exists predicate
    (the candidate `tables` and the `predicates`) for it to be resolved later,
    at the point where predicates are resolved against the outer leaf table
    while `Selection`s are being constructed.
    """

    # Candidate tables taking part in the predicate.
    tables = rlz.tuple_of(rlz.table)
    # Boolean conditions that make up the exists predicate.
    predicates = rlz.tuple_of(rlz.boolean)

    output_dtype = dt.boolean
    output_shape = rlz.Shape.COLUMNAR

    @abc.abstractmethod
    def _resolve(
        self,
        table: ir.Table,
    ) -> type[ExistsSubquery] | type[NotExistsSubquery]:  # pragma: no cover
        """Resolve this subquery against `table`; subclasses must override."""
Beispiel #2
0
def test_valid_list_of_extra():
    """Exercise `tuple_of` with the `identity` rule and a nested `list_of`."""
    # A tuple validated element-wise by `identity` is expected back unchanged.
    tuple_rule = rlz.tuple_of(identity)
    assert tuple_rule((3, 2)) == (3, 2)

    # Nested lists of strings: the lone 'a' must equal an ibis string literal.
    nested_rule = rlz.list_of(rlz.list_of(rlz.string))
    validated = nested_rule([[], ['a']])
    first_of_second = validated[1][0]
    assert first_of_second.equals(ibis.literal('a'))
Beispiel #3
0
class Bucket(BucketLike):
    """Bucketing operation over a column, driven by explicit bucket edges.

    Construction fails with `ValueError` when no edge is given, or when a
    single edge is given without both `include_under` and `include_over`.
    """

    arg = rlz.column(rlz.any)
    buckets = rlz.tuple_of(rlz.scalar(rlz.any))
    closed = rlz.optional(rlz.isin({'left', 'right'}), default='left')
    close_extreme = rlz.optional(rlz.instance_of(bool), default=True)
    include_under = rlz.optional(rlz.instance_of(bool), default=False)
    include_over = rlz.optional(rlz.instance_of(bool), default=False)

    def __init__(self, buckets, include_under, include_over, **kwargs):
        """Check the edge count before delegating to the parent constructor."""
        n_edges = len(buckets)

        if n_edges == 0:
            raise ValueError('Must be at least one bucket edge')

        # A lone edge only separates the two open-ended buckets, so both of
        # them have to be enabled.
        if n_edges == 1 and not (include_under and include_over):
            raise ValueError('If one bucket edge provided, must have '
                             'include_under=True and include_over=True')

        super().__init__(
            buckets=buckets,
            include_under=include_under,
            include_over=include_over,
            **kwargs,
        )

    @property
    def nbuckets(self):
        """Bucket count: edges minus one, plus one per enabled open end."""
        open_ended = self.include_over + self.include_under
        return len(self.buckets) - 1 + open_ended
Beispiel #4
0
class VectorizedUDF(Value):
    """Value operation wrapping a Python function applied to column arguments."""

    # The Python callable (plain function or lambda).
    func = rlz.instance_of((FunctionType, LambdaType))
    # Column expressions handed to `func`.
    func_args = rlz.tuple_of(rlz.column(rlz.any))
    # TODO(kszucs): should rename these arguments to
    # input_dtypes and return_dtype
    input_type = rlz.tuple_of(rlz.datatype)
    return_type = rlz.datatype

    @property
    def inputs(self):
        """Alias for `func_args`."""
        args = self.func_args
        return args

    @property
    def output_dtype(self):
        """The declared `return_type` is the output dtype."""
        dtype = self.return_type
        return dtype

    def root_tables(self):
        """Return the distinct roots of all function arguments."""
        args = self.func_args
        return distinct_roots(*args)
Beispiel #5
0
class NotExistsSubquery(Value, _Negatable):
    """Boolean, columnar `NOT EXISTS`-style predicate over `foreign_table`."""

    foreign_table = rlz.table
    predicates = rlz.tuple_of(rlz.boolean)

    output_dtype = dt.boolean
    output_shape = rlz.Shape.COLUMNAR

    def negate(self) -> ExistsSubquery:
        """Return the `ExistsSubquery` built from this node's own arguments."""
        own_args = self.args
        return ExistsSubquery(*own_args)
Beispiel #6
0
class DropNa(TableNode, sch.HasSchema):
    """Drop null values in the table.

    `how` must be either ``'any'`` or ``'all'``; `subset` optionally names
    columns of `table` and defaults to an empty tuple.
    """

    table = rlz.table
    how = rlz.isin({'any', 'all'})
    subset = rlz.optional(
        rlz.tuple_of(rlz.column_from("table")),
        default=(),
    )

    @property
    def schema(self):
        """The schema is taken unchanged from the input table."""
        source = self.table
        return source.schema()
Beispiel #7
0
class ValueList(Value):
    """Data structure holding a list of value expressions."""

    # NOTE: this proxies the Value behaviour to the underlying values

    values = rlz.tuple_of(rlz.any)

    output_type = ir.ValueList
    # dtype and shape are both derived from the contained values.
    output_dtype = rlz.dtype_like("values")
    output_shape = rlz.shape_like("values")

    def root_tables(self):
        """Return the distinct roots of all contained values."""
        members = self.values
        return distinct_roots(*members)
Beispiel #8
0
class CategoryLabel(Value):
    """String-valued operation attaching `labels` to a category expression."""

    arg = rlz.category
    labels = rlz.tuple_of(rlz.instance_of(str))
    nulls = rlz.optional(rlz.instance_of(str))

    output_dtype = dt.string
    output_shape = rlz.shape_like("arg")

    def __init__(self, arg, labels, **kwargs):
        """Require exactly one label per category of `arg`.

        Raises
        ------
        ValueError
            If `len(labels)` differs from the cardinality of `arg`'s type.
        """
        n_categories = arg.type().cardinality
        if len(labels) != n_categories:
            raise ValueError(
                f'Number of labels must match number of categories: {n_categories}'
            )
        super().__init__(arg=arg, labels=labels, **kwargs)
Beispiel #9
0
class Aggregation(TableNode, sch.HasSchema):
    """Table node describing an aggregation over `table`.

    metrics : per-group scalar aggregates
    by : group expressions
    having : post-aggregation predicate

    TODO: not putting this in the aggregate operation yet
    where : pre-aggregation predicate
    """

    table = rlz.table
    metrics = rlz.optional(
        rlz.tuple_of(
            rlz.one_of(
                (
                    rlz.function_of(
                        "table",
                        output_rule=rlz.one_of(
                            (rlz.reduction, rlz.scalar(rlz.any))
                        ),
                    ),
                    rlz.reduction,
                    rlz.scalar(rlz.any),
                    rlz.tuple_of(rlz.scalar(rlz.any)),
                )
            ),
            flatten=True,
        ),
        default=(),
    )
    by = rlz.optional(
        rlz.tuple_of(
            rlz.one_of(
                (
                    rlz.function_of("table"),
                    rlz.column_from("table"),
                    rlz.column(rlz.any),
                )
            )
        ),
        default=(),
    )
    having = rlz.optional(
        rlz.tuple_of(
            rlz.one_of(
                (
                    rlz.function_of(
                        "table", output_rule=rlz.scalar(rlz.boolean)
                    ),
                    rlz.scalar(rlz.boolean),
                )
            ),
        ),
        default=(),
    )
    predicates = rlz.optional(rlz.tuple_of(rlz.boolean), default=())
    sort_keys = rlz.optional(
        rlz.tuple_of(
            rlz.one_of(
                (
                    rlz.column_from("table"),
                    rlz.function_of("table"),
                    rlz.sort_key(from_="table"),
                    rlz.pair(
                        rlz.one_of(
                            (
                                rlz.column_from("table"),
                                rlz.function_of("table"),
                                rlz.any,
                            )
                        ),
                        # Accepted spellings of the sort direction flag.
                        rlz.map_to(
                            {
                                True: True,
                                False: False,
                                "desc": False,
                                "descending": False,
                                "asc": True,
                                "ascending": True,
                                1: True,
                                0: False,
                            }
                        ),
                    ),
                )
            )
        ),
        default=(),
    )

    def __init__(self, table, metrics, by, having, predicates, sort_keys):
        """Check root-table consistency, then build the node.

        Raises
        ------
        com.RelationError
            If the metrics, group keys, having clauses or sort keys do not
            all share their roots with `table`.
        """
        from ibis.expr.analysis import shares_all_roots, shares_some_roots

        # All non-scalar refs originate from the input table
        dependent_exprs = metrics + by + having + sort_keys
        if not shares_all_roots(dependent_exprs, table):
            raise com.RelationError(
                "Selection expressions don't fully originate from "
                "dependencies of the table expression.")

        # invariant due to Aggregation and AggregateSelection requiring a valid
        # Selection
        assert all(shares_some_roots(pred, table) for pred in predicates)

        # Sort keys are dropped when there are no group expressions.
        effective_sort_keys = sort_keys if by else tuple()

        super().__init__(
            table=table,
            metrics=metrics,
            by=by,
            having=having,
            predicates=predicates,
            sort_keys=effective_sort_keys,
        )
        # Validate schema has no overlapping columns
        assert self.schema

    def blocks(self):
        """Always report True."""
        return True

    @util.deprecated(instead="instantiate Aggregation directly",
                     version="4.0.0")
    def substitute_table(self, table_expr):  # pragma: no cover
        """Build an Aggregation over `table_expr` with this node's metrics,
        group expressions and having clauses."""
        return Aggregation(
            table_expr,
            self.metrics,
            by=self.by,
            having=self.having,
        )

    @cached_property
    def schema(self):
        """Schema made of the group expressions followed by the metrics."""
        columns = []

        for expr in self.by + self.metrics:
            if not isinstance(expr, ir.DestructValue):
                columns.append((expr.get_name(), expr.type()))
                continue

            # If this is a destruct, then we destructure
            # the result and assign to multiple columns
            struct_type = expr.type()
            for field_name in struct_type.names:
                columns.append((field_name, struct_type[field_name]))

        names = [name for name, _ in columns]
        types = [dtype for _, dtype in columns]
        return sch.Schema(names, types)

    def sort_by(self, expr, sort_exprs):
        """Return a node ordered by `sort_exprs`.

        When every resolved key shares all roots with this node's table the
        keys are appended to a new Aggregation; otherwise `expr` is wrapped
        in a Selection carrying only the sort keys.
        """
        from ibis.expr.analysis import shares_all_roots

        new_keys = _maybe_convert_sort_keys([self.table, expr], sort_exprs)

        if not shares_all_roots(new_keys, self.table):
            return Selection(expr, [], sort_keys=new_keys)

        return Aggregation(
            self.table,
            self.metrics,
            by=self.by,
            having=self.having,
            predicates=self.predicates,
            sort_keys=self.sort_keys + tuple(new_keys),
        )
Beispiel #10
0
class Selection(TableNode, sch.HasSchema):
    """Table node carrying selections, predicates and sort keys over `table`.

    All three of `selections`, `predicates` and `sort_keys` are optional and
    default to empty tuples. With no selections the node exposes the schema
    of `table` unchanged.
    """

    table = rlz.table
    selections = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.table,
                rlz.column_from("table"),
                rlz.function_of("table"),
                rlz.any,
            ))),
        default=(),
    )
    predicates = rlz.optional(rlz.tuple_of(rlz.boolean), default=())
    sort_keys = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.column_from("table"),
                rlz.function_of("table"),
                rlz.sort_key(from_="table"),
                rlz.pair(
                    rlz.one_of((
                        rlz.column_from("table"),
                        rlz.function_of("table"),
                        rlz.any,
                    )),
                    # Accepted spellings of the sort direction flag.
                    rlz.map_to({
                        True: True,
                        False: False,
                        "desc": False,
                        "descending": False,
                        "asc": True,
                        "ascending": True,
                        1: True,
                        0: False,
                    }),
                ),
            ))),
        default=(),
    )

    def __init__(self, table, selections, predicates, sort_keys, **kwargs):
        """Check root-table consistency, then build the node.

        Raises
        ------
        com.RelationError
            If the selections or sort keys do not all share their roots with
            `table`, or if any predicate shares no root with `table`.
        """
        from ibis.expr.analysis import shares_all_roots, shares_some_roots

        if not shares_all_roots(selections + sort_keys, table):
            raise com.RelationError(
                "Selection expressions don't fully originate from "
                "dependencies of the table expression.")

        for predicate in predicates:
            if not shares_some_roots(predicate, table):
                raise com.RelationError(
                    "Predicate doesn't share any roots with table")

        super().__init__(
            table=table,
            selections=selections,
            predicates=predicates,
            sort_keys=sort_keys,
            **kwargs,
        )

        # Validate no overlapping columns in schema
        assert self.schema

    @cached_property
    def _projection(self):
        """A node of the same class keeping only `table` and `selections`."""
        return self.__class__(table=self.table, selections=self.selections)

    @cached_property
    def schema(self):
        """Schema of the node.

        Without selections this is the schema of `table`. Otherwise it is
        assembled from the selections in order: destruct columns contribute
        one column per struct field, value expressions contribute a single
        named column, and table expressions contribute all of their columns.
        """
        # Resolve schema and initialize
        if not self.selections:
            return self.table.schema()

        types = []
        names = []

        for projection in self.selections:
            if isinstance(projection, ir.DestructColumn):
                # If this is a destruct, then we destructure
                # the result and assign to multiple columns
                struct_type = projection.type()
                for name in struct_type.names:
                    names.append(name)
                    types.append(struct_type[name])
            elif isinstance(projection, ir.Value):
                names.append(projection.get_name())
                types.append(projection.type())
            elif isinstance(projection, ir.Table):
                schema = projection.schema()
                names.extend(schema.names)
                types.extend(schema.types)

        return sch.Schema(names, types)

    def blocks(self):
        """Return True exactly when the node has selections."""
        return bool(self.selections)

    @util.deprecated(instead="instantiate Selection directly", version="4.0.0")
    def substitute_table(self, table_expr):  # pragma: no cover
        """Build a Selection over `table_expr` with this node's selections."""
        return Selection(table_expr, self.selections)

    def root_tables(self):
        """A Selection is its own (only) root table."""
        return [self]

    @util.deprecated(instead="", version="4.0.0")
    def can_add_filters(self, wrapped_expr, predicates):  # pragma: no cover
        """Deprecated no-op; returns None."""
        pass

    @util.deprecated(instead="", version="4.0.0")
    def empty_or_equal(self, other) -> bool:  # pragma: no cover
        """Compare selections, sort keys and predicates with `other`.

        For each of the three fields the comparison passes when either side
        is empty, or when both sides have the same length and are pairwise
        equal according to `.equals`.

        Returns
        -------
        bool
            True if all three fields pass, False otherwise.
        """
        for field in ("selections", "sort_keys", "predicates"):
            selfs = getattr(self, field)
            others = getattr(other, field)

            # An empty side is compatible with anything.
            if not selfs or not others:
                continue

            # The pairwise comparison must be consumed with `all`; a bare
            # generator object is always truthy. The length check is needed
            # because `zip` stops at the shorter sequence.
            if len(selfs) != len(others) or not all(
                    a.equals(b) for a, b in zip(selfs, others)):
                return False
        return True

    @util.deprecated(instead="", version="4.0.0")
    def compatible_with(self, other):  # pragma: no cover
        """Return whether `other` is equal to, or compatible with, this node."""
        # self and other are equivalent except for predicates, selections, or
        # sort keys any of which is allowed to be empty. If both are not empty
        # then they must be equal
        if self.equals(other):
            return True

        if not isinstance(other, type(self)):
            return False

        return self.table.equals(other.table) and self.empty_or_equal(other)

    def aggregate(self, this, metrics, by=None, having=None):
        """Aggregate `this` by `by`, computing `metrics` filtered by `having`.

        With selections present a plain Aggregation over `this` is returned;
        otherwise the result is produced by `AggregateSelection`.
        """
        if len(self.selections) > 0:
            return Aggregation(this, metrics, by=by, having=having)

        helper = AggregateSelection(this, metrics, by, having)
        return helper.get_result()

    def sort_by(self, expr, sort_exprs):
        """Return a Selection ordered by `sort_exprs`.

        If this node has no selections and every resolved key shares all
        roots with `table`, the keys are appended to a copy of this node.
        Otherwise `expr` is wrapped in a new Selection that only sorts.
        """
        from ibis.expr.analysis import shares_all_roots

        resolved_keys = _maybe_convert_sort_keys([self.table, expr],
                                                 sort_exprs)
        if not self.blocks() and shares_all_roots(resolved_keys, self.table):
            return Selection(
                self.table,
                self.selections,
                predicates=self.predicates,
                sort_keys=self.sort_keys + tuple(resolved_keys),
            )

        return Selection(expr, [], sort_keys=resolved_keys)