def test_large_compile():
    """
    Tests that compiling a large expression tree finishes within a
    reasonable amount of time.

    The tree is built by repeatedly adding a literal ``dummy`` column and
    left-joining the table with itself on that column; the compile call
    must then finish in under 60 seconds.
    """
    # Function-local import: a monotonic, high-resolution clock is used for
    # the elapsed-time measurement so wall-clock adjustments cannot skew it.
    import time

    num_columns = 20
    num_joins = 7

    class MockCloudSpannerClient(cs_compile.CloudSpannerClient):
        def __init__(self):
            # Intentionally does nothing: the parent initializer is skipped.
            pass

    names = [f"col_{i}" for i in range(num_columns)]
    schema = ibis.Schema(names, ["string"] * num_columns)
    ibis_client = MockCloudSpannerClient()
    table = TableExpr(
        ops.SQLQueryResult("select * from t", schema, ibis_client))
    for _ in range(num_joins):
        table = table.mutate(dummy=ibis.literal(""))
        table = table.left_join(table, ["dummy"])[[table]]

    start = time.perf_counter()
    cs_compile.compile(table)
    elapsed_seconds = time.perf_counter() - start
    assert elapsed_seconds < 60
def __call__(self, expr: ir.TableExpr) -> ir.TableExpr:
    """Project ``expr`` onto the columns held in ``self.columns``.

    When the expression's operation is a join, ``X`` is bound to the join's
    left operand and ``Y`` to its right operand; otherwise ``X`` is bound
    to ``expr`` itself. Each column is resolved against that scope and the
    results (single values or lists) are flattened into one projection.
    """
    node = expr.op()
    if isinstance(node, ops.Join):
        scope = {X: node.left, Y: node.right}
    else:
        scope = {X: expr}

    selected = []
    for column in self.columns:
        # A resolved column may be one expression or several.
        selected.extend(ibis.util.promote_list(column.resolve(expr, scope)))
    return expr.projection(selected)
def __call__(self, expr: ir.TableExpr) -> ir.TableExpr:
    """Project ``expr`` onto the resolved, renamed mutation columns.

    Every entry of ``self.mutations`` is resolved with ``X`` bound to
    ``expr`` and given its mapping key as name; the resulting list is
    passed to ``expr.projection``.
    """
    named_columns = []
    for name, column in self.mutations.items():
        resolved = column.resolve(expr, {X: expr})
        named_columns.append(resolved.name(name))
    return expr.projection(named_columns)
def join(left, right, predicates=(), how='inner'):
    """
    Perform a relational join between two tables. Does not resolve resulting
    table schema.

    Parameters
    ----------
    left : TableExpr
    right : TableExpr
    predicates : join expression(s)
      A single expression is split with ``unwrap_ands`` before use.
    how : string, default 'inner'
      - 'inner': inner join
      - 'left': left join
      - 'outer': full outer join
      - 'semi' or 'left_semi': left semi join
      - 'anti': anti join
      Looked up case-insensitively in ``_join_classes``.

    Returns
    -------
    joined : TableExpr
      Note, schema is not materialized yet
    """
    # Resolve the join kind first so an unknown ``how`` fails early.
    join_op_type = _join_classes[how.lower()]
    conditions = (_L.unwrap_ands(predicates)
                  if isinstance(predicates, Expr) else predicates)
    return TableExpr(join_op_type(left, right, conditions))
def filter(table, predicates):
    """
    Select rows from table based on boolean expressions

    Parameters
    ----------
    predicates : boolean array expressions, or list thereof
      A single expression is split with ``unwrap_ands``; every predicate
      is bound to ``table`` and analytic expressions are converted with
      ``to_filter()``.

    Returns
    -------
    filtered_expr : TableExpr
    """
    if isinstance(predicates, Expr):
        predicates = _L.unwrap_ands(predicates)

    bound = [ir.bind_expr(table, p) for p in util.promote_list(predicates)]
    resolved = [
        p.to_filter() if isinstance(p, ir.AnalyticExpr) else p
        for p in bound
    ]
    return TableExpr(_L.apply_filter(table, resolved))
def aggregate(table, metrics=None, by=None, having=None, **kwds):
    """
    Aggregate a table with a given set of reductions, with grouping
    expressions, and post-aggregation filters.

    Parameters
    ----------
    table : table expression
    metrics : expression or expression list
      Never modified in place: when keyword metrics are supplied, a list
      or tuple is copied and a single expression is wrapped in a new list
      before the keyword metrics are appended.
    by : optional, default None
      Grouping expressions
    having : optional, default None
      Post-aggregation filters
    **kwds : additional metrics
      Each value is passed through ``table._ensure_expr`` and named after
      its keyword; they are appended in sorted keyword order.

    Returns
    -------
    agg_expr : TableExpr
    """
    if metrics is None:
        metrics = []

    if kwds:
        # Build a fresh list so the caller's argument is not mutated and a
        # tuple or single expression can still receive the keyword metrics.
        if isinstance(metrics, (list, tuple)):
            metrics = list(metrics)
        else:
            metrics = [metrics]
        for k, v in sorted(kwds.items()):
            metrics.append(table._ensure_expr(v).name(k))

    op = _ops.Aggregation(table, metrics, by=by, having=having)
    return TableExpr(op)
def table(schema, name=None):
    """
    Create an unbound Ibis table for creating expressions. Cannot be executed
    without being bound to some physical table.

    Useful for testing

    Parameters
    ----------
    schema : ibis Schema
      A list is converted with ``Schema.from_tuples``; any other
      non-Schema value is converted with ``Schema.from_dict``.
    name : string, default None
      Name for table

    Returns
    -------
    table : TableExpr
    """
    if isinstance(schema, Schema):
        resolved_schema = schema
    elif isinstance(schema, list):
        resolved_schema = Schema.from_tuples(schema)
    else:
        resolved_schema = Schema.from_dict(schema)

    return TableExpr(_ops.UnboundTable(resolved_schema, name=name))
def _table_materialize(table):
    """
    Force schema resolution for a joined table, selecting all fields from
    all tables.

    A table that already reports itself as materialized is returned
    unchanged; otherwise it is wrapped in a ``MaterializedJoin``.
    """
    if not table._is_materialized():
        return TableExpr(_ops.MaterializedJoin(table))
    return table
def handle_selection(self, ibis_table: TableExpr,
                     columns: List[Value]) -> TableExpr:
    """
    Project the table onto the selected columns

    :param ibis_table: Ibis expression table to manipulate
    :param columns: Selected columns; if any is named "*", the table is
        returned untouched
    :return: The projected TableExpr, or the input table when there is
        nothing to project
    """
    projected = []
    for selected in columns:
        if selected.get_name() == "*":
            # A star selection short-circuits any projection.
            return ibis_table
        projected.append(selected.get_value().name(selected.get_name()))

    return ibis_table.projection(projected) if projected else ibis_table
def union_distinct(self, expr1: TableExpr, expr2: TableExpr):
    """
    Return union distinct of two TableExpr

    :param expr1: Left TableExpr
    :param expr2: Right TableExpr
    :return: Result of ``expr1.union(expr2, distinct=True)``
    """
    distinct_union = expr1.union(expr2, distinct=True)
    return distinct_union
def flatten(table: ir.TableExpr):
    """Flatten both operands of the set operation behind `table`.

    The left and right operands of ``table.op()`` are each expanded with
    ``flatten_union`` and the two sequences are concatenated.

    Parameters
    ----------
    table : TableExpr

    Returns
    -------
    list
        Left-operand items followed by right-operand items.
    """
    node = table.op()
    left_items = flatten_union(node.left)
    right_items = flatten_union(node.right)
    return list(toolz.concatv(left_items, right_items))
def flatten_difference(table: ir.TableExpr):
    """Extract the operands of a difference query from `table`.

    If the operation behind `table` is not an ``ops.Difference``, a
    one-element list containing `table` is returned. Otherwise the left
    and right operands are each expanded with ``flatten_union`` and
    concatenated.

    Parameters
    ----------
    table : TableExpr

    Returns
    -------
    Iterable[Union[TableExpr]]
    """
    node = table.op()
    if not isinstance(node, ops.Difference):
        return [table]
    return toolz.concatv(flatten_union(node.left), flatten_union(node.right))
def _table_view(self):
    """
    Create a new table expression that is semantically equivalent to the
    current one, but is considered a distinct relation for evaluation
    purposes (e.g. in SQL).

    For doing any self-referencing operations, like a self-join, you will
    use this operation to create a reference to the current table
    expression.

    Returns
    -------
    expr : TableExpr
      Wraps a ``SelfReference`` operation over this expression.
    """
    self_reference = _ops.SelfReference(self)
    return TableExpr(self_reference)
def flatten_union(table: ir.TableExpr):
    """Extract all union queries from `table`.

    A non-union table yields a one-element list. A union is expanded
    recursively into its left items, the union's ``distinct`` flag, and
    its right items, in that order.

    Parameters
    ----------
    table : TableExpr

    Returns
    -------
    Iterable[Union[TableExpr, bool]]
    """
    node = table.op()
    if not isinstance(node, ops.Union):
        return [table]

    left_items = flatten_union(node.left)
    distinct_flag = [node.distinct]
    right_items = flatten_union(node.right)
    return toolz.concatv(left_items, distinct_flag, right_items)
def filter(table, predicates):
    """
    Select rows from table based on boolean expressions

    Parameters
    ----------
    predicates : boolean array expressions, or list thereof
      A single expression is split with ``unwrap_ands`` before being
      applied.

    Returns
    -------
    filtered_expr : TableExpr
    """
    conditions = (_L.unwrap_ands(predicates)
                  if isinstance(predicates, Expr) else predicates)
    return TableExpr(_L.apply_filter(table, conditions))
def _table_limit(table, n, offset=0):
    """
    Select the first n rows at beginning of table (may not be deterministic
    depending on implementation and presence of a sorting).

    Parameters
    ----------
    n : int
      Rows to include
    offset : int, default 0
      Number of rows to skip first

    Returns
    -------
    limited : TableExpr
    """
    return TableExpr(_ops.Limit(table, n, offset=offset))
def _table_union(left, right, distinct=False):
    """
    Form the table set union of two table expressions having identical
    schemas.

    Parameters
    ----------
    right : TableExpr
    distinct : boolean, default False
        Only union distinct rows not occurring in the calling table (this
        can be very expensive, be careful)

    Returns
    -------
    union : TableExpr
    """
    union_op = _ops.Union(left, right, distinct=distinct)
    return TableExpr(union_op)
def handle_filtering(
    self,
    ibis_table: TableExpr,
    where_expr: Tree,
    internal_transformer: InternalTransformer,
):
    """
    Apply the where clause, if there is one, to the table

    :param ibis_table: Ibis expression table to manipulate
    :param where_expr: Syntax tree containing where clause, or None
    :param internal_transformer: Transformer to transform the where clauses
    :return: Filtered TableExpr, or the input table when where_expr is None
    """
    if where_expr is None:
        return ibis_table

    where_expression: WhereExpression = internal_transformer.transform(
        where_expr)
    predicate = where_expression.value.get_value()
    return ibis_table.filter(predicate)
def projection(table, exprs):
    """
    Compute new table expression with the indicated column expressions from
    this table.

    Parameters
    ----------
    exprs : column expression, or string, or list of column expressions and
      strings. If strings passed, must be columns in the table already.
      A lone expression or string is treated as a one-element list.

    Returns
    -------
    projection : TableExpr
    """
    import ibis.expr.analysis as L

    if isinstance(exprs, (Expr,) + six.string_types):
        exprs = [exprs]

    ensured = [table._ensure_expr(item) for item in exprs]
    projector = L.Projector(table, ensured)
    return TableExpr(projector.get_result())
def flatten_union(table: ir.TableExpr):
    """Extract all union queries from `table`.

    A non-union table yields a one-element list. A union is expanded
    recursively into its left items, the union's ``distinct`` flag, and
    its right items, in that order.

    Parameters
    ----------
    table : TableExpr

    Returns
    -------
    Iterable[Union[TableExpr, bool]]
    """
    node = table.op()
    if not isinstance(node, ops.Union):
        return [table]

    # For some reason mypy considers `node.left` and `node.right` to be of
    # `Argument` type and fails the validation, while in `flatten` the
    # types are the same and it works -- hence the ignores below.
    left_items = flatten_union(node.left)  # type: ignore
    distinct_flag = [node.distinct]
    right_items = flatten_union(node.right)  # type: ignore
    return toolz.concatv(left_items, distinct_flag, right_items)
def _table_sort_by(table, sort_exprs):
    """
    Sort table by the indicated column expressions and sort orders
    (ascending/descending)

    Parameters
    ----------
    sort_exprs : sorting expressions
      Must be one of:
        - Column name or expression
        - Sort key, e.g. desc(col)
        - (column name, True (ascending) / False (descending))

    Examples
    --------
    sorted = table.sort_by([('a', True), ('b', False)])

    Returns
    -------
    sorted : TableExpr
    """
    return TableExpr(_ops.SortBy(table, sort_exprs))
def cross_join(*args, **kwargs):
    """
    Perform a cross join (cartesian product) amongst a list of tables, with
    optional set of prefixes to apply to overlapping column names

    Parameters
    ----------
    positional args: tables to join
    prefixes keyword : prefixes for each table
      Not yet implemented

    Examples
    --------
    >>> joined1 = ibis.cross_join(a, b, c, d, e)
    >>> joined2 = ibis.cross_join(a, b, c, prefixes=['a_', 'b_', 'c_'])

    Returns
    -------
    joined : TableExpr
      If prefixes not provided, the result schema is not yet materialized
    """
    # All arguments are forwarded unchanged to the CrossJoin operation.
    cross_op = _ops.CrossJoin(*args, **kwargs)
    return TableExpr(cross_op)
def get_columns_with_alias(table: TableExpr, alias: str):
    """
    Return every column of the table renamed to "<alias>.<column name>"

    :param table: Table whose columns are renamed
    :param alias: Prefix placed before each column name
    :return: List of renamed column expressions
    """
    aliased = []
    for column_name, column in zip(table.columns,
                                   table.get_columns(table.columns)):
        aliased.append(column.name(f"{alias}.{column_name}"))
    return aliased
def _handle_count_star(self, aggregate: Aggregate, relation: TableExpr):
    """
    Replace a CountStar aggregate value with the relation's count expression

    :param aggregate: Aggregate to inspect; modified in place when its value
        is a CountStar
    :param relation: Table whose ``count()`` replaces the CountStar value
    :return: The same aggregate object
    """
    if not isinstance(aggregate.value, CountStar):
        return aggregate
    aggregate.value = relation.count()
    return aggregate
def __call__(self, left: ir.TableExpr) -> ir.TableExpr:
    """Join ``left`` with ``self.right`` using the stored condition.

    ``self.on`` is resolved with ``X`` bound to ``left`` and ``Y`` bound
    to ``self.right``; the join kind comes from ``self.how``.
    """
    right = self.right
    predicate = self.on.resolve(left, {X: left, Y: right})
    joined = left.join(right, predicate, how=self.how)
    return joined
def __call__(self, expr: ir.TableExpr) -> ir.TableExpr:
    """Sort ``expr`` by the keys held in ``self.sort_keys``.

    Each key is resolved with ``X`` bound to ``expr``; a resolved key may
    be a single value or a list, and all of them are flattened into one
    list passed to ``expr.sort_by``.
    """
    resolved_keys = []
    for key in self.sort_keys:
        resolved_keys.extend(
            ibis.util.promote_list(key.resolve(expr, {X: expr})))
    return expr.sort_by(resolved_keys)
def resolve(self, expr: ir.TableExpr, scope: Scope) -> ir.TableExpr:
    """Resolve ``self.expr`` in ``scope`` and pass it to ``expr.head``."""
    head_argument = self.expr.resolve(expr, scope)
    return expr.head(head_argument)
def __call__(self, expr: ir.TableExpr) -> ir.TableExpr:
    """Apply ``self.mutations`` to ``expr`` through ``expr.mutate``.

    Each mutation is resolved with ``X`` bound to ``expr`` and forwarded
    as a keyword argument named after its mapping key.
    """
    resolved = {}
    for name, column in self.mutations.items():
        resolved[name] = column.resolve(expr, {X: expr})
    return expr.mutate(**resolved)
def handle_aggregation(
    self,
    aggregates: Dict[str, Aggregate],
    group_columns: List[GroupByColumn],
    table: TableExpr,
    having_expr: Tree,
    internal_transformer: InternalTransformer,
    selected_columns: List[Value],
):
    """
    Handles all aggregation operations when translating from dictionary info
    to dataframe

    Depending on which of ``aggregates`` and ``group_columns`` are non-empty,
    the table is made distinct (group by only), aggregated (aggregates
    only), or grouped and then aggregated (both). With neither, the table is
    returned unchanged.

    :param aggregates: Aggregates keyed by name
    :param group_columns: Columns named in the group by clause
    :param table: Ibis expression table to manipulate
    :param having_expr: Syntax tree of the having clause
    :param internal_transformer: Transformer handed to the having handler
    :param selected_columns: Columns named in the select clause
    :return: The resulting TableExpr
    :raises NotImplementedError: group by with having but no aggregates
    :raises InvalidQueryException: group by without aggregates where a
        selected column is not found among the group columns
    """
    # Lower-cased names of the selected columns, for membership tests below.
    selected_column_names = {
        column.get_name().lower()
        for column in selected_columns
    }
    aggregate_ibis_columns = self._get_aggregate_ibis_columns(
        aggregates, table)
    having = self._handle_having_expressions(
        having_expr,
        internal_transformer,
        table,
        aggregates,
        [group_column.get_name() for group_column in group_columns],
    )
    # Nothing was selected: let each group column use its own name.
    if group_columns and not selected_column_names:
        for group_column in group_columns:
            group_column.set_ibis_name_to_name()
    if group_columns and having is not None and not aggregates:
        raise NotImplementedError(
            "Group by, having, without aggregation not yet implemented")
    if group_columns and not aggregates:
        # NOTE(review): `column` is a name taken from get_name() while
        # `group_columns` holds GroupByColumn objects; this membership test
        # relies on how GroupByColumn compares with that name -- confirm.
        for column in [
                selected_column.get_name()
                for selected_column in selected_columns
        ]:
            if column not in group_columns:
                raise InvalidQueryException(
                    self.format_column_needs_agg_or_group_msg(column))
        table = table.distinct()
    elif aggregates and not group_columns:
        table = table.aggregate(aggregate_ibis_columns, having=having)
    elif aggregates and group_columns:
        table = table.group_by(
            [group_column.value for group_column in group_columns])
        if having is not None:
            table = table.having(having)
        table = table.aggregate(aggregate_ibis_columns)
    # Group columns that were not selected are dropped from the result.
    non_selected_columns = []
    if group_columns and aggregates:
        for group_column in group_columns:
            if group_column.get_name().lower() not in selected_column_names:
                non_selected_columns.append(group_column.group_by_name)
        if non_selected_columns:
            table = table.drop(non_selected_columns)
    return table
def _get_all_columns(table: TableExpr):
    """
    Return the column expressions for every column name of the table

    :param table: Table to read the columns from
    :return: Result of ``table.get_columns`` for all of ``table.columns``
    """
    all_columns = table.get_columns(table.columns)
    return all_columns