def fire(self, expr):
    """Swap the inputs of a plain Join and restore column order on top.

    A join that this rule already produced carries a ``__swapped__``
    attribute and is returned untouched, as is anything that is not a
    Join.  Otherwise the join condition is re-indexed for the new input
    order, a new Join with right and left exchanged is built, and an
    Apply is placed above it that emits the columns in their original
    order and under their original names.

    :param expr: the operator the rule is being applied to
    :return: ``expr`` itself when the rule does not apply, otherwise an
        Apply whose input is the swapped Join
    """
    # don't allow swap-created join to be swapped
    if not isinstance(expr, algebra.Join) or hasattr(expr, '__swapped__'):
        return expr

    assert type(expr) is algebra.Join

    lhs_scheme = expr.left.scheme()
    rhs_scheme = expr.right.scheme()
    n_left = len(lhs_scheme)
    n_right = len(rhs_scheme)
    assert n_left + n_right == len(expr.scheme())

    # The Apply built from these emitters undoes the effect of the swap
    # on the scheme, so operators above are not affected: after the swap
    # the old left columns sit behind the n_right right-hand columns.
    emitters = []
    for i in range(n_left):
        emitters.append(
            (lhs_scheme.getName(i), UnnamedAttributeRef(n_right + i)))
    for i in range(n_right):
        emitters.append((rhs_scheme.getName(i), UnnamedAttributeRef(i)))

    # Entry k of emitters says where old column k lives in the swapped
    # join, which is exactly the mapping the condition needs.
    index_map = {old: emitter[1].position
                 for old, emitter in enumerate(emitters)}
    expression.reindex_expr(expr.condition, index_map)

    swapped = algebra.Join(expr.condition, expr.right, expr.left)
    swapped.__swapped__ = True
    return algebra.Apply(emitters=emitters, input=swapped)
def fire(self, op):
    """Prune input columns that a GroupBy or ProjectingJoin never reads.

    For a GroupBy, the columns read by the grouping and aggregate
    expressions are collected; if they are a strict subset of the input
    scheme, an Apply emitting just those columns is inserted below the
    GroupBy and the expressions are re-indexed to match.

    For a ProjectingJoin, the columns read by the join condition and the
    output columns are collected; an Apply is inserted on each side that
    has unread columns, and the condition and output columns are
    re-indexed to match.

    :param op: the operator the rule is being applied to
    :return: ``op`` (modified in place when pruning happened)
    """
    if isinstance(op, algebra.GroupBy):
        child = op.input
        child_scheme = child.scheme()
        grp_list = [to_unnamed_recursive(g, child_scheme)
                    for g in op.grouping_list]
        agg_list = [to_unnamed_recursive(a, child_scheme)
                    for a in op.aggregate_list]
        agg = [accessed_columns(a) for a in agg_list]
        pos = [g.position for g in grp_list]
        accessed = sorted(set(itertools.chain(*(agg + [pos]))))
        if not accessed:
            # Bug #207: COUNTALL() does not access any columns. So if the
            # query is just a COUNT(*), we would generate an empty Apply.
            # If this happens, just keep the first column of the input.
            accessed = [0]
        if len(accessed) != len(child_scheme):
            emitters = [(None, UnnamedAttributeRef(i)) for i in accessed]
            new_apply = algebra.Apply(emitters, child)
            # Old input position -> position in the pruned input.
            index_map = {a: i for (i, a) in enumerate(accessed)}
            for agg_expr in itertools.chain(grp_list, agg_list):
                expression.reindex_expr(agg_expr, index_map)
            op.grouping_list = grp_list
            op.aggregate_list = agg_list
            op.input = new_apply
            return op
    elif isinstance(op, algebra.ProjectingJoin):
        l_scheme = op.left.scheme()
        r_scheme = op.right.scheme()
        in_scheme = l_scheme + r_scheme
        condition = to_unnamed_recursive(op.condition, in_scheme)
        column_list = [to_unnamed_recursive(c, in_scheme)
                       for c in op.output_columns]

        # Read positions from the normalized column_list rather than the
        # raw op.output_columns, so that the lookup does not depend on the
        # original references already being positional.
        accessed = (accessed_columns(condition) |
                    set(c.position for c in column_list))
        if len(accessed) == len(in_scheme):
            return op

        accessed = sorted(accessed)
        left_len = len(l_scheme)

        left = [a for a in accessed if a < left_len]
        if len(left) < left_len:
            emits = [(None, UnnamedAttributeRef(a)) for a in left]
            op.left = algebra.Apply(emits, op.left)

        # Right-hand positions are relative to the right input.
        right = [a - left_len for a in accessed if a >= left_len]
        if len(right) < len(r_scheme):
            emits = [(None, UnnamedAttributeRef(a)) for a in right]
            op.right = algebra.Apply(emits, op.right)

        # Old combined position -> position in the pruned combined scheme.
        index_map = {a: i for (i, a) in enumerate(accessed)}
        expression.reindex_expr(condition, index_map)
        for c in column_list:
            expression.reindex_expr(c, index_map)

        op.condition = condition
        op.output_columns = column_list
        return op
    return op
def fire(self, op):
    """Push an Apply into the Apply or ProjectingJoin directly below it.

    * Apply over Apply: every column reference in the outer emitters is
      replaced by the inner emitter it refers to, yielding one Apply over
      the inner Apply's input.  The rule is then fired again on the
      result.
    * Apply over ProjectingJoin: the join's output columns are narrowed
      to the ones the Apply reads and the emitters are re-indexed.

    :param op: the operator the rule is being applied to
    :return: ``op`` when the rule does not apply, otherwise the rewritten
        Apply
    """
    if not isinstance(op, algebra.Apply):
        return op

    child = op.input

    if isinstance(child, algebra.Apply):
        in_scheme = child.scheme()
        child_in_scheme = child.input.scheme()

        names, emits = zip(*op.emitters)
        emits = [to_unnamed_recursive(e, in_scheme) for e in emits]
        child_emits = [to_unnamed_recursive(e[1], child_in_scheme)
                       for e in child.emitters]

        def convert(n):
            if isinstance(n, expression.UnnamedAttributeRef):
                # Substitute a private copy.  The same inner expression
                # may be referenced from several places; sharing a single
                # object would let a later in-place rewrite (such as
                # reindex_expr below) hit it more than once, and would
                # also modify the child Apply's own expression.
                n = copy.deepcopy(child_emits[n.position])
            else:
                n.apply(convert)
            return n
        emits = [convert(copy.deepcopy(e)) for e in emits]

        # Materialize the pairs: zip() is lazy on Python 3 and could only
        # be iterated once.
        new_apply = algebra.Apply(emitters=list(zip(names, emits)),
                                  input=child.input)
        return self.fire(new_apply)

    elif isinstance(child, algebra.ProjectingJoin):
        in_scheme = child.scheme()
        names, emits = zip(*op.emitters)
        emits = [to_unnamed_recursive(e, in_scheme) for e in emits]
        accessed = sorted(set(itertools.chain(*(accessed_columns(e)
                                                for e in emits))))
        # Old join output position -> position after narrowing.
        index_map = {a: i for (i, a) in enumerate(accessed)}
        child.output_columns = [child.output_columns[i] for i in accessed]
        for e in emits:
            expression.reindex_expr(e, index_map)

        # TODO(dhalperi) we may not need the Apply if all it did was rename
        # and/or select certain columns. Figure out these cases and omit
        # the Apply
        return algebra.Apply(emitters=list(zip(names, emits)), input=child)

    return op
def fire(self, expr):
    """Swap the inputs of a Join or CrossProduct, restoring column order.

    An operator that this rule already produced carries a ``__swapped__``
    attribute and is returned untouched, as is anything that is neither a
    Join nor a CrossProduct.  Otherwise a new operator of the matching
    kind is built with right and left exchanged (re-indexing the
    condition first in the Join case), and an Apply is placed above it
    that emits the columns in their original order and under their
    original names.

    :param expr: the operator the rule is being applied to
    :return: ``expr`` itself when the rule does not apply, otherwise an
        Apply whose input is the swapped operator
    """
    swappable = (algebra.Join, algebra.CrossProduct)

    # don't allow swap-created join to be swapped
    if not isinstance(expr, swappable) or hasattr(expr, '__swapped__'):
        return expr

    assert isinstance(expr, swappable)

    lhs_scheme = expr.left.scheme()
    rhs_scheme = expr.right.scheme()
    n_left = len(lhs_scheme)
    n_right = len(rhs_scheme)
    assert n_left + n_right == len(expr.scheme())

    # The Apply built from these emitters undoes the effect of the swap
    # on the scheme, so operators above are not affected.
    emitters = []
    for i in range(n_left):
        emitters.append(
            (lhs_scheme.getName(i), UnnamedAttributeRef(n_right + i)))
    for i in range(n_right):
        emitters.append((rhs_scheme.getName(i), UnnamedAttributeRef(i)))

    if isinstance(expr, algebra.Join):
        # Entry k of emitters says where old column k lives after the
        # swap; use that to rewrite the join condition.
        index_map = {old: emitter[1].position
                     for old, emitter in enumerate(emitters)}
        expression.reindex_expr(expr.condition, index_map)
        swapped = algebra.Join(expr.condition, expr.right, expr.left)
    else:
        swapped = algebra.CrossProduct(expr.right, expr.left)

    swapped.__swapped__ = True
    return algebra.Apply(emitters=emitters, input=swapped)
def fire(self, op):
    """Push an Apply into the Apply, ProjectingJoin or GroupBy below it.

    * Apply over Apply: every column reference in the outer emitters is
      replaced by the inner emitter it refers to, yielding one Apply over
      the inner Apply's input.  The rule is then fired again on the
      result.
    * Apply over ProjectingJoin: if the Apply only selects distinct
      columns, the selected columns keep left-side columns before
      right-side ones, and the resulting scheme equals the Apply's, the
      Apply is replaced by a new ProjectingJoin.  Otherwise the join's
      output columns are narrowed to those the Apply reads.
    * Apply over GroupBy: aggregates the Apply never reads are dropped
      from a copy of the GroupBy.

    :param op: the operator the rule is being applied to
    :return: ``op`` when nothing can be done, otherwise the rewritten
        operator
    """
    # Local import so this method does not rely on a module-level
    # ``import copy``.
    import copy

    if not isinstance(op, algebra.Apply):
        return op

    child = op.input

    if isinstance(child, algebra.Apply):
        emits = op.get_unnamed_emit_exprs()
        child_emits = child.get_unnamed_emit_exprs()

        def convert(n):
            if isinstance(n, expression.UnnamedAttributeRef):
                # Substitute a private copy.  The same inner expression
                # may be referenced from several places; sharing a single
                # object would let a later in-place rewrite (such as
                # reindex_expr below) hit it more than once.
                return copy.deepcopy(child_emits[n.position])
            n.apply(convert)
            return n
        emits = [convert(e) for e in emits]

        # Materialize the pairs: zip() is lazy on Python 3 and could only
        # be iterated once.
        new_apply = algebra.Apply(
            emitters=list(zip(op.get_names(), emits)),
            input=child.input)
        return self.fire(new_apply)

    elif isinstance(child, algebra.ProjectingJoin):
        emits = op.get_unnamed_emit_exprs()

        # If this apply is only AttributeRefs and the columns already
        # have the correct names, we can push it into the ProjectingJoin
        if (all(isinstance(e, expression.AttributeRef) for e in emits) and
                len(set(emits)) == len(emits)):
            new_cols = [child.output_columns[e.position] for e in emits]
            # We need to ensure that left columns come before right cols
            left_sch = child.left.scheme()
            right_sch = child.right.scheme()
            combined = left_sch + right_sch
            left_len = len(left_sch)
            new_cols = [expression.to_unnamed_recursive(e, combined)
                        for e in new_cols]
            # False for a left-side column, True for a right-side one;
            # the list is sorted exactly when no right column precedes a
            # left column.
            side = [e.position >= left_len for e in new_cols]
            if sorted(side) == side:
                # Left columns do come before right cols
                new_pj = algebra.ProjectingJoin(
                    condition=child.condition,
                    left=child.left,
                    right=child.right,
                    output_columns=new_cols)
                if new_pj.scheme() == op.scheme():
                    return new_pj

        accessed = sorted(set(itertools.chain(*(accessed_columns(e)
                                                for e in emits))))
        # Old join output position -> position after narrowing.
        index_map = {a: i for (i, a) in enumerate(accessed)}
        child.output_columns = [child.output_columns[i] for i in accessed]
        for e in emits:
            expression.reindex_expr(e, index_map)

        return algebra.Apply(emitters=list(zip(op.get_names(), emits)),
                             input=child)

    elif isinstance(child, algebra.GroupBy):
        emits = op.get_unnamed_emit_exprs()
        assert all(is_simple_agg_expr(agg) for agg in child.aggregate_list)

        accessed = sorted(set(itertools.chain(*(accessed_columns(e)
                                                for e in emits))))
        # Positions at or beyond num_grps are treated as aggregate
        # columns; lower positions are grouping columns.
        num_grps = len(child.grouping_list)
        accessed_aggs = [i for i in accessed if i >= num_grps]
        if len(accessed_aggs) == len(child.aggregate_list):
            return op

        # Old aggregate position -> position once unread aggregates are
        # removed.
        unused_map = {i: j + num_grps
                      for j, i in enumerate(accessed_aggs)}

        # copy the groupby operator so we can modify it
        newgb = child.__class__()
        newgb.copy(child)

        # remove aggregates that are projected out
        newgb.aggregate_list = [newgb.aggregate_list[i - num_grps]
                                for i in accessed_aggs]
        for e in emits:
            expression.reindex_expr(e, unused_map)

        return algebra.Apply(emitters=list(zip(op.get_names(), emits)),
                             input=newgb)

    return op
def descend_tree(op, cond):
    """Push a selection condition as far down an operator tree as it goes.

    :param op: The root of the operator tree to push into
    :type op: raco.algebra.Operator
    :param cond: The selection condition being pushed
    :type cond: raco.expression.expression
    :return: The operator tree, possibly modified.  When the condition
        cannot be pushed below ``op``, a new Select over ``op`` is
        returned with ``has_been_pushed`` set.
    """
    if isinstance(op, algebra.Select):
        # Selects are commutative, so simply continue underneath.
        op.input = PushSelects.descend_tree(op.input, cond)
        return op

    if isinstance(op, algebra.CompositeBinaryOperator):
        # Joins and cross-products.  Expressions containing random do not
        # commute across joins, so those stay above the operator.
        contains_random = any(isinstance(node, RANDOM)
                              for node in cond.walk())
        if not contains_random:
            left_width = len(op.left.scheme())
            touched = accessed_columns(cond)
            on_left = [col < left_width for col in touched]

            if all(on_left):
                # Only left-hand columns: push into the left sub-tree.
                op.left = PushSelects.descend_tree(op.left, cond)
                return op

            if not any(on_left):
                # Only right-hand columns: rebase column indexes, then
                # push into the right sub-tree.
                expression.rebase_expr(cond, left_width)
                op.right = PushSelects.descend_tree(op.right, cond)
                return op

            # Columns from both children: attempt to turn the condition
            # into an equijoin condition.
            eq_cols = PushSelects.is_column_equality_comparison(cond)
            if eq_cols:
                return op.add_equijoin_condition(eq_cols[0], eq_cols[1])

    elif isinstance(op, algebra.Apply):
        # List rather than set, to ensure a consistent order below.
        touched = list(accessed_columns(cond))
        touched_emits = [op.emitters[i][1] for i in touched]

        if all(isinstance(e, expression.AttributeRef)
               for e in touched_emits):
            # The condition only touches columns that are copied verbatim
            # from the child, so it can be pushed below the Apply.
            unnamed = expression.ensure_unnamed(touched_emits, op.input)
            index_map = dict(
                zip(touched, (ref.position for ref in unnamed)))
            expression.reindex_expr(cond, index_map)
            op.input = PushSelects.descend_tree(op.input, cond)
            return op

    elif isinstance(op, algebra.GroupBy):
        # List rather than set, to ensure a consistent order below.
        touched = list(accessed_columns(cond))
        num_keys = len(op.grouping_list)

        if all(col < num_keys for col in touched):
            touched_keys = [op.grouping_list[col] for col in touched]
            # The condition only touches grouping keys, which are copied
            # verbatim from the child, so it can be pushed.
            assert all(isinstance(e, expression.AttributeRef)
                       for e in op.grouping_list)
            unnamed = expression.ensure_unnamed(touched_keys, op.input)
            index_map = dict(
                zip(touched, (ref.position for ref in unnamed)))
            expression.reindex_expr(cond, index_map)
            op.input = PushSelects.descend_tree(op.input, cond)
            return op

    # Can't push any more: instantiate the selection here.
    pushed = algebra.Select(cond, op)
    pushed.has_been_pushed = True
    return pushed
def fire(self, op):
    """Prune input columns that a GroupBy or ProjectingJoin never reads.

    For a GroupBy, the columns read by the grouping, aggregate and
    updater expressions are collected; if they are a strict subset of
    the input scheme, an Apply emitting just those columns is inserted
    below the GroupBy and the expressions are re-indexed to match.

    For a ProjectingJoin, the columns read by the join condition and the
    output columns are collected; an Apply is inserted on each side that
    has unread columns, and the condition and output columns are
    re-indexed to match.

    :param op: the operator the rule is being applied to
    :return: ``op`` (modified in place when pruning happened)
    """
    if isinstance(op, algebra.GroupBy):
        child = op.input
        child_scheme = child.scheme()
        grp_list = op.get_unnamed_grouping_list()
        agg_list = op.get_unnamed_aggregate_list()

        up_names = [name for name, ex in op.updaters]
        up_list = op.get_unnamed_update_exprs()

        agg = [accessed_columns(a) for a in agg_list]
        up = [accessed_columns(a) for a in up_list]
        pos = [{g.position} for g in grp_list]

        accessed = sorted(set(itertools.chain(*(up + agg + pos))))
        if not accessed:
            # Bug #207: COUNTALL() does not access any columns. So if the
            # query is just a COUNT(*), we would generate an empty Apply.
            # If this happens, just keep the first column of the input.
            accessed = [0]
        if len(accessed) != len(child_scheme):
            emitters = [(None, UnnamedAttributeRef(i)) for i in accessed]
            new_apply = algebra.Apply(emitters, child)
            # Old input position -> position in the pruned input.
            index_map = {a: i for (i, a) in enumerate(accessed)}
            for agg_expr in itertools.chain(grp_list, agg_list, up_list):
                expression.reindex_expr(agg_expr, index_map)
            op.grouping_list = grp_list
            op.aggregate_list = agg_list
            op.updaters = list(zip(up_names, up_list))
            op.input = new_apply
            return op
    elif isinstance(op, algebra.ProjectingJoin):
        l_scheme = op.left.scheme()
        r_scheme = op.right.scheme()
        in_scheme = l_scheme + r_scheme
        condition = to_unnamed_recursive(op.condition, in_scheme)
        column_list = [to_unnamed_recursive(c, in_scheme)
                       for c in op.output_columns]

        # Read positions from the normalized column_list rather than the
        # raw op.output_columns, so that the lookup does not depend on the
        # original references already being positional.
        accessed = (accessed_columns(condition) |
                    set(c.position for c in column_list))
        if len(accessed) == len(in_scheme):
            return op

        accessed = sorted(accessed)
        left_len = len(l_scheme)

        left = [a for a in accessed if a < left_len]
        if len(left) < left_len:
            emits = [(None, UnnamedAttributeRef(a)) for a in left]
            op.left = algebra.Apply(emits, op.left)

        # Right-hand positions are relative to the right input.
        right = [a - left_len for a in accessed if a >= left_len]
        if len(right) < len(r_scheme):
            emits = [(None, UnnamedAttributeRef(a)) for a in right]
            op.right = algebra.Apply(emits, op.right)

        # Old combined position -> position in the pruned combined scheme.
        index_map = {a: i for (i, a) in enumerate(accessed)}
        expression.reindex_expr(condition, index_map)
        for c in column_list:
            expression.reindex_expr(c, index_map)

        op.condition = condition
        op.output_columns = column_list
        return op
    return op