def fire(self, expr):
    """Rewrite a Project as a Distinct over a column-selecting Apply.

    :param expr: the operator being visited
    :return: ``Distinct(Apply(...))`` when ``expr`` is an
        ``algebra.Project``; otherwise ``expr`` itself, untouched
    """
    if isinstance(expr, algebra.Project):
        # Emit each projected column (unnamed), then wrap the column
        # selection in a Distinct.
        emitters = [(None, col) for col in expr.columnlist]
        selected = algebra.Apply(emitters, expr.input)
        return algebra.Distinct(input=selected)

    # Anything that is not a Project passes through unchanged.
    return expr
def fire(self, expr):
    """Split a ProjectingJoin into a plain Join followed by an Apply.

    :param expr: the operator being visited
    :return: an ``Apply`` emitting ``expr.output_columns`` over a ``Join``
        built from the same condition and children when ``expr`` is an
        ``algebra.ProjectingJoin``; otherwise ``expr`` itself
    """
    if not isinstance(expr, algebra.ProjectingJoin):
        return expr

    emitters = [(None, col) for col in expr.output_columns]
    plain_join = algebra.Join(expr.condition, expr.left, expr.right)
    return algebra.Apply(emitters, plain_join)
def fire(self, expr):
    """Move non-trivial GroupBy expressions into an Apply below the GroupBy.

    Grouping terms that are not plain ``AttributeRef``s, and the inputs of
    aggregates for which ``is_simple_agg_expr`` is false, are evaluated by
    a new ``Apply`` inserted under the GroupBy; the GroupBy then refers to
    the extra columns by position.

    The GroupBy is modified in place: ``expr.grouping_list`` entries and
    the ``input`` of the affected aggregate objects are overwritten, and
    ``expr.input`` is replaced by the new Apply.

    :param expr: the operator being visited
    :return: ``expr`` (possibly mutated)
    """
    if not isinstance(expr, algebra.GroupBy):
        return expr

    # Grouping terms that are anything other than a bare AttributeRef,
    # remembered together with their index in the grouping list.
    hard_groups = [(pos, term)
                   for pos, term in enumerate(expr.grouping_list)
                   if not isinstance(term, expression.AttributeRef)]
    hard_aggs = [agg for agg in expr.aggregate_list
                 if not is_simple_agg_expr(agg)]

    # Nothing complicated anywhere: the existing GroupBy is fine as is.
    if not (hard_groups or hard_aggs):
        return expr

    # The Apply starts by copying every input column verbatim ...
    width = len(expr.input.scheme())
    emitters = [(None, UnnamedAttributeRef(col)) for col in range(width)]

    # ... then computes each complex grouping term in a new trailing
    # column, which the grouping list now references by position ...
    for pos, term in hard_groups:
        expr.grouping_list[pos] = UnnamedAttributeRef(len(emitters))
        emitters.append((None, term))

    # ... and finally does the same for the input of each complex
    # aggregate. The aggregate objects themselves are mutated, so
    # expr.aggregate_list does not need to be reassigned.
    for agg in hard_aggs:
        inner = agg.input
        agg.input = UnnamedAttributeRef(len(emitters))
        emitters.append((None, inner))

    # Splice the Apply in between the GroupBy and its former input.
    expr.input = algebra.Apply(emitters, expr.input)
    return expr
def fire(self, expr):
    """Turn a GroupBy that has no aggregates into a Distinct.

    If the grouping columns are exactly positions 0..k-1 in order, the
    Distinct is placed directly on the GroupBy's input. Otherwise an
    ``Apply`` emitting the grouping columns is inserted first so the
    column order produced by the GroupBy is mimicked.

    :param expr: the operator being visited
    :return: the replacement ``Distinct``, or ``expr`` when it is not an
        aggregate-free GroupBy
    """
    if not isinstance(expr, algebra.GroupBy) \
            or len(expr.aggregate_list) != 0:
        return expr

    keys = expr.get_unnamed_grouping_list()
    source = expr.input

    in_order = all(ref.position == idx for idx, ref in enumerate(keys))
    if not in_order:
        # Some reordering is done, so shim in an Apply to reproduce it.
        source = algebra.Apply(
            emitters=[(None, ref) for ref in keys],
            input=expr.input)

    return algebra.Distinct(input=source)
def fire(self, expr):
    """Compute each distinct aggregate of a GroupBy only once.

    Aggregates that compare equal (``==``) to an earlier aggregate are
    dropped from ``expr.aggregate_list`` (the GroupBy is mutated), and an
    ``Apply`` is placed on top that re-creates the original output columns
    by referencing the surviving aggregate more than once.

    :param expr: the operator being visited
    :return: ``expr`` when it is not a GroupBy or has no duplicate
        aggregates; otherwise the new ``Apply`` whose input is ``expr``
    """
    if not isinstance(expr, algebra.GroupBy):
        return expr

    aggs = expr.get_unnamed_aggregate_list()

    # duplicate aggregate index -> index of the earlier aggregate it repeats
    first_seen = {}
    # surviving aggregate index -> its slot in the trimmed aggregate list
    slot_of = {}

    for idx, agg in enumerate(aggs):
        if idx in first_seen:
            continue
        slot_of[idx] = len(slot_of)
        for later in range(idx + 1, len(aggs)):
            if agg == aggs[later]:
                first_seen[later] = idx

    if not first_seen:
        # All the aggregate expressions are unique, nothing to do.
        return expr

    # The Apply first passes the grouping columns through untouched ...
    num_grps = len(expr.grouping_list)
    emitters = [(None, UnnamedAttributeRef(g)) for g in range(num_grps)]

    # ... then emits one reference per ORIGINAL aggregate, pointing at the
    # slot of the surviving aggregate in the trimmed aggregate list.
    for idx in range(len(aggs)):
        kept = idx if idx in slot_of else first_seen[idx]
        emitters.append(
            (None, UnnamedAttributeRef(slot_of[kept] + num_grps)))

    # Drop the duplicates from the GroupBy itself.
    expr.aggregate_list = [aggs[idx] for idx in sorted(slot_of)]

    return algebra.Apply(emitters=emitters, input=expr)
def fire(self, expr):
    """Swap the two children of a Join or CrossProduct.

    The swapped operator is wrapped in an ``Apply`` that restores the
    original column order and names, so the scheme seen by operators above
    is unchanged. For a Join, ``expr.condition`` is reindexed in place to
    match the swapped column positions. The new operator is tagged with a
    ``__swapped__`` attribute so that it is never swapped back.

    :param expr: the operator being visited
    :return: the ``Apply`` over the swapped operator, or ``expr`` when it
        is neither a Join nor a CrossProduct or was itself created by a
        swap
    """
    swappable = isinstance(expr, (algebra.Join, algebra.CrossProduct))
    if not swappable or hasattr(expr, '__swapped__'):
        return expr

    left_scheme = expr.left.scheme()
    right_scheme = expr.right.scheme()
    n_left = len(left_scheme)
    n_right = len(right_scheme)
    assert n_left + n_right == len(expr.scheme())

    # After the swap the right child's columns come first, so an original
    # left column i is found at n_right + i and an original right column i
    # at i. Emitting them left-then-right undoes the swap for the parent.
    emitters = [(left_scheme.getName(i), UnnamedAttributeRef(n_right + i))
                for i in range(n_left)]
    emitters += [(right_scheme.getName(i), UnnamedAttributeRef(i))
                 for i in range(n_right)]

    if isinstance(expr, algebra.Join):
        # Old column position -> position in the swapped join's scheme.
        index_map = {old: ref.position
                     for old, (_, ref) in enumerate(emitters)}
        expression.reindex_expr(expr.condition, index_map)
        swapped = algebra.Join(expr.condition, expr.right, expr.left)
    else:
        swapped = algebra.CrossProduct(expr.right, expr.left)

    swapped.__swapped__ = True
    return algebra.Apply(emitters=emitters, input=swapped)
def renameIDB(self, plan):
    """Rename the attributes of the plan to match the current rule.

    Used when chaining multiple rules.

    :param plan: the plan whose columns are to be renamed
    :return: an ``Apply`` over ``plan`` that passes every column through
        by position under its new name
    :raises TypeError: if the plan's scheme and this term's ``valuerefs``
        differ in length
    """
    # Do we know the actual scheme? If the plan is recursive we don't, so
    # derive one from the rule head instead.
    try:
        sch = plan.scheme()
    except algebra.RecursionError:
        sch = Scheme([make_attr(i, r, self.name)
                      for i, r in enumerate(self.valuerefs)])

    old_names = [name for (name, _) in sch]
    head_exprs = list(self.valuerefs)

    if len(old_names) != len(head_exprs):
        raise TypeError("Rule with head %s does not match Term %s"
                        % (self.name, self))

    # Merge the old and new schemes, using the new name where we can:
    #  * a Var in the head supplies the column name;
    #  * anything else is an implicit selection condition, like R(x,3).
    #    Don't call the column '3'; keep whatever name the column had in
    #    the prior rule.
    mappings = [
        (new.var if isinstance(new, Var) else old,
         expression.UnnamedAttributeRef(i))
        for i, (new, old) in enumerate(zip(head_exprs, old_names))
    ]

    # Use an apply operator to implement the renaming
    return algebra.Apply(mappings, plan)
def toRA(self, program):
    """Emit a relational plan for this rule.

    If ``program.compiling(self.head)`` is true the rule is treated as
    recursive and a ``State`` operator bound to this rule's ``Fixpoint``
    is returned. Otherwise the body terms are planned (one plan per
    connected component of the join graph, linked by cross products), the
    head expressions are resolved against the resulting scheme, and, if
    ``self.fixpoint`` is set, the plan becomes the fixpoint's loop body
    and the fixpoint is returned.

    :param program: the program being compiled; passed on to the leaf and
        join planners
    :return: the root operator of the plan
    :raises SyntaxError: if a head variable does not appear in the body
    """
    if program.compiling(self.head):
        # recursive rule
        if not self.fixpoint:
            self.fixpoint = algebra.Fixpoint()
        return algebra.State(self.head.name, self.fixpoint)

    self.compiling = True

    # get the terms, like A(X,Y,"foo")
    body_terms = [c for c in self.body if isinstance(c, Term)]

    # get the conditions, like Z=3
    conditions = [c for c in self.body
                  if isinstance(c, expression.BinaryBooleanOperator)]
    first_type = type(conditions[0]) if len(conditions) > 0 else None
    LOG.debug("found conditions: %s (type=%s) for program %s",
              conditions, first_type, program)

    # construct the join graph: one vertex per term, one edge per pair of
    # terms that join
    joingraph = nx.Graph()
    for i, term1 in enumerate(body_terms):
        # store the order for explaining queries later (it is also used
        # below to pick cycle-breaking edges)
        term1.originalorder = i
        joingraph.add_node(term1, term=term1)

        for term2 in body_terms[i + 1:]:
            LOG.debug("joinsto? %s %s", term1, term2)
            joins = term1.joinsto(term2, conditions)
            if not joins:
                continue
            conjunction = reduce(expression.AND, joins)
            LOG.debug("add edge: %s --[%s]--> %s",
                      term1, conjunction, term2)
            joingraph.add_edge(term1, term2, condition=conjunction,
                               terms=(term1, term2))

    component_plans = []

    # find connected components (some non-determinism in the order here)
    # and choose a join order for each of them
    for component in nx.connected_component_subgraphs(joingraph):
        cycleconditions = []

        # Break cycles one edge at a time; each removed edge becomes a
        # selection condition applied after the final join.
        cycles = nx.cycle_basis(component)
        while cycles:
            LOG.debug("found cycles: %s", cycles)
            # try to make the chosen edge from the cycle deterministic
            # NOTE(review): the two highest-ordered vertices of a cycle
            # are not necessarily adjacent for cycles longer than three
            # vertices -- confirm this always names a real edge.
            oneedge = sorted(cycles[0],
                             key=lambda v: v.originalorder)[-2:]
            data = component.get_edge_data(*oneedge)
            LOG.debug("picked edge: %s, data: %s", oneedge, data)
            cycleconditions.append(data)
            component.remove_edge(*oneedge)
            cycles = nx.cycle_basis(component)

        if len(component) == 1:
            # no joins to plan
            onlyterm = component.nodes()[0]
            plan = onlyterm.makeLeaf(conditions, program)
        else:
            LOG.debug("component: %s", component)
            # TODO: clean this up.
            # joingraph -> joinsequence -> relational plan
            joinsequence = BFSLeftDeepPlanner(component).chooseplan()
            LOG.debug("join sequence: %s", joinsequence)

            # create a relational plan, finally
            # pass in the conditions to make the leaves of the plan
            plan = joinsequence.makePlan(conditions, program)

        LOG.debug("cycleconditions: %s", cycleconditions)
        for condition_info in cycleconditions:
            predicate = condition_info["condition"]
            edge_terms = condition_info["terms"]

            # change all UnnamedAttributes based on the offset of its Term
            termsToOffset = {t: joinsequence.offset(t) for t in edge_terms}
            LOG.debug("before add offset %s", predicate)
            predicate.add_offset_by_terms(termsToOffset)
            LOG.debug("after add offset %s", predicate)

            # create selections after each cycle
            plan = algebra.Select(predicate, plan)

        component_plans.append(plan)

    # link the components with a cross product
    plan = component_plans[0]
    for newplan in component_plans[1:]:
        plan = algebra.CrossProduct(plan, newplan)

    try:
        scheme = plan.scheme()
    except AttributeError:
        scheme = Scheme([make_attr(i, r, self.head.name)
                         for i, r in enumerate(self.head.valuerefs)])

    # Helper function for the next two steps
    # (TODO: move this to a method?)
    def findvar(variable):
        """Return a positional reference to ``variable`` in ``scheme``."""
        var = variable.var
        if var not in scheme:
            msg = "Head variable %s does not appear in rule body: %s" % (var, self)  # noqa
            raise SyntaxError(msg)
        return expression.UnnamedAttributeRef(scheme.getPosition(var))

    class FindVarExpressionVisitor(SimpleExpressionVisitor):
        """Rebuild an expression with every Var replaced via findvar."""

        def __init__(self):
            self.stack = []

        def getresult(self):
            assert len(self.stack) == 1
            return self.stack.pop()

        def visit_unary(self, unaryexpr):
            inputexpr = self.stack.pop()
            self.stack.append(unaryexpr.__class__(inputexpr))

        def visit_binary(self, binaryexpr):
            # operands were pushed left first, so pop right first
            right = self.stack.pop()
            left = self.stack.pop()
            self.stack.append(binaryexpr.__class__(left, right))

        def visit_zeroary(self, zeroaryexpr):
            self.stack.append(zeroaryexpr.__class__())

        def visit_literal(self, literalexpr):
            self.stack.append(literalexpr.__class__(literalexpr.value))

        def visit_nary(self, naryexpr):
            raise NotImplementedError(
                "TODO: implement findvar visit of nary expression")

        def visit_attr(self, attr):
            assert False, \
                "FindVar should not be used on expressions with attributes"

        def visit_Case(self, caseExpr):
            raise NotImplementedError("Case now implemented for Datalog?")

        def visit_Var(self, var):
            self.stack.append(findvar(var))

        # TODO: add the other aggregates
        # TODO and move aggregates to expression-visitor
        def visit_SUM(self, x):
            self.visit_unary(x)

        def visit_COUNT(self, x):
            self.visit_unary(x)

    # if this Rule includes a server specification, add a partition
    # operator
    if self.isParallel():
        serverspec = self.head.serverspec
        if isinstance(serverspec, Broadcast):
            plan = algebra.Broadcast(plan)
        if isinstance(self.head.serverspec, PartitionBy):
            positions = [findvar(v)
                         for v in self.head.serverspec.variables]
            plan = algebra.PartitionBy(positions, plan)

    def toAttrRef(e):
        """Return ``e`` rebuilt by FindVarExpressionVisitor, i.e. with the
        head's variable references resolved through ``findvar``."""
        LOG.debug("find reference for %s", e)
        visitor = FindVarExpressionVisitor()
        e.accept(visitor)
        return visitor.getresult()

    columnlist = [toAttrRef(v) for v in self.head.valuerefs]
    LOG.debug("columnlist for Project (or group by) is %s", columnlist)

    # If any of the expressions in the head are aggregate expressions,
    # construct a group by.
    # NOTE(review): this path returns without resetting self.compiling or
    # consulting self.fixpoint -- confirm that is intended.
    if any(expression.expression_contains_aggregate(v)
           for v in self.head.valuerefs):
        emit_clause = [(None, a_or_g) for a_or_g in columnlist]
        return raco.myrial.groupby.groupby(plan, emit_clause, [])

    # Otherwise the head is implemented with an Apply that does a column
    # select, whether or not it contains complex (non-Var) expressions.
    # NOTE: should Apply actually just append emitters to schema instead
    # of doing column select? we decided probably not in
    # https://github.com/uwescience/raco/pull/209
    plan = algebra.Apply(emitters=[(None, c) for c in columnlist],
                         input=plan)

    # If we found a cycle, the "root" of the plan is the fixpoint operator
    if self.fixpoint:
        self.fixpoint.loopBody(plan)
        plan = self.fixpoint
        self.fixpoint = None

    self.compiling = False
    return plan
def fire(self, op):
    """Decompose a GroupBy into a local and a remote aggregation phase.

    Only operators whose class is exactly ``algebra.GroupBy`` are touched;
    anything else (including operators already converted to the target
    class) is returned as is. The operator is copied into a single
    instance, rather than decomposed, when ``self._only_fire_on_multi_key``
    is set and there are no grouping terms, when
    ``DecomposeGroupBy.check_no_shuffle(op)`` holds, or when some aggregate
    is not decomposable.

    Otherwise the result is Local => Shuffle => Remote => (optional)
    Finalizer: a local ``self._gb_class`` over ``op.input``, a remote one
    over the local one, and, when any aggregate supplies a finalizer, an
    ``Apply`` on top.

    :param op: the operator being visited
    :return: the replacement operator, or ``op`` itself
    """
    # Punt if it's not a group by or we've already converted this into an
    # instance of self._gb_class
    if op.__class__ != algebra.GroupBy:
        return op

    if self._only_fire_on_multi_key and len(op.grouping_list) == 0:
        single = self._only_fire_on_multi_key()
        single.copy(op)
        return single

    # Do not shuffle and do not decompose if the data is shuffled already
    if DecomposeGroupBy.check_no_shuffle(op):
        single = self._gb_class()
        single.copy(op)
        return single

    # Bail early if we have any non-decomposable aggregates
    if any(not agg.is_decomposable() for agg in op.aggregate_list):
        single = self._gb_class()
        single.copy(op)
        DecomposeGroupBy.do_transfer(single)
        return single

    num_grouping_terms = len(op.grouping_list)

    local_emitters, local_statemods = [], []
    remote_emitters, remote_statemods = [], []
    finalizer_exprs = []
    requires_finalizer = False

    # Output positions where the current aggregate's local / remote
    # columns start; the grouping terms occupy the leading columns.
    local_pos = num_grouping_terms
    remote_pos = num_grouping_terms

    for agg in op.aggregate_list:
        # Multiple emit arguments can be associated with a single
        # decomposition rule; coalesce them all together.
        state = agg.get_decomposable_state()
        assert state

        # Local aggregate: emitters and state modifiers are taken as is.
        local_part = state.get_local_emitters()
        local_emitters.extend(local_part)
        local_statemods.extend(state.get_local_statemods())

        # Remote aggregate: expressions must be rebased to remove
        # instances of LocalAggregateOutput.
        remote_part = [rebase_local_aggregate_output(x, local_pos)
                       for x in state.get_remote_emitters()]
        remote_emitters.extend(remote_part)
        for sm in state.get_remote_statemods():
            rebased_update = rebase_local_aggregate_output(
                sm.update_expr, local_pos)
            remote_statemods.append(
                StateVar(sm.name, sm.init_expr, rebased_update))

        # Finalizer: rebased to remove instances of RemoteAggregateOutput;
        # without one, the remote outputs are passed straight through.
        finalizer = state.get_finalizer()
        if finalizer is None:
            finalizer_exprs.extend(
                UnnamedAttributeRef(remote_pos + offset)
                for offset in range(len(remote_part)))
        else:
            requires_finalizer = True
            finalizer_exprs.append(
                rebase_finalizer(finalizer, remote_pos))

        local_pos += len(local_part)
        remote_pos += len(remote_part)

    # Glue together the local and remote aggregates.
    local_gb = self._gb_class(op.grouping_list, local_emitters,
                              op.input, local_statemods)

    grouping_fields = [UnnamedAttributeRef(i)
                       for i in range(num_grouping_terms)]
    remote_gb = self._gb_class(grouping_fields, remote_emitters,
                               local_gb, remote_statemods)
    DecomposeGroupBy.do_transfer(remote_gb)

    if not requires_finalizer:
        return remote_gb

    # Pass through the grouping terms, then apply the finalizers.
    emitters = [(None, UnnamedAttributeRef(i))
                for i in range(num_grouping_terms)]
    emitters.extend((None, fx) for fx in finalizer_exprs)
    return algebra.Apply(emitters, remote_gb)
def fire(self, op):
    """Prepend Applys that drop input columns the operator never reads.

    * ``GroupBy``: the columns read by the grouping terms, the aggregates
      and the updaters are collected. If that is not every input column,
      an ``Apply`` keeping only those columns is inserted below the
      GroupBy and all of the GroupBy's expressions are reindexed.
    * ``ProjectingJoin``: the columns read by the join condition and by
      the output columns are collected. Each child that has unused
      columns gets an ``Apply`` keeping only the used ones, and the
      condition and output columns are reindexed.

    The operator is modified in place and returned; every other operator
    is returned untouched.

    :param op: the operator being visited
    :return: ``op``
    """
    if isinstance(op, algebra.GroupBy):
        child = op.input
        child_scheme = child.scheme()

        grp_list = op.get_unnamed_grouping_list()
        agg_list = op.get_unnamed_aggregate_list()
        up_names = [name for name, ex in op.updaters]
        up_list = op.get_unnamed_update_exprs()

        # Every input column position read by any of the expressions.
        used = set(g.position for g in grp_list)
        for ex in itertools.chain(agg_list, up_list):
            used.update(accessed_columns(ex))
        accessed = sorted(used)

        if not accessed:
            # Bug #207: COUNTALL() does not access any columns. So if the
            # query is just a COUNT(*), we would generate an empty Apply.
            # If this happens, just keep the first column of the input.
            accessed = [0]

        if len(accessed) != len(child_scheme):
            emitters = [(None, UnnamedAttributeRef(i)) for i in accessed]
            new_apply = algebra.Apply(emitters, child)

            # old column position -> position in the narrowed input
            index_map = {a: i for (i, a) in enumerate(accessed)}
            for ex in itertools.chain(grp_list, agg_list, up_list):
                expression.reindex_expr(ex, index_map)

            op.grouping_list = grp_list
            op.aggregate_list = agg_list
            op.updaters = list(zip(up_names, up_list))
            op.input = new_apply

        return op

    if isinstance(op, algebra.ProjectingJoin):
        l_scheme = op.left.scheme()
        r_scheme = op.right.scheme()
        in_scheme = l_scheme + r_scheme
        left_len = len(l_scheme)

        condition = to_unnamed_recursive(op.condition, in_scheme)
        column_list = [to_unnamed_recursive(c, in_scheme)
                       for c in op.output_columns]

        # Read positions from the resolved column_list rather than from
        # the raw op.output_columns: those are the objects reindexed below
        # and, presumably, the raw refs may be named and lack .position.
        accessed = (accessed_columns(condition)
                    | set(c.position for c in column_list))
        if len(accessed) == len(in_scheme):
            # Every input column is used; nothing to trim.
            return op

        accessed = sorted(accessed)

        left = [a for a in accessed if a < left_len]
        if len(left) < left_len:
            emits = [(None, UnnamedAttributeRef(a)) for a in left]
            op.left = algebra.Apply(emits, op.left)

        # Right-hand positions are relative to the right child's scheme.
        right = [a - left_len for a in accessed if a >= left_len]
        if len(right) < len(r_scheme):
            emits = [(None, UnnamedAttributeRef(a)) for a in right]
            op.right = algebra.Apply(emits, op.right)

        # old combined position -> position after trimming both children
        index_map = {a: i for (i, a) in enumerate(accessed)}
        expression.reindex_expr(condition, index_map)
        for c in column_list:
            expression.reindex_expr(c, index_map)

        op.condition = condition
        op.output_columns = column_list
        return op

    return op
def fire(self, op):
    """Push an Apply into, or merge it with, the operator below it.

    * child is an ``Apply``: the child's emit expressions are substituted
      into this Apply's expressions, giving a single Apply over the
      grandchild; the rule is then fired again on the result.
    * child is a ``ProjectingJoin``: if this Apply is only distinct
      attribute references whose columns keep left-before-right order and
      the resulting scheme equals ``op.scheme()``, it is folded into a new
      ProjectingJoin. Otherwise the join's ``output_columns`` are trimmed
      (in place) to the columns this Apply reads.
    * child is a ``GroupBy``: aggregates this Apply never reads are
      removed from a copy of the GroupBy.

    :param op: the operator being visited
    :return: the rewritten operator, or ``op`` when nothing applies
    """
    if not isinstance(op, algebra.Apply):
        return op

    child = op.input

    def columns_read(exprs):
        """Sorted, de-duplicated column positions accessed by exprs."""
        return sorted(set(itertools.chain.from_iterable(
            accessed_columns(e) for e in exprs)))

    if isinstance(child, algebra.Apply):
        child_emits = child.get_unnamed_emit_exprs()

        def inline(node):
            # A reference to a child output column is replaced by the
            # expression the child computes for that column; other nodes
            # are rewritten through their own apply().
            if isinstance(node, expression.UnnamedAttributeRef):
                return child_emits[node.position]
            node.apply(inline)
            return node

        merged = [inline(e) for e in op.get_unnamed_emit_exprs()]
        new_apply = algebra.Apply(emitters=zip(op.get_names(), merged),
                                  input=child.input)
        return self.fire(new_apply)

    if isinstance(child, algebra.ProjectingJoin):
        emits = op.get_unnamed_emit_exprs()

        # If this apply is only AttributeRefs and the columns already
        # have the correct names, we can push it into the ProjectingJoin
        only_refs = all(isinstance(e, expression.AttributeRef)
                        for e in emits)
        if only_refs and len(set(emits)) == len(emits):
            left_sch = child.left.scheme()
            combined = left_sch + child.right.scheme()
            left_len = len(left_sch)

            picked = [child.output_columns[e.position] for e in emits]
            picked = [expression.to_unnamed_recursive(c, combined)
                      for c in picked]

            # We need to ensure that left columns come before right cols
            from_right = [c.position >= left_len for c in picked]
            if from_right == sorted(from_right):
                new_pj = algebra.ProjectingJoin(
                    condition=child.condition,
                    left=child.left,
                    right=child.right,
                    output_columns=picked)
                if new_pj.scheme() == op.scheme():
                    return new_pj

        # Could not fold the Apply away: keep it, but drop the join output
        # columns it never reads.
        accessed = columns_read(emits)
        index_map = {a: i for (i, a) in enumerate(accessed)}
        child.output_columns = [child.output_columns[i] for i in accessed]
        for e in emits:
            expression.reindex_expr(e, index_map)

        return algebra.Apply(emitters=zip(op.get_names(), emits),
                             input=child)

    if isinstance(child, algebra.GroupBy):
        emits = op.get_unnamed_emit_exprs()
        assert all(is_simple_agg_expr(agg) for agg in child.aggregate_list)

        num_grps = len(child.grouping_list)
        # Output positions at or past num_grps are aggregate columns.
        accessed_aggs = [i for i in columns_read(emits) if i >= num_grps]
        if len(accessed_aggs) == len(child.aggregate_list):
            # Every aggregate is used; nothing to remove.
            return op

        # old aggregate output position -> position once the unused
        # aggregates are gone
        unused_map = {old: new + num_grps
                      for new, old in enumerate(accessed_aggs)}

        # copy the groupby operator so we can modify it
        newgb = child.__class__()
        newgb.copy(child)

        # remove aggregates that are projected out
        newgb.aggregate_list = [newgb.aggregate_list[i - num_grps]
                                for i in accessed_aggs]
        for e in emits:
            expression.reindex_expr(e, unused_map)

        return algebra.Apply(emitters=zip(op.get_names(), emits),
                             input=newgb)

    return op