Esempio n. 1
0
    def fire(self, expr):
        """Rewrite a Project as a Distinct over a column-selecting Apply.

        Anything that is not an ``algebra.Project`` is handed back untouched.
        """
        if isinstance(expr, algebra.Project):
            # Every projected column becomes an unnamed emitter of the Apply.
            emitters = [(None, column) for column in expr.columnlist]
            selected = algebra.Apply(emitters, expr.input)
            return algebra.Distinct(input=selected)

        return expr
Esempio n. 2
0
    def fire(self, expr):
        """Expand a ProjectingJoin into an Apply stacked on a plain Join.

        The Apply emits the join's ``output_columns`` (unnamed); any other
        operator is returned unchanged.
        """
        if not isinstance(expr, algebra.ProjectingJoin):
            return expr

        emitters = [(None, column) for column in expr.output_columns]
        plain_join = algebra.Join(expr.condition, expr.left, expr.right)
        return algebra.Apply(emitters, plain_join)
Esempio n. 3
0
    def fire(self, expr):
        """Move non-trivial GroupBy expressions into an Apply below it.

        Grouping terms that are not plain ``AttributeRef``s, and the inputs of
        aggregates rejected by ``is_simple_agg_expr``, are evaluated by a new
        Apply inserted under the GroupBy.  The GroupBy is mutated in place so
        that it only refers to columns of that Apply, and is then returned.
        Operators that are not a GroupBy, or that have nothing to move, are
        returned unchanged.
        """
        if not isinstance(expr, algebra.GroupBy):
            return expr

        # Remember each offending grouping term along with its slot so the
        # slot can be overwritten later.
        hard_groups = [(slot, term)
                       for slot, term in enumerate(expr.grouping_list)
                       if not isinstance(term, expression.AttributeRef)]
        hard_aggs = [agg for agg in expr.aggregate_list
                     if not is_simple_agg_expr(agg)]

        if not (hard_groups or hard_aggs):
            # Everything is already simple; keep the GroupBy as is.
            return expr

        # The Apply starts out as an identity over the child's columns ...
        width = len(expr.input.scheme())
        emitters = [(None, UnnamedAttributeRef(col)) for col in range(width)]

        # ... then gains one extra column per complex grouping term ...
        for slot, term in hard_groups:
            target = len(emitters)
            emitters.append((None, term))
            expr.grouping_list[slot] = UnnamedAttributeRef(target)

        # ... and one per complex aggregate input.  The aggregate objects are
        # edited in place, so expr.aggregate_list itself is never reassigned.
        for agg in hard_aggs:
            target = len(emitters)
            emitters.append((None, agg.input))
            agg.input = UnnamedAttributeRef(target)

        expr.input = algebra.Apply(emitters, expr.input)
        return expr
Esempio n. 4
0
    def fire(self, expr):
        """Replace a GroupBy that has no aggregates with a Distinct.

        When the grouping list permutes the input columns, an Apply that
        performs the same permutation is placed under the Distinct.
        """
        if not isinstance(expr, algebra.GroupBy):
            return expr
        if len(expr.aggregate_list) != 0:
            return expr

        group_cols = expr.get_unnamed_grouping_list()
        reorders = any(ref.position != idx
                       for idx, ref in enumerate(group_cols))

        if reorders:
            # Reproduce the GroupBy's column order with an Apply.
            source = algebra.Apply(
                emitters=[(None, ref) for ref in group_cols],
                input=expr.input)
        else:
            # Columns are already in order; the Distinct sits on the child.
            source = expr.input

        return algebra.Distinct(input=source)
Esempio n. 5
0
    def fire(self, expr):
        """Collapse repeated aggregates of a GroupBy into a single copy.

        If two or more aggregate expressions compare equal, only the first is
        kept in the GroupBy (mutated in place) and an Apply is placed on top
        that re-emits the original column layout, pointing duplicates at the
        surviving aggregate.  Without duplicates the operator is returned
        unchanged.
        """
        if not isinstance(expr, algebra.GroupBy):
            return expr

        aggs = expr.get_unnamed_aggregate_list()

        # duplicate_of[j] = i  : aggregate j repeats the earlier aggregate i
        # kept_slot[i]   = i'  : surviving aggregate i lands at slot i' of
        #                        the trimmed aggregate list
        duplicate_of = {}
        kept_slot = {}
        for i, agg in enumerate(aggs):
            if i in duplicate_of:
                continue
            kept_slot[i] = len(kept_slot)
            for j in range(i + 1, len(aggs)):
                if agg == aggs[j]:
                    duplicate_of[j] = i

        if not duplicate_of:
            # Every aggregate is unique; nothing to rewrite.
            return expr

        # Grouping columns pass straight through the new Apply.
        num_grps = len(expr.grouping_list)
        emitters = [(None, UnnamedAttributeRef(g)) for g in range(num_grps)]

        # One output column per original aggregate, each referring to the
        # slot of the aggregate that survives for it.
        for i in range(len(aggs)):
            survivor = i if i in kept_slot else duplicate_of[i]
            emitters.append(
                (None, UnnamedAttributeRef(kept_slot[survivor] + num_grps)))

        # Trim the GroupBy down to the surviving aggregates.
        expr.aggregate_list = [aggs[i] for i in sorted(kept_slot)]

        return algebra.Apply(emitters=emitters, input=expr)
Esempio n. 6
0
    def fire(self, expr):
        """Swap the two inputs of a Join or CrossProduct.

        The swapped operator is tagged with ``__swapped__`` so this rule does
        not fire on it again, and is wrapped in an Apply that restores the
        original column order and names.  Other operators, and operators that
        already carry the tag, are returned unchanged.
        """
        swappable = isinstance(expr, (algebra.Join, algebra.CrossProduct))
        if not swappable or hasattr(expr, '__swapped__'):
            return expr

        left_sch = expr.left.scheme()
        right_sch = expr.right.scheme()
        leftlen = len(left_sch)
        rightlen = len(right_sch)
        assert leftlen + rightlen == len(expr.scheme())

        # After the swap the right columns come first, so the original left
        # columns are found at offset ``rightlen`` and the right ones at 0.
        emitters = [(left_sch.getName(i), UnnamedAttributeRef(rightlen + i))
                    for i in range(leftlen)]
        emitters += [(right_sch.getName(i), UnnamedAttributeRef(i))
                     for i in range(rightlen)]

        if isinstance(expr, algebra.Join):
            # Rewrite the join condition (in place) for the new layout.
            index_map = {oldpos: ref.position
                         for oldpos, (_, ref) in enumerate(emitters)}
            expression.reindex_expr(expr.condition, index_map)
            swapped = algebra.Join(expr.condition, expr.right, expr.left)
        else:
            swapped = algebra.CrossProduct(expr.right, expr.left)

        swapped.__swapped__ = True

        return algebra.Apply(emitters=emitters, input=swapped)
Esempio n. 7
0
    def renameIDB(self, plan):
        """Wrap ``plan`` in an Apply that renames its columns for this term.

        Used when chaining multiple rules.  A position whose value reference
        in this term is a ``Var`` takes the variable's name; any other
        position (e.g. the constant in ``R(x, 3)``) keeps the column name the
        plan already had.

        Raises TypeError when the plan's arity differs from this term's.
        """
        # A recursive plan cannot report its scheme yet; fall back to one
        # derived from this term's value references.
        try:
            sch = plan.scheme()
        except algebra.RecursionError:
            sch = Scheme([make_attr(pos, ref, self.name)
                          for pos, ref in enumerate(self.valuerefs)])

        old_names = [name for (name, _) in sch]
        head_refs = list(self.valuerefs)

        if len(old_names) != len(head_refs):
            raise TypeError("Rule with head %s does not match Term %s" % (self.name, self))  # noqa

        mappings = []
        for pos, (new, old) in enumerate(zip(head_refs, old_names)):
            # Prefer the variable's name; keep the old name for non-variables.
            column_name = new.var if isinstance(new, Var) else old
            mappings.append((column_name, expression.UnnamedAttributeRef(pos)))

        # The renaming itself is implemented as an Apply over the plan.
        return algebra.Apply(mappings, plan)
Esempio n. 8
0
    def toRA(self, program):
        """Emit a relational plan for this rule.

        Outline of what the body does:

        1. If ``program`` reports that this rule's head is already being
           compiled, the rule is recursive: return an ``algebra.State`` bound
           to this rule's (lazily created) Fixpoint instead of a full plan.
        2. Split the body into Terms and binary boolean conditions and build
           a join graph whose edges carry the join conditions between terms.
        3. For each connected component, break cycles by removing edges
           (their conditions become Selects applied after the joins), then
           plan the joins; link all components with CrossProducts.
        4. Resolve the head's value references against the plan's scheme and
           emit either a group-by (when the head contains aggregates) or an
           Apply.
        5. If a Fixpoint was created while compiling, make it the plan root.

        Raises SyntaxError (via ``findvar``) when a head variable does not
        appear in the plan's scheme.
        """
        if program.compiling(self.head):
            # recursive rule
            if not self.fixpoint:
                self.fixpoint = algebra.Fixpoint()
            state = algebra.State(self.head.name, self.fixpoint)
            return state
        else:
            self.compiling = True

        # get the terms, like A(X,Y,"foo")
        terms = [c for c in self.body if isinstance(c, Term)]

        # get the conditions, like Z=3
        conditions = [c for c in self.body
                      if isinstance(c, expression.BinaryBooleanOperator)]
        if len(conditions) > 0:
            LOG.debug("found conditions: %s (type=%s) for program %s", conditions, type(conditions[0]), program)  # noqa
        else:
            LOG.debug("found conditions: %s (type=%s) for program %s", conditions, None, program)  # noqa

        # construct the join graph
        joingraph = nx.Graph()
        N = len(terms)
        for i, term1 in enumerate(terms):
            # store the order for explaining queries later -- not strictly
            # necessary
            # (it is also the sort key used below when picking a cycle edge)
            term1.originalorder = i

            # for each term, add it as a vertex,
            # and for each term it joins to, add an edge
            joingraph.add_node(term1, term=term1)
            # Only pairs (i, j) with j > i are examined, so each unordered
            # pair of terms is tested once.
            for j in range(i + 1, N):
                term2 = terms[j]
                LOG.debug("joinsto? %s %s", term1, term2)
                joins = term1.joinsto(term2, conditions)
                if joins:
                    # Fold all join predicates between the pair into a single
                    # conjunction stored on the edge.
                    conjunction = reduce(expression.AND, joins)
                    LOG.debug("add edge: %s --[%s]--> %s", term1, conjunction,
                              term2)
                    joingraph.add_edge(term1, term2, condition=conjunction,
                                       terms=(term1, term2))

        # find connected components (some non-determinism in the order here)
        comps = nx.connected_component_subgraphs(joingraph)

        component_plans = []

        # for each component, choose a join order
        for component in comps:
            # Edge data of every edge removed to break a cycle; re-applied as
            # Select operators once the component's joins are planned.
            cycleconditions = []
            # check for cycles
            cycles = nx.cycle_basis(component)
            while cycles:
                LOG.debug("found cycles: %s", cycles)

                # choose an edge to break the cycle
                # that edge will be a selection condition after the final join
                # oneedge = cycles[0][-2:]
                # try to make the chosen edge from cycle deterministic
                # NOTE(review): this takes the two vertices of the cycle with
                # the highest originalorder and treats them as an edge.
                # Nothing visible here guarantees those two vertices are
                # adjacent when the cycle has more than three vertices --
                # confirm.
                oneedge = sorted(cycles[0], key=lambda v: v.originalorder)[-2:]

                data = component.get_edge_data(*oneedge)
                LOG.debug("picked edge: %s, data: %s", oneedge, data)
                cycleconditions.append(data)
                component.remove_edge(*oneedge)
                # Recompute: removing one edge may leave other cycles.
                cycles = nx.cycle_basis(component)

            if len(component) == 1:
                # no joins to plan
                onlyterm = component.nodes()[0]
                plan = onlyterm.makeLeaf(conditions, program)
            else:
                LOG.debug("component: %s", component)
                # TODO: clean this up.
                # joingraph -> joinsequence -> relational plan
                planner = BFSLeftDeepPlanner(component)

                joinsequence = planner.chooseplan()
                LOG.debug("join sequence: %s", joinsequence)

                # create a relational plan, finally
                # pass in the conditions to make the leaves of the plan
                plan = joinsequence.makePlan(conditions, program)

            LOG.debug("cycleconditions: %s", cycleconditions)
            # joinsequence is only bound in the multi-term branch above; the
            # graph construction adds no self-edges, so a single-term
            # component has no removed edges and this loop does not run.
            for condition_info in cycleconditions:
                predicate = condition_info["condition"]
                # Rebinds the outer name ``terms``; the full term list is not
                # read again after this point.
                terms = condition_info["terms"]

                # change all UnnamedAttributes based on the
                # offset of its Term
                termsToOffset = dict((t, joinsequence.offset(t))
                                     for t in terms)

                LOG.debug("before add offset %s", predicate)
                predicate.add_offset_by_terms(termsToOffset)
                LOG.debug("after add offset %s", predicate)

                # create selections after each cycle
                plan = algebra.Select(predicate, plan)

            component_plans.append(plan)

        # link the components with a cross product
        plan = component_plans[0]
        for newplan in component_plans[1:]:
            plan = algebra.CrossProduct(plan, newplan)

        # Fall back to a scheme derived from the rule head when the plan
        # cannot provide one (signalled here by AttributeError).
        try:
            scheme = plan.scheme()
        except AttributeError:
            scheme = Scheme([make_attr(i, r, self.head.name) for i, r in enumerate(self.head.valuerefs)])  # noqa

        # Helper function for the next two steps (TODO: move this to a method?)
        # Maps a head variable to an UnnamedAttributeRef at its position in
        # ``scheme``; raises SyntaxError if the variable is absent.
        def findvar(variable):
            var = variable.var
            if var not in scheme:
                msg = "Head variable %s does not appear in rule body: %s" % (var, self)  # noqa
                raise SyntaxError(msg)
            return expression.UnnamedAttributeRef(scheme.getPosition(var))

        # Stack-based visitor that rebuilds an expression, replacing every
        # Var with the attribute reference returned by findvar.  Operands are
        # pushed by the visit_* callbacks and popped by their parent node.
        class FindVarExpressionVisitor(SimpleExpressionVisitor):
            def __init__(self):
                self.stack = []

            def getresult(self):
                # Exactly one rebuilt expression must remain.
                assert len(self.stack) == 1
                return self.stack.pop()

            def visit_unary(self, unaryexpr):
                inputexpr = self.stack.pop()
                self.stack.append(unaryexpr.__class__(inputexpr))

            def visit_binary(self, binaryexpr):
                # The right operand is popped first, then the left.
                right = self.stack.pop()
                left = self.stack.pop()
                self.stack.append(binaryexpr.__class__(left, right))

            def visit_zeroary(self, zeroaryexpr):
                self.stack.append(zeroaryexpr.__class__())

            def visit_literal(self, literalexpr):
                self.stack.append(literalexpr.__class__(literalexpr.value))

            def visit_nary(self, naryexpr):
                raise NotImplementedError(
                    "TODO: implement findvar visit of nary expression")

            def visit_attr(self, attr):
                assert False, \
                    "FindVar should not be used on expressions with attributes"

            def visit_Case(self, caseExpr):
                # NOTE(review): the message text reads "now"; "not" was
                # presumably intended -- confirm before changing the string.
                raise NotImplementedError("Case now implemented for Datalog?")

            def visit_Var(self, var):
                asAttr = findvar(var)
                self.stack.append(asAttr)

            # TODO: add the other aggregates
            # TODO and move aggregates to expression-visitor
            def visit_SUM(self, x):
                self.visit_unary(x)

            def visit_COUNT(self, x):
                self.visit_unary(x)

        # if this Rule includes a server specification, add a partition
        # operator
        if self.isParallel():
            if isinstance(self.head.serverspec, Broadcast):
                plan = algebra.Broadcast(plan)
            if isinstance(self.head.serverspec, PartitionBy):
                positions = [findvar(v)
                             for v in self.head.serverspec.variables]
                plan = algebra.PartitionBy(positions, plan)

        def toAttrRef(e):
            """
             Resolve variable references in the head; pass through aggregate
             expressions

             Returns the rebuilt expression produced by running a
             FindVarExpressionVisitor over ``e``.
             """
            LOG.debug("find reference for %s", e)
            visitor = FindVarExpressionVisitor()
            e.accept(visitor)
            return visitor.getresult()

        columnlist = [toAttrRef(v) for v in self.head.valuerefs]
        LOG.debug("columnlist for Project (or group by) is %s", columnlist)

        # If any of the expressions in the head are aggregate expression,
        # construct a group by
        if any(expression.expression_contains_aggregate(v)
               for v in self.head.valuerefs):
            emit_clause = [(None, a_or_g) for a_or_g in columnlist]
            # NOTE(review): this returns immediately, so the fixpoint
            # handling and the ``self.compiling = False`` reset below are
            # skipped on the aggregate path -- confirm this is intended.
            return raco.myrial.groupby.groupby(plan, emit_clause, [])
        elif any([not isinstance(e, Var) for e in self.head.valuerefs]):
            # If complex expressions in head, then precede Project with Apply
            # NOTE: should Apply actually just append emitters to schema
            # instead of doing column select?
            # we decided probably not in
            # https://github.com/uwescience/raco/pull/209
            plan = algebra.Apply([(None, e) for e in columnlist], plan)
        else:
            # otherwise, just build a Project
            # (as written, this constructs the same Apply as the branch
            # above, only with keyword arguments)
            plan = algebra.Apply(emitters=[(None, c) for c in columnlist],
                                 input=plan)

        # If we found a cycle, the "root" of the plan is the fixpoint operator
        if self.fixpoint:
            self.fixpoint.loopBody(plan)
            plan = self.fixpoint
            # Clear the Fixpoint so a later compilation starts fresh.
            self.fixpoint = None

        self.compiling = False

        return plan
Esempio n. 9
0
    def fire(self, op):
        """Split a GroupBy into local and remote aggregation stages.

        Only operators whose class is exactly ``algebra.GroupBy`` are touched;
        anything else (including operators this rule already converted) is
        returned unchanged.  Depending on the operator the result is:

        * a copy in ``self._only_fire_on_multi_key`` when that class is set
          and there are no grouping terms;
        * a copy in ``self._gb_class`` when ``check_no_shuffle`` holds;
        * a copy in ``self._gb_class`` passed through ``do_transfer`` when
          some aggregate is not decomposable;
        * otherwise a local GroupBy feeding a remote GroupBy, with an Apply on
          top if any aggregate supplies a finalizer.
        """
        if op.__class__ != algebra.GroupBy:
            return op

        def clone_as(gb_class):
            # Build an empty operator of the requested class and copy op's
            # contents into it.
            replacement = gb_class()
            replacement.copy(op)
            return replacement

        if self._only_fire_on_multi_key and len(op.grouping_list) == 0:
            return clone_as(self._only_fire_on_multi_key)

        # Already-shuffled data: neither shuffle nor decompose.
        if DecomposeGroupBy.check_no_shuffle(op):
            return clone_as(self._gb_class)

        # A single non-decomposable aggregate rules out the two-stage plan.
        if any(not x.is_decomposable() for x in op.aggregate_list):
            whole = clone_as(self._gb_class)
            DecomposeGroupBy.do_transfer(whole)
            return whole

        num_keys = len(op.grouping_list)

        local_emits, local_mods = [], []
        remote_emits, remote_mods = [], []
        final_exprs = []

        # Column at which the current aggregate's outputs begin in the local
        # and remote stages; both start right after the grouping columns.
        local_base = num_keys
        remote_base = num_keys
        needs_final_apply = False

        for agg in op.aggregate_list:
            # One decomposition may contribute several emitters per stage.
            state = agg.get_decomposable_state()
            assert state

            # --- local stage ---
            agg_local = state.get_local_emitters()
            local_emits.extend(agg_local)
            local_mods.extend(state.get_local_statemods())

            # --- remote stage: rebase away LocalAggregateOutput ---
            agg_remote = [rebase_local_aggregate_output(emit, local_base)
                          for emit in state.get_remote_emitters()]
            remote_emits.extend(agg_remote)

            for sm in state.get_remote_statemods():
                rebased_update = rebase_local_aggregate_output(
                    sm.update_expr, local_base)
                remote_mods.append(
                    StateVar(sm.name, sm.init_expr, rebased_update))

            # --- finalizer: rebase away RemoteAggregateOutput, or simply
            # forward the remote outputs when there is none ---
            finalizer = state.get_finalizer()
            if finalizer is None:
                final_exprs.extend(UnnamedAttributeRef(remote_base + k)
                                   for k in range(len(agg_remote)))
            else:
                needs_final_apply = True
                final_exprs.append(rebase_finalizer(finalizer, remote_base))

            local_base += len(agg_local)
            remote_base += len(agg_remote)

        # Local => Shuffle => Remote => (optional) Finalizer.
        local_gb = self._gb_class(op.grouping_list, local_emits, op.input,
                                  local_mods)

        key_refs = [UnnamedAttributeRef(k) for k in range(num_keys)]
        remote_gb = self._gb_class(key_refs, remote_emits, local_gb,
                                   remote_mods)

        DecomposeGroupBy.do_transfer(remote_gb)

        if not needs_final_apply:
            return remote_gb

        # Grouping columns pass through; finalizers follow them.
        passthrough = [(None, UnnamedAttributeRef(k)) for k in range(num_keys)]
        finalized = [(None, fx) for fx in final_exprs]
        return algebra.Apply(passthrough + finalized, remote_gb)
Esempio n. 10
0
    def fire(self, op):
        """Prune input columns that a GroupBy or ProjectingJoin never reads.

        GroupBy: the columns read by the grouping refs, aggregate expressions
        and update expressions are collected.  If they are fewer than the
        child's columns, an Apply selecting just those columns is inserted
        under the GroupBy and every expression is re-indexed to match.

        ProjectingJoin: the columns read by the join condition and by the
        output columns are collected.  Each join input that has unread
        columns gets an Apply keeping only the ones that are read, and the
        condition and output columns are re-indexed to match.

        The operator is modified in place and returned; operators of any
        other type are returned unchanged.
        """
        if isinstance(op, algebra.GroupBy):
            child = op.input
            child_scheme = child.scheme()
            grp_list = op.get_unnamed_grouping_list()
            agg_list = op.get_unnamed_aggregate_list()

            up_names = [name for name, ex in op.updaters]
            up_list = op.get_unnamed_update_exprs()

            agg = [accessed_columns(a) for a in agg_list]
            up = [accessed_columns(a) for a in up_list]
            pos = [{g.position} for g in grp_list]

            accessed = sorted(set(itertools.chain(*(up + agg + pos))))
            if not accessed:
                # Bug #207: COUNTALL() does not access any columns. So if the
                # query is just a COUNT(*), we would generate an empty Apply.
                # If this happens, just keep the first column of the input.
                accessed = [0]

            if len(accessed) == len(child_scheme):
                # Every input column is needed; nothing to prune.
                return op

            emitters = [(None, UnnamedAttributeRef(i)) for i in accessed]
            new_apply = algebra.Apply(emitters, child)
            # Old column position -> position in the pruned Apply's output.
            index_map = {a: i for (i, a) in enumerate(accessed)}
            for agg_expr in itertools.chain(grp_list, agg_list, up_list):
                expression.reindex_expr(agg_expr, index_map)
            op.grouping_list = grp_list
            op.aggregate_list = agg_list
            op.updaters = list(zip(up_names, up_list))
            op.input = new_apply
            return op

        if isinstance(op, algebra.ProjectingJoin):
            l_scheme = op.left.scheme()
            r_scheme = op.right.scheme()
            in_scheme = l_scheme + r_scheme
            condition = to_unnamed_recursive(op.condition, in_scheme)
            column_list = [to_unnamed_recursive(c, in_scheme)
                           for c in op.output_columns]

            # Read positions from the normalised column list rather than the
            # raw op.output_columns, so the positions always come from
            # references that were resolved against in_scheme.
            accessed = (accessed_columns(condition) |
                        set(c.position for c in column_list))
            if len(accessed) == len(in_scheme):
                return op

            accessed = sorted(accessed)
            left_len = len(l_scheme)

            # Positions below left_len belong to the left input ...
            left = [a for a in accessed if a < left_len]
            if len(left) < left_len:
                emits = [(None, UnnamedAttributeRef(a)) for a in left]
                op.left = algebra.Apply(emits, op.left)

            # ... the rest belong to the right input, shifted to be relative
            # to the right scheme.
            right = [a - left_len for a in accessed if a >= left_len]
            if len(right) < len(r_scheme):
                emits = [(None, UnnamedAttributeRef(a)) for a in right]
                op.right = algebra.Apply(emits, op.right)

            # Old combined position -> position after pruning both sides.
            index_map = {a: i for (i, a) in enumerate(accessed)}
            expression.reindex_expr(condition, index_map)
            for c in column_list:
                expression.reindex_expr(c, index_map)
            op.condition = condition
            op.output_columns = column_list
            return op

        return op
Esempio n. 11
0
    def fire(self, op):
        """Simplify an Apply according to the operator directly beneath it.

        * Apply over Apply: the child's expressions are substituted into the
          parent's and the two are merged; the rule then re-fires on the
          merged Apply.
        * Apply over ProjectingJoin: if the Apply merely selects distinct
          columns in left-before-right order and the schemes agree, it is
          folded into a new ProjectingJoin.  Otherwise the join's output
          columns are trimmed (in place) to those the Apply reads.
        * Apply over GroupBy: aggregates the Apply never reads are dropped
          from a copy of the GroupBy.

        Every other operator is returned unchanged.
        """
        if not isinstance(op, algebra.Apply):
            return op

        child = op.input

        if isinstance(child, algebra.Apply):
            emits = op.get_unnamed_emit_exprs()
            child_emits = child.get_unnamed_emit_exprs()

            def inline(node):
                # A column reference is replaced by the child expression that
                # produces that column; anything else is rewritten
                # recursively and kept.
                if isinstance(node, expression.UnnamedAttributeRef):
                    return child_emits[node.position]
                node.apply(inline)
                return node

            merged_emits = [inline(e) for e in emits]
            merged = algebra.Apply(
                emitters=zip(op.get_names(), merged_emits),
                input=child.input)
            return self.fire(merged)

        if isinstance(child, algebra.ProjectingJoin):
            emits = op.get_unnamed_emit_exprs()

            # The Apply can be absorbed only if it consists solely of
            # attribute references with no column emitted twice.
            plain_selection = (
                all(isinstance(e, expression.AttributeRef) for e in emits)
                and len(set(emits)) == len(emits))
            if plain_selection:
                left_sch = child.left.scheme()
                right_sch = child.right.scheme()
                combined = left_sch + right_sch
                left_len = len(left_sch)
                picked = [expression.to_unnamed_recursive(
                              child.output_columns[e.position], combined)
                          for e in emits]
                # False = column from the left input, True = from the right;
                # a sorted list means left columns precede right ones.
                from_right = [col.position >= left_len for col in picked]
                if from_right == sorted(from_right):
                    new_pj = algebra.ProjectingJoin(
                        condition=child.condition, left=child.left,
                        right=child.right, output_columns=picked)
                    # Column names must match too, or the Apply is needed.
                    if new_pj.scheme() == op.scheme():
                        return new_pj

            # Fall back to trimming the join's output to what is read.
            used = sorted(set(itertools.chain(*(accessed_columns(e)
                                                for e in emits))))
            index_map = {old: new for (new, old) in enumerate(used)}
            child.output_columns = [child.output_columns[i] for i in used]
            for e in emits:
                expression.reindex_expr(e, index_map)

            return algebra.Apply(emitters=zip(op.get_names(), emits),
                                 input=child)

        if isinstance(child, algebra.GroupBy):
            emits = op.get_unnamed_emit_exprs()
            assert all(is_simple_agg_expr(agg) for agg in child.aggregate_list)

            used = sorted(set(itertools.chain(*(accessed_columns(e)
                                                for e in emits))))
            num_grps = len(child.grouping_list)
            # Output positions at or past num_grps are aggregate columns.
            used_aggs = [i for i in used if i >= num_grps]
            if len(used_aggs) == len(child.aggregate_list):
                # Every aggregate is read; nothing to drop.
                return op

            # Old aggregate column -> its column once unread ones are gone.
            agg_remap = {old: rank + num_grps
                         for rank, old in enumerate(used_aggs)}

            # Work on a copy so the original GroupBy is left alone.
            newgb = child.__class__()
            newgb.copy(child)
            newgb.aggregate_list = [newgb.aggregate_list[i - num_grps]
                                    for i in used_aggs]
            for e in emits:
                expression.reindex_expr(e, agg_remap)

            return algebra.Apply(emitters=zip(op.get_names(), emits),
                                 input=newgb)

        return op