Beispiel #1
0
    def _visit_filter_TopK(self, expr):
        """Rewrite a TopK filter as a semi join applied to ``self.table_set``.

        The rewrite proceeds in four steps:

        - aggregate the ``by`` metric grouped on ``arg``
        - sort on that metric (the ``False`` flag presumably requests
          descending order -- confirm against ``sort_by``)
        - limit to ``k`` rows
        - semi join ``self.table_set`` against the ranked rows on ``arg``

        Nothing is returned (implicitly ``None``); the visible effect is that
        ``self.table_set`` is rebound to the semi-joined table set.
        """
        topk = expr.op()
        tmp_metric = '__tmp__'
        ranked_metrics = [topk.by.name(tmp_metric)]

        arg_base = L.find_base_table(topk.arg)
        by_base = L.find_base_table(topk.by)

        # Aggregate directly on the base table when the key and the metric
        # share it; otherwise aggregate over the current table set.
        source = arg_base if arg_base.equals(by_base) else self.table_set
        grouped = source.aggregate(ranked_metrics, by=[topk.arg])

        ordered = grouped.sort_by([(tmp_metric, False)])
        top_rows = ordered.limit(topk.k)

        key_name = topk.arg.get_name()
        join_pred = (topk.arg == getattr(top_rows, key_name))
        self.table_set = self.table_set.semi_join(top_rows, [join_pred])
Beispiel #2
0
    def _visit_filter_TopK(self, expr):
        """Replace ``self.table_set`` with its semi join against the top-K keys.

        Top K is expressed as an aggregation of the ``by`` metric per ``arg``
        value, followed by a sort on that metric, a limit of ``k`` rows, and
        finally a semi join of the current table set with the limited result.

        The method has no return value (implicitly ``None``); it only rebinds
        ``self.table_set``.
        """
        node = expr.op()
        key_expr = node.arg
        rank_column = '__tmp__'

        named_metric = node.by.name(rank_column)

        key_table = L.find_base_table(key_expr)
        metric_table = L.find_base_table(node.by)

        if key_table.equals(metric_table):
            # Key and metric come from the same base table.
            aggregated = key_table.aggregate([named_metric], by=[key_expr])
        else:
            # Otherwise aggregate over the table set built so far.
            aggregated = self.table_set.aggregate([named_metric],
                                                  by=[key_expr])

        # NOTE(review): the False flag is presumably "not ascending" --
        # confirm against sort_by.
        sort_keys = [(rank_column, False)]
        winners = aggregated.sort_by(sort_keys).limit(node.k)

        matching_key = getattr(winners, key_expr.get_name())
        predicates = [key_expr == matching_key]
        self.table_set = self.table_set.semi_join(winners, predicates)
Beispiel #3
0
def value_counts(arg, metric_name="count"):
    """
    Compute a frequency table for this value expression

    Parameters
    ----------
    arg : value expression
      Expression to group by. If asking for its name raises
      ExpressionError, it is given the name "unnamed" first.
    metric_name : string, default "count"
      Name assigned to the count metric in the result

    Returns
    -------
    counts : TableExpr
      Aggregated table
    """
    source = _L.find_base_table(arg)
    count_metric = source.count().name(metric_name)

    group_key = arg
    try:
        group_key.get_name()
    except _com.ExpressionError:
        # The grouping key has no name yet; supply a placeholder.
        group_key = group_key.name("unnamed")

    grouped = source.group_by(group_key)
    return grouped.aggregate(count_metric)
Beispiel #4
0
def value_counts(arg, metric_name='count'):
    """
    Build a table of counts grouped by this value expression

    Parameters
    ----------
    arg : value expression
      Grouping key. When ``arg.get_name()`` raises ExpressionError the
      expression is renamed to 'unnamed' before grouping.
    metric_name : string, default 'count'
      Name applied to the count of the base table

    Returns
    -------
    counts : TableExpr
      Aggregated table
    """
    table = _L.find_base_table(arg)
    n_rows = table.count().name(metric_name)

    key = arg
    try:
        key.get_name()
    except _com.ExpressionError:
        # Unnamed expressions get a placeholder name for the group key.
        key = key.name('unnamed')

    by_key = table.group_by(key)
    return by_key.aggregate(n_rows)
Beispiel #5
0
def _reduction_to_aggregation(expr, agg_name='tmp'):
    """Aggregate ``expr``'s base table with ``expr`` as its only metric.

    The metric is named ``agg_name`` before it is passed to ``aggregate``.
    """
    base = L.find_base_table(expr)
    named_metric = expr.name(agg_name)
    return base.aggregate([named_metric])
Beispiel #6
0
def _adapt_expr(expr):
    """Adapt an expression into an executable form plus a result handler.

    Non-table expressions need to be adapted to some well-formed table
    expression, along with a way to adapt the results to the desired
    arity (whether array-like or scalar, for example). The canonical case
    is scalar values or arrays produced by some reductions (simple
    reductions, or distinct, say).

    Returns
    -------
    (adapted, handler) : tuple
      ``adapted`` is the expression to execute and ``handler`` is a
      one-argument callable applied to the results: the identity, a lookup
      of ``results['tmp'][0]``, or a lookup of a single named column.

    Raises
    ------
    NotImplementedError
      For scalar expressions that are not reductions yet have a base
      table, for expression lists mixing reductions and non-reductions,
      and for any expression that is not a table, scalar, list or array.
    """
    def as_is(x):
        return x

    def scalar_handler(results):
        return results['tmp'][0]

    def _column_getter(name):
        def column_handler(results):
            return results[name]

        return column_handler

    def _is_scalar_reduction(x):
        return isinstance(x, ir.ScalarExpr) and ops.is_reduction(x)

    # Table expressions are already well formed.
    if isinstance(expr, ir.TableExpr):
        return expr, as_is

    if isinstance(expr, ir.ScalarExpr):
        if _is_scalar_reduction(expr):
            aggregated = _reduction_to_aggregation(expr, agg_name='tmp')
            return aggregated, scalar_handler

        if L.find_base_table(expr) is None:
            # expr with no table refs
            return expr.name('tmp'), scalar_handler

        raise NotImplementedError(expr._repr())

    if isinstance(expr, ir.ExprList):
        members = expr.exprs()
        reduction_flags = [_is_scalar_reduction(m) for m in members]

        if all(reduction_flags):
            # Every member is a reduction: aggregate them together on the
            # base table of the first one.
            base = L.find_base_table(members[0])
            return base.aggregate(members), as_is

        if not any(reduction_flags):
            return expr, as_is

        # Mixed reductions and non-reductions are not supported.
        raise NotImplementedError(expr._repr())

    if isinstance(expr, ir.ArrayExpr):
        op = expr.op()

        if isinstance(op, ops.TableColumn):
            return op.table, _column_getter(op.name)

        # Something more complicated.
        source = L.find_source_table(expr)

        if isinstance(op, ops.DistinctArray):
            inner = op.arg
            try:
                column = op.arg.get_name()
            except Exception:
                column = 'tmp'

            projected = source.projection([inner.name(column)])
            return projected.distinct(), _column_getter(column)

        projected = source.projection([expr.name('tmp')])
        return projected, _column_getter('tmp')

    raise NotImplementedError
Beispiel #7
0
def _notall_expand(expr):
    """Expand the operation as ``arg.sum() < table.count()``.

    ``arg`` is the first argument of ``expr``'s operation and ``table`` is
    the base table found for it.
    """
    operand = expr.op().args[0]
    base = L.find_base_table(operand)
    operand_total = operand.sum()
    row_total = base.count()
    return operand_total < row_total
Beispiel #8
0
def _reduction_to_aggregation(expr, agg_name='tmp'):
    """Return an aggregation of the base table of ``expr``.

    The aggregation holds one metric: ``expr`` renamed to ``agg_name``.
    """
    source_table = L.find_base_table(expr)
    metrics = [expr.name(agg_name)]
    return source_table.aggregate(metrics)
Beispiel #9
0
def _adapt_expr(expr):
    """Turn ``expr`` into something executable and a handler for its results.

    Non-table expressions need to be adapted to some well-formed table
    expression, along with a way to adapt the results to the desired
    arity (whether array-like or scalar, for example). The canonical case
    is scalar values or arrays produced by some reductions (simple
    reductions, or distinct, say).

    Returns
    -------
    (adapted, handler) : tuple
      The expression to run and a one-argument callable to apply to the
      results. The handler is either the identity, a function returning
      ``results['tmp'][0]``, or a function returning one named column.

    Raises
    ------
    NotImplementedError
      When a scalar expression is not a reduction but has a base table,
      when an expression list mixes reductions with non-reductions, or
      when ``expr`` is none of table, scalar, list or array expression.
    """
    def as_is(x):
        return x

    def scalar_handler(results):
        return results['tmp'][0]

    def _make_column_handler(name):
        def column_handler(results):
            return results[name]
        return column_handler

    def _reduces_to_scalar(x):
        return isinstance(x, ir.ScalarExpr) and ops.is_reduction(x)

    def _adapt_scalar(scalar):
        if _reduces_to_scalar(scalar):
            return (_reduction_to_aggregation(scalar, agg_name='tmp'),
                    scalar_handler)
        if L.find_base_table(scalar) is not None:
            raise NotImplementedError(scalar._repr())
        # expr with no table refs
        return scalar.name('tmp'), scalar_handler

    def _adapt_list(expr_list):
        items = expr_list.exprs()
        flags = [_reduces_to_scalar(item) for item in items]
        if all(flags):
            # All reductions: aggregate on the first item's base table.
            first_base = L.find_base_table(items[0])
            return first_base.aggregate(items), as_is
        if any(flags):
            # Only some of the items are reductions.
            raise NotImplementedError(expr_list._repr())
        return expr_list, as_is

    def _adapt_array(array):
        op = array.op()
        if isinstance(op, ops.TableColumn):
            return op.table, _make_column_handler(op.name)

        # Something more complicated.
        origin = L.find_source_table(array)
        if not isinstance(op, ops.DistinctArray):
            plain = origin.projection([array.name('tmp')])
            return plain, _make_column_handler('tmp')

        distinct_arg = op.arg
        try:
            label = op.arg.get_name()
        except Exception:
            label = 'tmp'
        unique = origin.projection([distinct_arg.name(label)]).distinct()
        return unique, _make_column_handler(label)

    if isinstance(expr, ir.TableExpr):
        return expr, as_is
    if isinstance(expr, ir.ScalarExpr):
        return _adapt_scalar(expr)
    if isinstance(expr, ir.ExprList):
        return _adapt_list(expr)
    if isinstance(expr, ir.ArrayExpr):
        return _adapt_array(expr)
    raise NotImplementedError