Beispiel #1
0
    def _validate_join_predicates(self, predicates):
        for pred in predicates:
            op = pred.op()
            if not isinstance(op, ops.Equals):
                raise com.TranslationError('Non-equality join predicates are '
                                           'not supported')

            left_on, right_on = op.args
            if left_on.get_name() != right_on.get_name():
                raise com.TranslationError('Joining on different column names '
                                           'is not supported')
Beispiel #2
0
def selection(op, expr):
    """Translate a selection operation into a backend plan.

    The plan for ``op.table`` is built first, then projected onto the
    translated selections, then filtered by the conjunction of
    ``op.predicates`` (if any) and finally sorted by ``op.sort_keys``
    (if any).

    Parameters
    ----------
    op
        The selection operation; ``table``, ``selections``, ``predicates``
        and ``sort_keys`` are read from it.
    expr
        Not used by this function.

    Returns
    -------
    The plan object produced by chaining ``select``/``filter``/``sort``.

    Raises
    ------
    com.TranslationError
        If a selection is neither a table nor a value expression.
    """
    plan = translate(op.table)

    fields = []
    # With no explicit selections, fall back to projecting the whole table.
    for item in op.selections or [op.table]:
        # TODO(kszucs) it would be nice if we wouldn't need to handle the
        # specific cases in the backend implementations, we could add a
        # new operator which retrieves all of the TableExpr columns
        # (.e.g. Asterisk) so the translate() would handle this
        # automatically
        if isinstance(item, ir.TableExpr):
            # A table expands to one translated field per column.
            fields += [
                translate(item.get_column(name)) for name in item.columns
            ]
        elif isinstance(item, ir.ValueExpr):
            fields.append(translate(item))
        else:
            raise com.TranslationError(
                "DataFusion backend is unable to compile selection with "
                f"expression type of {type(item)}")

    plan = plan.select(*fields)

    if op.predicates:
        # AND all translated predicates together into a single filter.
        combined = functools.reduce(
            operator.and_,
            (translate(predicate) for predicate in op.predicates),
        )
        plan = plan.filter(combined)

    if op.sort_keys:
        plan = plan.sort(*(translate(key) for key in op.sort_keys))

    return plan
Beispiel #3
0
def filter_by_time_context(df: DataFrame,
                           timecontext: Optional[TimeContext] = None
                           ) -> DataFrame:
    """Filter a DataFrame by the given time context.

    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
    timecontext : TimeContext, optional
        A ``(begin, end)`` pair. Rows are kept when the ``TIME_COL`` column
        satisfies ``begin <= value < end``. When falsy, ``df`` is returned
        unchanged.

    Returns
    -------
    pyspark.sql.dataframe.DataFrame
        ``df`` itself when no time context is given, otherwise the filtered
        DataFrame.

    Raises
    ------
    com.TranslationError
        If a time context is given but ``df`` has no ``TIME_COL`` column.
    """
    if not timecontext:
        return df

    if TIME_COL in df.columns:
        # For py3.8, underlying spark type converter calls utctimetuple()
        # and will throw exception for Timestamp type if tz is set.
        # See https://github.com/pandas-dev/pandas/issues/32174
        # Dropping tz will cause spark to interpret begin, end with session
        # timezone & os env TZ. We convert Timestamp to pydatetime to
        # workaround.
        begin, end = timecontext
        return df.filter((F.col(TIME_COL) >= begin.to_pydatetime())
                         & (F.col(TIME_COL) < end.to_pydatetime()))
    else:
        # Adjacent literals are concatenated verbatim, so each fragment must
        # carry its own separating space.
        raise com.TranslationError(
            "'time' column missing in Dataframe {}. "
            "To use time context, a Timestamp column name 'time' must "
            "be present in the table. ".format(df))
Beispiel #4
0
    def _validate_join_predicates(self, predicates):
        for pred in predicates:
            op = pred.op()

            if (not isinstance(op, ops.Equals)
                    and not self._non_equijoin_supported):
                raise com.TranslationError('Non-equality join predicates, '
                                           'i.e. non-equijoins, are not '
                                           'supported')
Beispiel #5
0
def compile_variance(t, expr, scope, context=None, **kwargs):
    """Compile a variance expression using the matching Spark aggregate.

    The operation's ``how`` attribute selects the aggregate: ``'sample'``
    uses ``F.var_samp`` and ``'pop'`` uses ``F.var_pop``. The chosen
    function is handed to ``compile_aggregator`` together with ``t``,
    ``expr``, ``scope`` and ``context``; extra keyword arguments are
    accepted but not forwarded.

    Raises
    ------
    com.TranslationError
        If ``how`` is neither ``'sample'`` nor ``'pop'``.
    """
    how = expr.op().how

    # Validate up front so the selection below only sees known values.
    if how not in ('sample', 'pop'):
        raise com.TranslationError(
            "Unexpected 'how' in translation: {}".format(how))

    aggregate = F.var_samp if how == 'sample' else F.var_pop
    return compile_aggregator(t, expr, scope, aggregate, context)
Beispiel #6
0
    def _adapt_expr(expr):
        """Turn an expression into a table expression plus a result handler.

        Non-table expressions cannot be executed directly, so each one is
        mapped to a well-formed table expression together with a callable
        that reshapes the result to the desired arity (scalar or
        column-like).

        Returns
        -------
        tuple
            ``(table_or_expr, handler)``.

        Raises
        ------
        com.TranslationError
            If ``expr`` is not a table, scalar, analytic or column
            expression.
        """
        # Tables are already executable as-is.
        if isinstance(expr, ir.Table):
            return expr, toolz.identity

        if isinstance(expr, ir.Scalar):
            # The handler looks the value up by name, so one is required.
            if not expr.has_name():
                expr = expr.name('tmp')

            if not L.is_scalar_reduction(expr):
                return expr, _get_scalar(expr.get_name())

            aggregation = L.reduction_to_aggregation(expr)
            return aggregation, _get_scalar(expr.get_name())

        if isinstance(expr, ir.Analytic):
            return expr.to_aggregation(), toolz.identity

        if not isinstance(expr, ir.Column):
            raise com.TranslationError(
                f'Do not know how to execute: {type(expr)}')

        node = expr.op()
        if isinstance(node, ops.TableColumn):
            # A plain column reference: project it out of its own table.
            return node.table[[node.name]], _get_column(node.name)

        # Any other column expression is projected under its (possibly
        # freshly assigned) name.
        if not expr.has_name():
            expr = expr.name('tmp')
        return expr.to_projection(), _get_column(expr.get_name())
Beispiel #7
0
def filter_by_time_context(df: DataFrame,
                           timecontext: Optional[TimeContext] = None
                           ) -> DataFrame:
    """Filter a DataFrame by the given time context.

    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
    timecontext : TimeContext, optional
        A ``(begin, end)`` pair. Rows are kept when the ``TIME_COL`` column
        satisfies ``begin <= value < end``. When falsy, ``df`` is returned
        unchanged.

    Returns
    -------
    pyspark.sql.dataframe.DataFrame
        ``df`` itself when no time context is given, otherwise the filtered
        DataFrame.

    Raises
    ------
    com.TranslationError
        If a time context is given but ``df`` has no ``TIME_COL`` column.
    """
    if not timecontext:
        return df

    # NOTE(review): begin/end are handed to Spark unchanged; confirm that the
    # bound types used by callers are accepted by Spark's type converter.
    begin, end = timecontext
    if TIME_COL in df.columns:
        return df.filter((F.col(TIME_COL) >= begin) & (F.col(TIME_COL) < end))
    else:
        # Adjacent literals are concatenated verbatim, so each fragment must
        # carry its own separating space.
        raise com.TranslationError(
            "'time' column missing in Dataframe {}. "
            "To use time context, a Timestamp column name 'time' must "
            "be present in the table. ".format(df))
Beispiel #8
0
    def _adapt_expr(expr):
        """Turn an expression into a table expression plus a result handler.

        Non-table expressions cannot be executed directly, so each one is
        mapped to a well-formed table expression together with a callable
        that reshapes the result to the desired arity (scalar or
        column-like).

        Returns
        -------
        tuple
            ``(table_or_expr, handler)``.

        Raises
        ------
        NotImplementedError
            For a scalar expression that is not a scalar reduction but
            still refers to a base table.
        com.TranslationError
            If ``expr`` is not a table, scalar, analytic or column
            expression.
        """
        def _get_scalar(field):
            # Handler returning the first row of the ``field`` entry.
            def scalar_handler(results):
                return results[field][0]

            return scalar_handler

        def _get_column(name):
            # Handler returning the whole ``name`` entry.
            def column_handler(results):
                return results[name]

            return column_handler

        # Tables are already executable as-is.
        if isinstance(expr, ir.TableExpr):
            return expr, toolz.identity

        if isinstance(expr, ir.ScalarExpr):
            if L.is_scalar_reduction(expr):
                aggregation, field = L.reduction_to_aggregation(
                    expr, default_name='tmp'
                )
                return aggregation, _get_scalar(field)

            if ir.find_base_table(expr) is not None:
                raise NotImplementedError(repr(expr))

            # exprs with no table refs
            # TODO(phillipc): remove ScalarParameter hack
            if not isinstance(expr.op(), ops.ScalarParameter):
                return expr.name('tmp'), _get_scalar('tmp')

            param_name = expr.get_name()
            assert (
                param_name is not None
            ), f'scalar parameter {expr} has no name'
            return expr, _get_scalar(param_name)

        if isinstance(expr, ir.AnalyticExpr):
            return expr.to_aggregation(), toolz.identity

        if not isinstance(expr, ir.ColumnExpr):
            raise com.TranslationError(
                f'Do not know how to execute: {type(expr)}'
            )

        node = expr.op()
        if isinstance(node, ops.TableColumn):
            # A plain column reference: project it out of its own table.
            return node.table[[node.name]], _get_column(node.name)

        # Something more complicated: project from the source table.
        source = L.find_source_table(expr)

        if not isinstance(node, ops.DistinctColumn):
            return source.projection([expr.name('tmp')]), _get_column('tmp')

        # Distinct columns project the wrapped argument and de-duplicate,
        # reusing the argument's name when it can be obtained.
        inner = node.arg
        try:
            column_name = inner.get_name()
        except Exception:
            column_name = 'tmp'

        distinct_table = source.projection(
            [inner.name(column_name)]
        ).distinct()
        return distinct_table, _get_column(column_name)