Exemple #1
0
    def translate(self, expr, scope, **kwargs):
        """
        Translate Ibis expression into a PySpark object.

        All translated expressions are cached within scope. If an expression is
        found within scope, it's returned. Otherwise, the it's translated and
        cached for future reference.

        :param expr: ibis expression
        :param scope: dictionary mapping from operation to translated result
        :param kwargs: parameters passed as keyword args (e.g. window)
        :return: translated PySpark DataFrame or Column object
        """
        # The operation node type the typed expression wraps
        op = expr.op()

        if op in scope:
            return scope[op]
        elif type(op) in self._registry:
            formatter = self._registry[type(op)]
            result = formatter(self, expr, scope, **kwargs)
            scope[op] = result
            return result
        else:
            raise com.OperationNotDefinedError(
                'No translation rule for {}'.format(type(op)))
Exemple #2
0
def execute_arbitrary_series_groupby(op, data, _, aggcontext=None, **kwargs):
    how = op.how
    if how is None:
        how = 'first'

    if how not in {'first', 'last'}:
        raise com.OperationNotDefinedError(
            'Arbitrary {!r} is not supported'.format(how))
    return aggcontext.agg(data, how)
Exemple #3
0
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
    if op.how == 'first':
        index = 0
    elif op.how == 'last':
        index = -1
    else:
        raise com.OperationNotDefinedError(
            'Arbitrary {!r} is not supported'.format(op.how))

    data = data[mask] if mask is not None else data
    return data.iloc[index]
Exemple #4
0
    def translate(self, expr, scope, **kwargs):
        # The operation node type the typed expression wraps
        op = expr.op()

        if type(op) in self._registry:
            formatter = self._registry[type(op)]
            return formatter(self, expr, scope, **kwargs)
        else:
            raise com.OperationNotDefinedError(
                'No translation rule for {}'.format(type(op))
            )
Exemple #5
0
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
    """
    Note: we cannot use the pandas version because Dask does not support .iloc
    """
    if op.how == 'first':
        index = 0
    elif op.how == 'last':
        index = -1
    else:
        raise com.OperationNotDefinedError(
            'Arbitrary {!r} is not supported'.format(op.how))

    data = data[mask] if mask is not None else data
    return data.loc[index]
Exemple #6
0
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
    """
    Note: we cannot use the pandas version because Dask does not support .iloc
    See https://docs.dask.org/en/latest/dataframe-indexing.html. .loc will
    only work if our index lines up with the label.
    """
    data = data[mask] if mask is not None else data
    if op.how == 'first':
        index = 0
    elif op.how == 'last':
        index = len(data) - 1  # TODO - computation
    else:
        raise com.OperationNotDefinedError(
            f'Arbitrary {op.how!r} is not supported')

    return data.loc[index]
Exemple #7
0
    def translate(self, expr):
        # The operation node type the typed expression wraps
        op = expr.op()

        if type(op) in self._rewrites:  # even if type(op) is in self._registry
            expr = self._rewrites[type(op)](expr)
            op = expr.op()

        # TODO: use op MRO for subclasses instead of this isinstance spaghetti
        if isinstance(op, ops.ScalarParameter):
            return self._trans_param(expr)
        elif isinstance(op, ops.TableNode):
            # HACK/TODO: revisit for more complex cases
            return '*'
        elif type(op) in self._registry:
            formatter = self._registry[type(op)]
            return formatter(self, expr)
        else:
            raise com.OperationNotDefinedError(
                f'No translation rule for {type(op)}')
Exemple #8
0
def execute_window_op(op,
                      data,
                      window,
                      scope=None,
                      aggcontext=None,
                      clients=None,
                      **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op,
                                     *clients,
                                     scope=scope,
                                     aggcontext=aggcontext,
                                     **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)
    (root, ) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, scope=scope, clients=clients, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(group_by,
                               map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data,
                                          order_by,
                                          group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    aggcontext = get_aggcontext(
        window,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
    )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
Exemple #9
0
def operation(op, expr):
    raise com.OperationNotDefinedError(f'No translation rule for {type(op)}')
Exemple #10
0
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(
            op, timecontext=timecontext, clients=clients
        )
        # timecontext is the original time context required by parent node
        # of this WindowOp, while adjusted_timecontext is the adjusted context
        # of this Window, since we are doing a manual execution here, use
        # adjusted_timecontext in later execution phases
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    data = execute(
        root_expr,
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )
    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(
            key,
            scope=scope,
            clients=clients,
            timecontext=adjusted_timecontext,
            aggcontext=aggcontext,
            **kwargs,
        )
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = []

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data,
                order_by,
                group_by=group_by,
                timecontext=adjusted_timecontext,
                **kwargs,
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, timecontext=adjusted_timecontext, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Here groupby object should be add to the corresponding node in scope
    # for execution, data will be overwrite to a groupby object, so we
    # force an update regardless of time context
    new_scope = scope.merge_scopes(
        [
            Scope({t: source}, adjusted_timecontext)
            for t in operand.op().root_tables()
        ],
        overwrite=True,
    )

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )
    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(
        result, data, ordering_keys, grouping_keys, adjusted_timecontext,
    )
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    # trim data to original time context
    series = trim_with_timecontext(series, timecontext)
    return series
Exemple #11
0
def execute_window_op(op,
                      data,
                      window,
                      scope=None,
                      aggcontext=None,
                      clients=None,
                      **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op,
                                     *clients,
                                     scope=scope,
                                     aggcontext=aggcontext,
                                     **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)
    (root, ) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, scope=scope, clients=clients, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(group_by,
                               map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data,
                                          order_by,
                                          group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (isinstance(operand.op(),
                     (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All))
          and ordering_keys):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series