Exemple #1
0
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Evaluate one column expression of a projection against ``data``.

    Parameters
    ----------
    expr
        The column expression to compute. Its ``_safe_name`` is used as
        the name of the returned column when it is truthy.
    parent
        The projection being executed; only ``parent.table.op()`` is read
        here.
    data
        The frame holding the parent table's data. It is queried with
        ``in``, ``[]``, ``.loc``, ``.columns`` and ``.index``.
    scope
        The scope that per-root-table scopes are merged into before the
        expression is executed.
    timecontext
        Forwarded to every ``Scope`` created here and to ``execute``.
    **kwargs
        Forwarded unchanged to ``execute``.

    Returns
    -------
    For a plain ``ops.TableColumn`` the selected column of ``data``,
    renamed. Otherwise the value of ``coerce_to_output`` applied to the
    executed expression, using ``data.index`` as the index argument.

    Raises
    ------
    KeyError
        If ``expr`` is a plain column that is not in ``data`` and the
        parent table is not an ``ops.Join``.
    """
    output_name = expr._safe_name
    node = expr.op()
    parent_table = parent.table.op()

    if isinstance(node, ops.TableColumn):
        # Slightly faster path for simple column selection: no need to go
        # through ``execute`` at all.
        column = node.name
        assert isinstance(column, str)
        final_name = output_name or column

        if column in data:
            return data[column].rename(final_name)

        if isinstance(parent_table, ops.Join):
            # The column was not found under its own name; look it up
            # under the name extended with the join suffix for this op.
            suffix = util.get_join_suffix_for_op(node, parent_table)
            return data.loc[:, column + suffix].rename(final_name)

        raise KeyError(column)

    available_columns = frozenset(data.columns)

    def scope_for_root(root_table):
        # One single-entry scope per root table of the expression, mapping
        # the root to the (column-remapped) data it should be read from.
        remapped = remap_overlapping_column_names(
            parent_table, root_table, available_columns
        )
        return Scope(
            {root_table: map_new_column_names_to_data(remapped, data)},
            timecontext,
        )

    scope = scope.merge_scopes(map(scope_for_root, node.root_tables()))

    executed = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
    return coerce_to_output(executed, expr, data.index)
Exemple #2
0
def coerce_to_output(
    result: Any,
    expr: ir.Expr,
    index: Optional[pd.Index] = None,
) -> Union[dd.Series, dd.DataFrame]:
    """Cast ``result`` to either a Series or a DataFrame, renaming as needed.

    This is a reimplementation of ``coerce_to_output`` from the pandas
    backend that creates dask objects and adds special handling for
    ``dd.core.Scalar`` results.

    Parameters
    ----------
    result: Any
        The result to cast.
    expr: ibis.expr.types.Expr
        The expression associated with the result. Its name becomes the
        name of a Series output.
    index: pd.Index
        Optional. If passed, scalar results will be broadcasted according
        to the index.

    Returns
    -------
    result: A `dd.Series` or `dd.DataFrame`

    Raises
    ------
    ValueError
        If ``result`` is not a Series, ``expr`` is not a struct/destruct
        expression, and ``expr`` is not a reduction.

    Examples
    --------
    For dataframe outputs, see ``_coerce_to_dataframe``. Examples below use
    pandas objects for legibility, but functionality is the same on dask
    objects.

    >>> coerce_to_output(pd.Series(1), expr)
    0    1
    Name: result, dtype: int64
    >>> coerce_to_output(1, expr)
    0    1
    Name: result, dtype: int64
    >>> coerce_to_output(1, expr, [1,2,3])
    1    1
    2    1
    3    1
    Name: result, dtype: int64
    >>> coerce_to_output([1,2,3], expr)
    0    [1, 2, 3]
    Name: result, dtype: object
    """
    result_name = expr.get_name()

    # Struct-like expressions are expanded into one column per field.
    if isinstance(
        expr,
        (
            ir.DestructColumn,
            ir.StructColumn,
            ir.DestructScalar,
            ir.StructScalar,
        ),
    ):
        struct_type = expr.type()
        return _coerce_to_dataframe(
            result, struct_type.names, struct_type.types
        )

    if isinstance(result, (pd.Series, dd.Series)):
        # Series from https://github.com/ibis-project/ibis/issues/2711
        return result.rename(result_name)

    if not isinstance(expr.op(), ops.Reduction):
        raise ValueError(f"Cannot coerce_to_output. Result: {result}")

    if not isinstance(result, dd.core.Scalar):
        # Plain (non-dask) reduction value: let the pandas backend build
        # the Series, then wrap it in a single-partition dask object.
        pandas_output = pd_util.coerce_to_output(result, expr, index)
        return dd.from_pandas(pandas_output, npartitions=1)

    # Wrap the dask scalar in a series.
    scalar_dtype = _pandas_dtype_from_dd_scalar(result)
    length = len(index) if index is not None else 1
    meta = make_meta_series(dtype=scalar_dtype, name=result_name)
    delayed_series = _wrap_dd_scalar(result, result_name, length)
    # Specify `divisions` so that the created Dask object has known
    # divisions (to be concatenatable with Dask objects created using
    # `dd.from_pandas`).
    return dd.from_delayed(
        delayed_series,
        meta=meta,
        divisions=(0, length - 1),
    )
Exemple #3
0
def execute_aggregation_dataframe(
    op,
    data,
    metrics,
    by,
    having,
    predicates,
    sort_keys,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    """Execute an aggregation over ``data`` and return the result frame.

    Parameters
    ----------
    op
        The aggregation node. ``op.by`` decides whether ``data`` is
        grouped, and ``op.table.op()`` is the key under which the
        (possibly grouped) data is placed in scope for the metrics.
    data
        The frame to aggregate.
    metrics
        Non-empty collection of metric expressions; each one becomes a
        column of the result.
    by
        Grouping key expressions.
    having
        Predicates applied to the aggregated result.
    predicates
        Predicates applied to ``data`` before aggregating.
    sort_keys
        Must be empty; sorting is not implemented.
    scope
        Scope used to execute sub-expressions. ``merge_scope`` is called
        on it, so callers must pass a real scope object.
    timecontext
        Forwarded to ``execute`` and to the ``Scope`` created here.
    **kwargs
        Forwarded unchanged to ``execute``.

    Returns
    -------
    The metric columns concatenated side by side, with grouping keys
    restored as columns when ``by`` is non-empty and rows filtered by
    ``having`` when it is non-empty.

    Raises
    ------
    NotImplementedError
        If ``sort_keys`` is non-empty.
    ValueError
        If ``having`` is given without any grouping key.
    """
    assert metrics, 'no metrics found during aggregation execution'

    if sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented')

    def evaluate(sub_expr, within):
        # Run a sub-expression in the given scope with the shared
        # time context and passthrough keyword arguments.
        return execute(
            sub_expr, scope=within, timecontext=timecontext, **kwargs
        )

    if predicates:
        # AND all pre-aggregation predicates together into one row mask.
        row_mask = functools.reduce(
            operator.and_,
            (evaluate(pred, scope) for pred in predicates),
        )
        data = data.loc[row_mask]

    # Maps a grouping op's name to the name of its expression, used to
    # relabel the result columns at the end.
    renames: Dict[str, str] = {}
    source = data

    if op.by:
        key_pairs = [(key_expr, key_expr.op()) for key_expr in by]

        grouping_keys = []
        for key_expr, key_node in key_pairs:
            if isinstance(key_node, ops.TableColumn):
                # Plain columns can be grouped on by name.
                grouping_keys.append(key_node.name)
            else:
                # Anything else is computed first and grouped on by value.
                computed = evaluate(key_expr, scope)
                grouping_keys.append(computed.rename(key_expr.get_name()))

        for key_expr, key_node in key_pairs:
            if hasattr(key_node, 'name'):
                renames[key_node.name] = key_expr.get_name()

        source = data.groupby(grouping_keys)

    # Metrics (and `having` predicates) read the aggregated table from
    # this scope, which binds the table op to the grouped/ungrouped data.
    agg_scope = scope.merge_scope(
        Scope({op.table.op(): source}, timecontext)
    )

    pieces = [
        coerce_to_output(evaluate(metric, agg_scope), metric)
        for metric in metrics
    ]
    result = pd.concat(pieces, axis=1)

    # If grouping, need a reset to get the grouping key back as a column
    if by:
        result = result.reset_index()

    result.columns = [renames.get(label, label) for label in result.columns]

    if not having:
        return result

    # .having(...) is only accessible on groupby, so this should never
    # raise
    if not by:
        raise ValueError(
            'Filtering out aggregation values is not allowed without at '
            'least one grouping key')

    # TODO(phillipc): Don't recompute identical subexpressions
    keep = functools.reduce(
        operator.and_,
        (evaluate(cond, agg_scope) for cond in having),
    )
    assert len(keep) == len(
        result), 'length of predicate does not match length of DataFrame'
    return result.loc[keep.values]