Example 1
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented')

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (execute(p, scope=scope, **kwargs) for p in predicates),
        )
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by)))
        grouping_keys = [
            by_op.name if isinstance(by_op, ops.TableColumn) else execute(
                by, scope=scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update((by_op.name, by.get_name())
                       for by, by_op in grouping_key_pairs
                       if hasattr(by_op, 'name'))
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(execute(metric, scope=new_scope, **kwargs),
                  name=metric.get_name()) for metric in op.metrics
    ]

    # group by always needs a reset to get the grouping key back as a column
    result = pd.concat(pieces, axis=1).reset_index()
    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key')

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (execute(having, scope=new_scope, **kwargs)
             for having in op.having),
        )
        assert len(predicate) == len(
            result), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]
    return result
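
This dispatch rule is essentially a pandas groupby/aggregate followed by a SQL-style HAVING filter. A minimal pandas-only sketch of the same shape (column and metric names here are hypothetical):

import functools
import operator

import pandas as pd

# Group, compute each metric as a named Series, concat, and reset the
# index to recover the grouping key as a column -- mirroring the rule.
df = pd.DataFrame({'key': ['a', 'a', 'b'], 'value': [1, 2, 3]})
grouped = df.groupby('key')
pieces = [
    pd.Series(grouped['value'].sum(), name='total'),
    pd.Series(grouped['value'].mean(), name='avg'),
]
result = pd.concat(pieces, axis=1).reset_index()

# Fold multiple "having" predicates into one mask, as the rule does.
havings = [result['total'] > 1, result['avg'] < 3.0]
predicate = functools.reduce(operator.and_, havings)
result = result.loc[predicate.values]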
Example 2
def compute_projection_scalar_expr(expr, parent, data, scope=None, **kwargs):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    additional_scope = OrderedDict(
        (
            t,
            map_new_column_names_to_data(
                remap_overlapping_column_names(
                    parent_table_op, t, data_columns
                ),
                data
            )
        ) for t in op.root_tables()
    )

    new_scope = toolz.merge(scope, additional_scope, factory=OrderedDict)
    scalar = execute(expr, new_scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
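
The tail of this function is plain pandas: broadcast a computed scalar to one value per row, aligned to the source index. A standalone sketch:

import pandas as pd

# Repeat a scalar once per row and align it with the source index.
data = pd.DataFrame({'a': [10, 20, 30]})
scalar = 42
result = pd.Series([scalar], name='answer').repeat(len(data.index))
result.index = data.index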
Example 3
    def insert(self, path, key, expr, format='table',
               data_columns=True, **kwargs):

        path = self.root / path
        data = execute(expr)
        data.to_hdf(str(path), key, format=format,
                    data_columns=data_columns, **kwargs)
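
A usage sketch of the underlying pandas call (assumes the optional PyTables dependency is installed; the path and key are hypothetical):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
df.to_hdf('example.h5', key='mykey', format='table', data_columns=True)
roundtrip = pd.read_hdf('example.h5', 'mykey')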
Example 4
def compute_projection_scalar_expr(
    expr,
    parent,
    data,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t:
                map_new_column_names_to_data(
                    remap_overlapping_column_names(parent_table_op, t,
                                                   data_columns),
                    data,
                )
            },
            timecontext,
        ) for t in op.root_tables())
    scalar = execute(expr, scope=scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
Example 5
def compute_projection_scalar_expr(expr, parent, data, scope=None, **kwargs):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    additional_scope = OrderedDict(
        (
            t,
            map_new_column_names_to_data(
                remap_overlapping_column_names(
                    parent_table_op, t, data_columns
                ),
                data,
            ),
        )
        for t in op.root_tables()
    )

    new_scope = toolz.merge(scope, additional_scope, factory=OrderedDict)
    scalar = execute(expr, scope=new_scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
Example 6
    def insert(
        self, path, key, expr, format='table', data_columns=True, **kwargs
    ):

        path = self.root / path
        data = execute(expr)
        data.to_hdf(
            str(path), key, format=format, data_columns=data_columns, **kwargs
        )
Example 7
def _compute_join_column(column_expr, **kwargs):
    column_op = column_expr.op()

    if isinstance(column_op, ops.TableColumn):
        new_column = column_op.name
    else:
        new_column = execute(column_expr, **kwargs)
    (root_table, ) = column_op.root_tables()
    return new_column, root_table
Example 8
def execute(self, expr, params=None, **kwargs):  # noqa
    assert isinstance(expr, ir.Expr), "Expected ir.Expr, got {}".format(
        type(expr))
    return execute_last(
        expr.op(),
        execute(expr, params=params, **kwargs),
        params=params,
        **kwargs,
    )
Example 9
File: util.py Project: xmnlab/ibis
def compute_sort_key(key, data, **kwargs):
    by = key.args[0]
    try:
        return by.get_name(), None
    except com.ExpressionError:
        name = ibis.util.guid()
        new_scope = {t: data for t in by.op().root_tables()}
        new_column = execute(by, new_scope, **kwargs)
        new_column.name = name
        return name, new_column
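
When the sort key is a computed expression rather than a named column, it is materialized under a throwaway generated name. A pandas-only analogue of that fallback (uuid stands in for ibis.util.guid):

import uuid

import pandas as pd

df = pd.DataFrame({'a': [3, 1, 2]})
name = uuid.uuid4().hex  # stands in for ibis.util.guid()
key = (df['a'] % 2).rename(name)
sorted_df = df.assign(**{name: key}).sort_values(name).drop(columns=[name])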
Example 10
def _compute_join_column(column_expr, **kwargs):
    column_op = column_expr.op()

    if isinstance(column_op, ops.TableColumn):
        new_column = column_op.name
    else:
        new_column = execute(column_expr, **kwargs)

    root_table, = column_op.root_tables()
    return new_column, root_table
Example 11
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)
        (root_table, ) = op.root_tables()
        left_root, right_root = ops.distinct_roots(parent_table_op.left,
                                                   parent_table_op.right)
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(result_name
                                                               or name)

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t:
                map_new_column_names_to_data(
                    remap_overlapping_column_names(parent_table_op, t,
                                                   data_columns),
                    data,
                )
            },
            timecontext,
        ) for t in op.root_tables())

    result = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    if np.isscalar(result):
        return pd.Series(
            np.repeat(result, len(data.index)),
            index=data.index,
            name=result_name,
        )
    return result.rename(result_name)
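
The join branch resolves an ambiguous column by appending the side-specific suffix before renaming it back. A pandas merge sketch of that lookup ('_x'/'_y' stand in for constants.LEFT_JOIN_SUFFIX and constants.RIGHT_JOIN_SUFFIX):

import pandas as pd

left = pd.DataFrame({'key': [1, 2], 'value': [10, 20]})
right = pd.DataFrame({'key': [1, 2], 'value': [30, 40]})
joined = left.merge(right, on='key', suffixes=('_x', '_y'))

# Pick the left side's column back out under its original name.
value = joined.loc[:, 'value' + '_x'].rename('value')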
Example 12
def compute_sort_key(key, data, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        new_scope = {t: data for t in by.op().root_tables()}
        new_column = execute(by, scope=toolz.merge(scope, new_scope), **kwargs)
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column
Example 13
def compute_sort_key(key, data, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        new_scope = {t: data for t in by.op().root_tables()}
        new_column = execute(by, scope=toolz.merge(scope, new_scope), **kwargs)
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column
Example 14
def _compute_predicates(
    table_op,
    predicates,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : Scope
    timecontext: Optional[TimeContext]
    kwargs : dict

    Returns
    -------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the input
    table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        data_columns = frozenset(data.columns)

        additional_scope = Scope()
        for root_table in root_tables:
            mapping = remap_overlapping_column_names(table_op, root_table,
                                                     data_columns)
            if mapping is not None:
                new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
            else:
                new_data = data
            additional_scope = additional_scope.merge_scope(
                Scope({root_table: new_data}, timecontext))

        scope = scope.merge_scope(additional_scope)
        yield execute(predicate, scope=scope, **kwargs)
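
Callers of this generator typically fold the yielded boolean Series into a single mask, e.g.:

import functools
import operator

import pandas as pd

data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
predicates = [data['a'] > 1, data['b'] < 6]  # as yielded by the generator
mask = functools.reduce(operator.and_, predicates)
filtered = data.loc[mask]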
Example 15
    def execute(self, query, params=None, limit='default', **kwargs):
        if limit != 'default':
            raise ValueError(
                'limit parameter to execute is not yet implemented in the '
                'pandas backend')

        if not isinstance(query, ir.Expr):
            raise TypeError(
                "`query` has type {!r}, expected ibis.expr.types.Expr".format(
                    type(query).__name__))
        result = execute(query, params=params, **kwargs)
        query_op = query.op()
        return execute_last(query_op, result, params=params, **kwargs)
Example 16
def compute_sort_key(key, data, timecontext, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        if scope is None:
            scope = Scope()
        scope = scope.merge_scopes(
            Scope({t: data}, timecontext) for t in by.op().root_tables())
        new_column = execute(by, scope=scope, **kwargs)
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column
Example 17
def _compute_predicates(table_op, predicates, data, scope, **kwargs):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : dict
    kwargs : dict

    Returns
    -------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the input
    table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        additional_scope = {}
        data_columns = frozenset(data.columns)

        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            if mapping is not None:
                new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
            else:
                new_data = data
            additional_scope[root_table] = new_data

        new_scope = toolz.merge(scope, additional_scope)
        yield execute(predicate, scope=new_scope, **kwargs)
Example 18
def compute_projection_column_expr(expr, parent, data, scope=None, **kwargs):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        root_table, = op.root_tables()
        left_root, right_root = ops.distinct_roots(
            parent_table_op.left, parent_table_op.right
        )
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(
            result_name or name
        )

    data_columns = frozenset(data.columns)
    additional_scope = {
        t: map_new_column_names_to_data(
            remap_overlapping_column_names(parent_table_op, t, data_columns),
            data,
        )
        for t in op.root_tables()
    }

    new_scope = toolz.merge(scope, additional_scope)
    result = execute(expr, scope=new_scope, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    return result.rename(result_name)
Example 19
def compute_projection_column_expr(expr, parent, data, scope=None, **kwargs):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        root_table, = op.root_tables()
        left_root, right_root = ops.distinct_roots(parent_table_op.left,
                                                   parent_table_op.right)
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(result_name
                                                               or name)

    data_columns = frozenset(data.columns)
    additional_scope = {
        t: map_new_column_names_to_data(
            remap_overlapping_column_names(parent_table_op, t, data_columns),
            data,
        )
        for t in op.root_tables()
    }

    new_scope = toolz.merge(scope, additional_scope)
    result = execute(expr, scope=new_scope, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    return result.rename(result_name)
Example 20
def execute_node_expr_list(op, sequence, **kwargs):
    # TODO: no true approx count distinct for pandas, so we use exact for now
    columns = [e.get_name() for e in op.exprs]
    schema = ibis.schema(list(zip(columns, (e.type() for e in op.exprs))))
    data = {col: [execute(el, **kwargs)] for col, el in zip(columns, sequence)}
    return schema.apply_to(pd.DataFrame(data, columns=columns))
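
The rule builds a one-row DataFrame from per-expression scalars and then coerces dtypes. A pandas-only sketch, with astype standing in for ibis's schema.apply_to:

import pandas as pd

columns = ['total', 'avg']          # hypothetical expression names
scalars = [6, 2.0]                  # one executed scalar per expression
data = {col: [value] for col, value in zip(columns, scalars)}
frame = pd.DataFrame(data, columns=columns).astype({'total': 'int64'})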
Example 21
def execute_window_op(op, data, window, scope=None, context=None, **kwargs):
    operand = op.expr
    root, = op.root_tables()
    try:
        data = scope[root]
    except KeyError:
        data = execute(root.to_expr(), scope=scope, context=context, **kwargs)

    following = window.following
    order_by = window._order_by

    if order_by and following != 0:
        raise com.OperationNotDefinedError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key,
            context=context,
            **kwargs
        ) for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by

    if grouping_keys:
        source = data.groupby(grouping_keys, sort=False, as_index=not order_by)

        if order_by:
            sorted_df = source.apply(
                lambda df, order_by=order_by, kwargs=kwargs: (
                    util.compute_sorted_frame(order_by, df, **kwargs)
                )
            )
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by
    else:
        if order_by:
            source = util.compute_sorted_frame(order_by, data, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not order_by:
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and order_by:
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            context = agg_ctx.Moving(preceding)
        else:
            # expanding window
            context = agg_ctx.Cumulative()
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        context = agg_ctx.Transform()

    result = execute(operand, new_scope, context=context, **kwargs)
    series = post_process(result, data.index)
    assert len(data) == len(series), \
        'input data source and computed column do not have the same length'
    return series
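
The three non-summarize aggregation contexts chosen above correspond to familiar pandas idioms; roughly:

import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'], 'x': [1.0, 2.0, 3.0, 4.0]})
transformed = df.groupby('g')['x'].transform('mean')  # agg_ctx.Transform
cumulative = df.groupby('g')['x'].expanding().sum()   # agg_ctx.Cumulative
moving = df.groupby('g')['x'].rolling(2).mean()       # agg_ctx.Moving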
Example 22
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(op,
                                                timecontext=timecontext,
                                                clients=clients)
        # timecontext is the original time context required by the parent
        # node of this WindowOp, while adjusted_timecontext is the adjusted
        # context of this Window. Since we are doing a manual execution
        # here, use adjusted_timecontext in later execution phases.
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(pre_executed_scope)
    (root, ) = op.root_tables()
    root_expr = root.to_expr()

    data = execute(
        root_expr,
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )
    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key,
            scope=scope,
            clients=clients,
            timecontext=adjusted_timecontext,
            aggcontext=aggcontext,
            **kwargs,
        ) for key, key_op in zip(group_by,
                                 map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = []

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data,
                order_by,
                group_by=group_by,
                timecontext=adjusted_timecontext,
                **kwargs,
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, timecontext=adjusted_timecontext, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Here the groupby object should be added to the corresponding node in
    # scope for execution; data will be overwritten with a groupby object,
    # so we force an update regardless of time context
    new_scope = scope.merge_scopes(
        [
            Scope({t: source}, adjusted_timecontext)
            for t in operand.op().root_tables()
        ],
        overwrite=True,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )
    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    # trim data to original time context
    series = trim_with_timecontext(series, timecontext)
    return series
Example 23
def execute_window_op(op,
                      data,
                      window,
                      scope=None,
                      aggcontext=None,
                      clients=None,
                      **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op,
                                     *clients,
                                     scope=scope,
                                     aggcontext=aggcontext,
                                     **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)
    (root, ) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, scope=scope, clients=clients, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(group_by,
                               map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data,
                                          order_by,
                                          group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    aggcontext = get_aggcontext(
        window,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
    )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
Example 24
def execute_window_op(
    op, data, window, scope=None, aggcontext=None, clients=None, **kwargs
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, pre_executed_scope)

    root, = op.root_tables()
    root_expr = root.to_expr()
    data = execute(root_expr, scope=scope, aggcontext=aggcontext, **kwargs)

    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(key, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    if isinstance(operand_type, dt.Integer) and operand_type.nullable:
        operand_dtype = np.float64
    else:
        operand_dtype = operand.type().to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and ordering_keys:
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            aggcontext = agg_ctx.Moving(
                preceding,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(operand, scope=new_scope, aggcontext=aggcontext, **kwargs)
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
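
The nullable-integer special case above exists because NumPy-backed pandas has no integer NaN: introducing a missing value (as a shift or a moving window does at its edges) promotes an int64 column to float64. A small sketch:

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3])   # dtype: int64
shifted = s.shift(1)       # first value is NaN, so dtype becomes float64
assert shifted.dtype == np.float64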
Example 25
def execute(self, expr, params=None, **kwargs):  # noqa
    assert isinstance(expr, ir.Expr)
    scope = kwargs.pop('scope', {})
    return execute(expr, scope=scope, params=params, **kwargs)
Example 26
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (execute(p, scope=scope, **kwargs) for p in predicates),
        )
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(by, scope=scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(
            execute(metric, scope=new_scope, **kwargs), name=metric.get_name()
        )
        for metric in op.metrics
    ]

    # group by always needs a reset to get the grouping key back as a column
    result = pd.concat(pieces, axis=1).reset_index()
    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(having, scope=new_scope, **kwargs)
                for having in op.having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]
    return result
Example 27
File: csv.py Project: zdog234/ibis
def insert(self, path, expr, index=False, **kwargs):
    path = self.root / path
    data = execute(expr)
    data.to_csv(str(path), index=index, **kwargs)
Example 28
def execute_window_op(op,
                      data,
                      window,
                      scope=None,
                      aggcontext=None,
                      clients=None,
                      **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op,
                                     *clients,
                                     scope=scope,
                                     aggcontext=aggcontext,
                                     **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)

    root, = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, aggcontext=aggcontext, **kwargs) for key, key_op in zip(
                group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data,
                                          order_by,
                                          group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (isinstance(operand.op(),
                     (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All))
          and ordering_keys):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
Example 29
def execute_node_expr_list(op, sequence, **kwargs):
    # TODO: no true approx count distinct for pandas, so we use exact for now
    columns = [e.get_name() for e in op.exprs]
    schema = ibis.schema(list(zip(columns, (e.type() for e in op.exprs))))
    data = {col: [execute(el, **kwargs)] for col, el in zip(columns, sequence)}
    return schema.apply_to(pd.DataFrame(data, columns=columns))
Example 30
def execute_node_value_list(op, _, **kwargs):
    return [execute(arg, **kwargs) for arg in op.values]
Example 31
def execute_node_value_list(op, _, **kwargs):
    return [execute(arg, **kwargs) for arg in op.values]
Example 32
File: csv.py Project: cloudera/ibis
def insert(self, path, expr, index=False, **kwargs):
    path = self.root / path
    data = execute(expr)
    data.to_csv(str(path), index=index, **kwargs)
Example 33
    def insert(self, path, expr, **kwargs):

        path = self.root / path
        df = execute(expr)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, str(path))
Example 34
def insert(self, path, expr, **kwargs):
    path = self.root / path
    df = execute(expr)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, str(path))
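
A round-trip usage sketch of the pyarrow calls above (the path is hypothetical):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'a': [1, 2, 3]})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'example.parquet')
roundtrip = pq.read_table('example.parquet').to_pandas()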