Code example #1
File: generic.py Project: shshe/ibis
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented')

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(operator.and_,
                                     (execute(p, scope, **kwargs)
                                      for p in predicates))
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by)))
        grouping_keys = [
            by_op.name if isinstance(by_op, ir.TableColumn) else execute(
                by, scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update((by_op.name, by.get_name())
                       for by, by_op in grouping_key_pairs
                       if hasattr(by_op, 'name'))
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(execute(metric, new_scope, **kwargs), name=metric.get_name())
        for metric in op.metrics
    ]

    result = pd.concat(pieces, axis=1).reset_index()
    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key')

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(operator.and_,
                                     (execute(having, new_scope, **kwargs)
                                      for having in op.having))
        assert len(predicate) == len(result), \
            'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values].reset_index(drop=True)
    return result
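The predicate handling above folds several boolean Series into a single mask with functools.reduce and operator.and_. A minimal standalone sketch of that pattern, assuming a toy DataFrame (the names here are illustrative, not from ibis):

import functools
import operator

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [10, 20, 30, 40]})

# each "predicate" evaluates to a boolean Series aligned with df
predicates = [df['a'] > 1, df['b'] < 40]

# reduce(operator.and_, ...) combines the masks elementwise with &,
# mirroring the aggregation code above
mask = functools.reduce(operator.and_, predicates)
print(df.loc[mask])  # rows where a > 1 and b < 40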
Code example #2
File: core.py Project: resurgo-genetics/ibis
def execute_with_scope(expr, scope, context=None, **kwargs):
    """Execute an expression `expr`, with data provided in `scope`.

    Parameters
    ----------
    expr : ir.Expr
        The expression to execute.
    scope : dict
        A dictionary mapping :class:`~ibis.expr.types.Node` subclass instances
        to concrete data such as a pandas DataFrame.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
    """
    op = expr.op()

    # base case: our op has been computed (or is a leaf data node), so
    # return the corresponding value
    if op in scope:
        return scope[op]

    if context is None:
        context = ctx.Summarize()

    try:
        computed_args = [scope[t] for t in op.root_tables()]
    except KeyError:
        pass
    else:
        try:
            # special case: we have a definition of execute_first that matches
            # our current operation and data leaves
            return execute_first(op,
                                 *computed_args,
                                 scope=scope,
                                 context=context,
                                 **kwargs)
        except NotImplementedError:
            pass

    args = op.args

    # recursively compute the op's arguments
    computed_args = [
        execute(arg, scope, context=context, **kwargs)
        if hasattr(arg, 'op') else arg for arg in args
        if isinstance(arg, _VALID_INPUT_TYPES)
    ]

    # Compute our op, with its computed arguments
    return execute_node(op,
                        *computed_args,
                        scope=scope,
                        context=context,
                        **kwargs)
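execute_with_scope bottoms out when the operation is already a key in scope and otherwise recurses through the op's arguments. A toy sketch of that scope-dictionary pattern, with a hypothetical Node class standing in for ibis expression nodes:

class Node:
    """Hypothetical stand-in for an expression node."""
    def __init__(self, name, args=()):
        self.name = name
        self.args = args

def evaluate(node, scope):
    # base case: the node's value is already bound in scope
    if node in scope:
        return scope[node]
    # recursive case: evaluate arguments, then combine them
    computed_args = [evaluate(arg, scope) for arg in node.args]
    return sum(computed_args)  # toy stand-in for execute_node

leaf_a, leaf_b = Node('a'), Node('b')
root = Node('add', args=(leaf_a, leaf_b))
print(evaluate(root, {leaf_a: 1, leaf_b: 2}))  # 3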
Code example #3
def compute_sort_key(key, data, **kwargs):
    by = key.args[0]
    try:
        return by.get_name(), None
    except com.ExpressionError:
        name = ibis.util.guid()
        new_scope = {t: data for t in by.op().root_tables()}
        new_column = execute(by, new_scope, **kwargs)
        new_column.name = name
        return name, new_column
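When a sort key is an unnamed computed expression, compute_sort_key materializes it under a fresh GUID so it can be attached to the frame without colliding with existing columns. The same idea in plain pandas, using uuid as a stand-in for ibis.util.guid:

import uuid

import pandas as pd

df = pd.DataFrame({'a': [3, 1, 2]})

key = df['a'] + 1          # an unnamed computed sort key
name = uuid.uuid4().hex    # fresh name, guaranteed not to collide
key.name = name

# attach the column, sort on it, then drop the temporary name
result = df.assign(**{name: key}).sort_values(name).drop(columns=[name])
print(result)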
Code example #4
File: join.py Project: shubhampachori12110095/ibis
def _compute_join_column(column_expr, **kwargs):
    column_op = column_expr.op()

    if isinstance(column_op, ops.TableColumn):
        new_column = column_op.name
    else:
        new_column = execute(column_expr, **kwargs)

    root_table, = column_op.root_tables()
    return new_column, root_table
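The trailing-comma unpack `root_table, = column_op.root_tables()` both extracts the single root table and asserts that there is exactly one. A quick demonstration of the idiom:

def only(items):
    # `x, = items` raises ValueError unless items has exactly one element
    x, = items
    return x

print(only(['t0']))    # 't0'
# only(['t0', 't1'])   # ValueError: too many values to unpack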
Code example #5
File: execution.py Project: resurgo-genetics/ibis
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics

    if op.having:
        raise NotImplementedError('having expressions not yet implemented')

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented')

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(operator.and_,
                                     (execute(p, scope, **kwargs)
                                      for p in predicates))
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by)))
        grouping_keys = [
            by_op.name if isinstance(by_op, ir.TableColumn) else execute(
                by, scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update((by_op.name, by.get_name())
                       for by, by_op in grouping_key_pairs
                       if hasattr(by_op, 'name'))
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(execute(metric, new_scope, **kwargs), name=metric.get_name())
        for metric in op.metrics
    ]

    df = pd.concat(pieces, axis=1).reset_index()
    df.columns = [columns.get(c, c) for c in df.columns]
    return df
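The grouping_keys list built above mixes plain column names with computed, renamed Series; pandas groupby accepts both kinds in one call. A small sketch with made-up column names:

import pandas as pd

df = pd.DataFrame({'key': ['x', 'x', 'y'], 'val': [1, 2, 3]})

# a computed grouping key alongside a plain column label
parity = (df['val'] % 2).rename('parity')
print(df.groupby(['key', parity])['val'].sum())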
Code example #6
def execute_without_scope(
        expr, params=None, scope=None, context=None, **kwargs):
    """Execute an expression against data that are bound to it. If no data
    are bound, raise an Exception.

    Parameters
    ----------
    expr : ir.Expr
        The expression to execute
    params : Dict[Expr, object]

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame

    Raises
    ------
    ValueError
        * If no data are bound to the input expression
    """

    data_scope = find_data(expr)
    if not data_scope:
        raise ValueError(
            'No data sources found while trying to execute against the pandas '
            'backend'
        )

    factory = type(data_scope)

    if scope is None:
        scope = factory()

    if params is None:
        params = factory()

    params = {k.op() if hasattr(k, 'op') else k: v for k, v in params.items()}

    new_scope = toolz.merge(scope, data_scope, params, factory=factory)

    # data_preload
    new_scope.update(
        (node, data_preload(node, data, scope=new_scope))
        for node, data in new_scope.items()
    )

    # By default, our aggregate functions are N -> 1
    return execute(
        expr,
        new_scope,
        context=context if context is not None else agg_ctx.Summarize(),
        **kwargs
    )
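The params handling above normalizes heterogeneous dictionary keys: expression keys are unwrapped to their underlying op while anything else passes through. A minimal sketch with a hypothetical expression wrapper:

class FakeExpr:
    """Hypothetical wrapper exposing an .op() method, like an ibis expression."""
    def __init__(self, op):
        self._op = op

    def op(self):
        return self._op

params = {FakeExpr('param_node'): 10, 'already_a_node': 20}
normalized = {k.op() if hasattr(k, 'op') else k: v for k, v in params.items()}
print(normalized)  # {'param_node': 10, 'already_a_node': 20}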
Code example #7
def compute_projection_scalar_expr(expr, parent, data, scope=None, **kwargs):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    additional_scope = OrderedDict(
        (t,
         map_new_column_names_to_data(
             remap_overlapping_column_names(parent_table_op, t, data_columns),
             data)) for t in op.root_tables())

    new_scope = toolz.merge(scope, additional_scope, factory=OrderedDict)
    result = execute(expr, new_scope, **kwargs)
    return pd.Series([result], name=name, index=data.index)
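A scalar projection has to come back as a column aligned with the parent frame's index. A standalone pandas illustration of that alignment requirement (a sketch, not the ibis code itself):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# broadcasting a scalar across the frame's index yields an aligned column
s = pd.Series(42, index=df.index, name='answer')
print(df.assign(answer=s))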
Code example #8
File: selection.py Project: bochuxt/ibis
def _compute_predicates(table_op, predicates, data, scope, **kwargs):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : dict
    kwargs : dict

    Yields
    ------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the input
    table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        additional_scope = {}
        data_columns = frozenset(data.columns)

        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            if mapping is not None:
                new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
            else:
                new_data = data
            additional_scope[root_table] = new_data

        new_scope = toolz.merge(scope, additional_scope)
        yield execute(predicate, new_scope, **kwargs)
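The suffix handling restricts data to the remapped columns and renames them back to the names the predicate expects. The same two-step pattern in plain pandas (the suffixed names below are made up):

import pandas as pd

data = pd.DataFrame({'id_x': [1, 2], 'id_y': [1, 3], 'v': [10, 20]})

# map post-join (suffixed) names back to the names the predicate uses
mapping = {'id_x': 'id', 'v': 'v'}
new_data = data.loc[:, list(mapping.keys())].rename(columns=mapping)
print(new_data.columns.tolist())  # ['id', 'v']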
Code example #9
File: selection.py Project: bochuxt/ibis
def compute_projection_column_expr(expr, parent, data, scope=None, **kwargs):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ir.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        root_table, = op.root_tables()
        left_root, right_root = ir.distinct_roots(
            parent_table_op.left, parent_table_op.right
        )
        suffixes = {left_root: constants.LEFT_JOIN_SUFFIX,
                    right_root: constants.RIGHT_JOIN_SUFFIX}
        return data.loc[:, name + suffixes[root_table]].rename(
            result_name or name
        )

    data_columns = frozenset(data.columns)
    additional_scope = {
        t: map_new_column_names_to_data(
            remap_overlapping_column_names(parent_table_op, t, data_columns),
            data
        ) for t in op.root_tables()
    }

    new_scope = toolz.merge(scope, additional_scope)
    result = execute(expr, new_scope, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    return result.rename(result_name)
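The join branch above resolves a column that appears on both sides of a join by appending that side's suffix before the lookup. A compact demonstration with pandas merge suffixes:

import pandas as pd

left = pd.DataFrame({'key': [1, 2], 'value': ['a', 'b']})
right = pd.DataFrame({'key': [1, 2], 'value': ['c', 'd']})

joined = pd.merge(left, right, on='key', suffixes=('_x', '_y'))

# selecting the right-hand 'value' means looking it up under its suffix
print(joined.loc[:, 'value' + '_y'].rename('value'))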
Code example #10
File: generic.py Project: shshe/ibis
def execute_node_value_list(op, **kwargs):
    return [execute(arg, **kwargs) for arg in op.values]
Code example #11
File: window.py Project: sestus/ibis
def execute_frame_window_op(op, data, scope=None, context=None, **kwargs):
    operand, window = op.args

    following = window.following
    order_by = window._order_by

    if order_by and following != 0:
        raise ValueError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ir.TableColumn) else execute(
            key, context=context, **kwargs) for key, key_op in zip(
                group_by, map(operator.methodcaller('op'), group_by))
    ]

    if grouping_keys:
        source = data.groupby(grouping_keys, sort=False, as_index=not order_by)

        if order_by:
            sorted_df = source.apply(
                lambda df, order_by=order_by, kwargs=kwargs:
                (util.compute_sorted_frame(order_by, df, **kwargs)))
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by
    else:
        if order_by:
            source = util.compute_sorted_frame(order_by, data, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not order_by:
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and order_by:
        preceding = window.preceding
        if preceding is not None:
            context = agg_ctx.Trailing(preceding)
        else:
            context = agg_ctx.Cumulative()
    else:
        context = agg_ctx.Transform()

    result = execute(operand, new_scope, context=context, **kwargs)
    series = post_process(result, data.index)
    assert len(data) == len(series), \
        'input data source and computed column do not have the same length'
    return series
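The order_by-within-groups branch sorts each partition independently with groupby(...).apply and then regroups the sorted frame. The core maneuver in plain pandas (column names assumed):

import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'], 'v': [2, 1, 4, 3]})

# sort rows within each group, leaving group membership intact
sorted_df = df.groupby('g', sort=False, group_keys=False).apply(
    lambda frame: frame.sort_values('v'))

# regroup so window computations see ordered partitions
print(sorted_df.groupby('g', sort=False)['v'].cumsum())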
Code example #12
File: execution.py Project: resurgo-genetics/ibis
def execute_selection_dataframe(op, data, scope=None, **kwargs):
    selections = op.selections
    predicates = op.predicates
    sort_keys = op.sort_keys

    result = data

    if selections:
        data_pieces = []
        for selection in selections:
            table_op = op.table.op()
            selection_operation = selection.op()

            if op.table is selection:
                pandas_object = data
            elif isinstance(selection, ir.ScalarExpr):
                root_tables = selection_operation.root_tables()
                additional_scope = collections.OrderedDict(
                    zip(root_tables, (data for _ in range(len(root_tables)))))
                new_scope = toolz.merge(
                    scope,
                    additional_scope,
                    factory=collections.OrderedDict,
                )
                pandas_object = execute(selection, new_scope, **kwargs)
            elif isinstance(selection, ir.ColumnExpr):
                if isinstance(selection_operation, ir.TableColumn):
                    # slightly faster path for simple column selection
                    pandas_object = data[selection_operation.name]
                elif isinstance(table_op, ops.Join):
                    pandas_object = execute(
                        selection,
                        toolz.merge(scope,
                                    {selection_operation.table.op(): data}),
                        **kwargs)
                else:
                    pandas_object = execute(
                        selection, toolz.merge(scope, {op.table.op(): data}),
                        **kwargs)
            elif isinstance(selection, ir.TableExpr):
                # These two statements should never raise unless our
                # assumptions are wrong because:
                # 1. If we're selecting ourself, then we've already caught that
                #    case above
                # 2. We've checked that `selection` originates from its parent
                #    executing
                assert isinstance(table_op, ops.Join)
                assert selection.equals(table_op.left) or selection.equals(
                    table_op.right)
                pandas_object = data[selection.columns]
            else:
                raise TypeError(
                    "Don't know how to compute selection of type {}".format(
                        type(selection_operation).__name__))

            if isinstance(pandas_object, pd.Series):
                pandas_object = pandas_object.rename(
                    getattr(selection, '_name', pandas_object.name))
            data_pieces.append(pandas_object)
        result = pd.concat(data_pieces, axis=1)

    if predicates:
        where = functools.reduce(operator.and_, (execute(p, scope, **kwargs)
                                                 for p in predicates))
        result = result.loc[where]

    if sort_keys:
        result = _compute_sorted_frame(sort_keys, result, **kwargs)
    return result.reset_index(drop=True)
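The selections branch collects each computed piece as a Series and stitches them together with pd.concat(axis=1), relying on index alignment. A minimal illustration of that assembly step:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

pieces = [
    df['a'],                                    # plain column selection
    (df['a'] * 2).rename('a_doubled'),          # computed, renamed column
    pd.Series(9, index=df.index, name='nine'),  # broadcast scalar
]

# axis=1 concatenation aligns on the shared index to build the projection
print(pd.concat(pieces, axis=1))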