Example no. 1
0
def build_df_from_projection(
    selection_exprs: List[ir.Expr],
    op: ops.Selection,
    data: dd.DataFrame,
    **kwargs,
) -> dd.DataFrame:
    """Assemble a dataframe from individual projection pieces.

    Each expression is dispatched to `compute_projection`; the results are
    stitched back together into a single dataframe.
    """
    # Fast path: the first expression is the table itself and nothing that
    # follows reorders rows, so we can assign columns in place.
    if selection_exprs[0] is op.table and all(
        is_row_order_preserving(selection_exprs[1:])
    ):
        for expr in selection_exprs[1:]:
            piece = compute_projection(expr, op, data, **kwargs)
            if isinstance(piece, dd.Series):
                data = data.assign(**{piece.name: piece})
            else:
                data = data.assign(
                    **{col: piece[col] for col in piece.columns}
                )
        return data

    # Slow path: direct assignment is unsafe. Attach a unique sorted row
    # identifier as the index so dd.concat can align the pieces again.
    indexed = add_partitioned_sorted_column(data)
    pieces = [
        compute_projection(expr, op, indexed, **kwargs)
        for expr in selection_exprs
    ]
    return dd.concat(pieces, axis=1).reset_index(drop=True)
Example no. 2
0
def build_df_from_projection(selections: List[ir.Expr], op: ops.Selection,
                             data: dd.DataFrame, **kwargs) -> dd.DataFrame:
    """Construct a dataframe by evaluating every selection expression.

    Each selection is handed to `compute_projection` and the resulting
    pieces are concatenated column-wise.
    """
    # Tag rows with a unique, partition-sorted index so that dd.concat
    # can line the computed pieces back up afterwards.
    indexed_data = add_partitioned_sorted_column(data)
    pieces = [
        compute_projection(selection, op, indexed_data, **kwargs)
        for selection in selections
    ]
    return dd.concat(pieces, axis=1).reset_index(drop=True)
Example no. 3
0
def execute_selection_dataframe(
    op, data, scope: Scope, timecontext: Optional[TimeContext], **kwargs
):
    """Execute a Selection operation against a dask DataFrame.

    Applies, in order: column projections, filter predicates, and a single
    ascending sort key. Multi-key and descending sorts are not supported by
    the Dask backend and raise ``NotImplementedError``.

    Parameters
    ----------
    op
        The Selection operation node being executed.
    data
        The dask DataFrame backing ``op.table``.
    scope : Scope
        Mapping of already-computed expressions to data.
    timecontext : Optional[TimeContext]
        Optional time bounds for the computation.

    Returns
    -------
    dd.DataFrame
        The projected, filtered, and (optionally) sorted result.
    """
    selections = op.selections
    predicates = op.predicates
    sort_keys = op.sort_keys
    result = data

    # Build up the individual dask structures from column expressions
    if selections:
        data_pieces = []
        for selection in selections:
            dask_object = compute_projection(
                selection,
                op,
                data,
                scope=scope,
                timecontext=timecontext,
                **kwargs,
            )
            data_pieces.append(dask_object)

        result = dd.concat(data_pieces, axis=1)

    if predicates:
        predicates = _compute_predicates(
            op.table.op(), predicates, data, scope, timecontext, **kwargs
        )
        # Combine all predicates into a single boolean mask.
        predicate = functools.reduce(operator.and_, predicates)
        # NOTE: len() on dask objects forces computation; retained from the
        # original as a sanity check on predicate alignment.
        assert len(predicate) == len(
            result
        ), 'Selection predicate length does not match underlying table'
        result = result.loc[predicate]

    if sort_keys:
        if len(sort_keys) > 1:
            raise NotImplementedError(
                """
                Multi-key sorting is not implemented for the Dask backend
                """
            )
        sort_key = sort_keys[0]
        ascending = getattr(sort_key.op(), 'ascending', True)
        if not ascending:
            raise NotImplementedError(
                "Descending sort is not supported for the Dask backend"
            )
        result = compute_sorted_frame(
            result,
            order_by=sort_key,
            scope=scope,
            timecontext=timecontext,
            **kwargs,
        )

    # The original version carried unreachable pandas-style cleanup of
    # temporary grouping/ordering columns after this point: every control
    # path had already returned before reaching it, so it has been removed.
    return result