Example #1
    def _extract_groupby(
        self,
        df: dd.DataFrame,
        window: org.apache.calcite.rel.core.Window.Group,
        dc: DataContainer,
        context: "dask_sql.Context",
    ) -> Tuple[dd.DataFrame, List[str]]:
        """Prepare grouping columns we can later use while applying the main function"""
        partition_keys = list(window.keys)
        if partition_keys:
            group_columns = [
                df[dc.column_container.get_backend_by_frontend_index(o)]
                for o in partition_keys
            ]
            group_columns = get_groupby_with_nulls_cols(df, group_columns)
            group_columns = {
                new_temporary_column(df): group_col
                for group_col in group_columns
            }
        else:
            group_columns = {new_temporary_column(df): 1}

        df = df.assign(**group_columns)
        group_columns = list(group_columns.keys())

        return df, group_columns
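All of the examples on this page revolve around dask-sql's new_temporary_column helper. Below is a minimal sketch of the pattern, using a hypothetical stand-in for that helper (which, as far as these examples show, only has to return a column name not already present in the dataframe): the grouping columns are materialized under fresh temporary names so that the later groupby can refer to plain column labels.

import uuid

import dask.dataframe as dd
import pandas as pd


def _tmp_column(df):
    # stand-in for dask_sql's new_temporary_column: a random, non-clashing name
    while True:
        name = f"_tmp_{uuid.uuid4().hex[:8]}"
        if name not in df.columns:
            return name


df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 1, 2]}), npartitions=1)
group_columns = {_tmp_column(df): df["a"]}
df = df.assign(**group_columns)
group_columns = list(group_columns.keys())
print(df.groupby(group_columns).size().compute())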
Example #2
def get_groupby_with_nulls_cols(df: dd.DataFrame,
                                group_columns: List[dd.Series],
                                additional_column_name: str = None):
    """
    SQL and dask treat null columns a bit differently:
    SQL will put them at the front, while dask will just ignore them.
    Therefore we use the same trick as fugue does:
    we group by both the null indicator and the (NaN-filled) real column value.
    """
    if additional_column_name is None:
        additional_column_name = new_temporary_column(df)

    group_columns_and_nulls = []
    for group_column in group_columns:
        # the ~ makes NaN rows come first (False sorts before True)
        is_null_column = ~(group_column.isnull())
        non_nan_group_column = group_column.fillna(0)

        group_columns_and_nulls += [is_null_column, non_nan_group_column]

    if not group_columns_and_nulls:
        # This can happen in statements like
        # SELECT SUM(x) FROM data
        # without any groupby statement
        group_columns_and_nulls = [additional_column_name]

    return group_columns_and_nulls
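A toy illustration (made-up data) of the trick described in the docstring: grouping by the negated null mask together with the NaN-filled values keeps the NULL group instead of dropping it, and places it first because False sorts before True.

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"key": [1.0, None, 1.0, None], "x": [1, 2, 3, 4]})
df = dd.from_pandas(pdf, npartitions=1)

key = df["key"]
# a plain groupby on "key" would drop the NULL group; this keeps it
result = df.groupby(by=[~key.isnull(), key.fillna(0)])["x"].sum()
print(result.compute())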
Example #3
    def _perform_aggregation(
        self,
        df: dd.DataFrame,
        filter_column: str,
        aggregations: List[Tuple[str, str, Any]],
        additional_column_name: str,
        group_columns: List[str],
        groupby_agg_options: Dict[str, Any] = {},
    ):
        tmp_df = df

        # format aggregations for Dask; also check if we can use the fast path for
        # the groupby, which is only supported if we are not using any custom aggregations
        # and our pandas version supports dropna for groupbys
        aggregations_dict = defaultdict(dict)
        fast_groupby = True
        for aggregation in aggregations:
            input_col, output_col, aggregation_f = aggregation
            aggregations_dict[input_col][output_col] = aggregation_f
            if not isinstance(aggregation_f, str):
                fast_groupby = False

        # filter dataframe if specified
        if filter_column:
            filter_expression = tmp_df[filter_column]
            tmp_df = tmp_df[filter_expression]
            logger.debug(f"Filtered by {filter_column} before aggregation.")

        # we might need a temporary column name if no groupby columns are specified
        if additional_column_name is None:
            additional_column_name = new_temporary_column(df)

        # perform the groupby operation; if we are using custom aggregations, we must
        # handle null values manually (this is slow)
        if fast_groupby:
            grouped_df = tmp_df.groupby(by=(group_columns
                                            or [additional_column_name]),
                                        dropna=False)
        else:
            group_columns = [
                tmp_df[group_column] for group_column in group_columns
            ]
            group_columns_and_nulls = get_groupby_with_nulls_cols(
                tmp_df, group_columns, additional_column_name)
            grouped_df = tmp_df.groupby(by=group_columns_and_nulls)

        # apply the aggregation(s)
        logger.debug(f"Performing aggregation {dict(aggregations_dict)}")
        agg_result = grouped_df.agg(aggregations_dict, **groupby_agg_options)

        # fix the column names to a single level
        agg_result.columns = agg_result.columns.get_level_values(-1)

        return agg_result
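A hedged sketch (toy data, made-up output names) of the spec that _perform_aggregation builds: a nested mapping input column -> {output column: aggregation}, which the dask versions targeted here accept in a groupby aggregate, followed by the same MultiIndex flattening as above.

import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"g": [1, 1, 2], "x": [10, 20, 30]}), npartitions=1)

aggregations_dict = {"x": {"sum_x": "sum", "count_x": "count"}}
agg_result = df.groupby("g").agg(aggregations_dict)

# the agg above yields MultiIndex columns like ("x", "sum_x"); keep only the last level
agg_result.columns = agg_result.columns.get_level_values(-1)
print(agg_result.compute())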
Example #4
    def _extract_operations(
        self,
        window: org.apache.calcite.rel.core.Window.Group,
        df: dd.DataFrame,
        dc: DataContainer,
        context: "dask_sql.Context",
    ) -> Tuple[List[Tuple[Callable, str, List[str]]], dd.DataFrame]:
        # Extract the operations we will later apply on each group separately
        operations = []
        for agg_call in window.aggCalls:
            operator = agg_call.getOperator()
            operator_name = str(operator.getName())
            operator_name = operator_name.lower()

            try:
                operation = self.OPERATION_MAPPING[operator_name]
            except KeyError:  # pragma: no cover
                try:
                    operation = context.schema[
                        context.schema_name].functions[operator_name]
                except KeyError:  # pragma: no cover
                    raise NotImplementedError(
                        f"{operator_name} not (yet) implemented")

            logger.debug(
                f"Executing {operator_name} on {str(LoggableDataFrame(df))}")

            # TODO: can be optimized by re-using already present columns
            temporary_operand_columns = {
                new_temporary_column(df): RexConverter.convert(o,
                                                               dc,
                                                               context=context)
                for o in agg_call.getOperands()
            }
            df = df.assign(**temporary_operand_columns)
            temporary_operand_columns = list(temporary_operand_columns.keys())

            operations.append((operation, new_temporary_column(df),
                               temporary_operand_columns))

        return operations, df
Example #5
    def _preserve_index_and_sort(
        self, df: dd.DataFrame
    ) -> Tuple[dd.DataFrame, str, str, str]:
        """Store the partition number, index and sort order separately to make any shuffling reversible"""
        partition_col, index_col, sort_col = (
            new_temporary_column(df),
            new_temporary_column(df),
            new_temporary_column(df),
        )

        def store_index_columns(partition, partition_index):
            return partition.assign(
                **{
                    partition_col: partition_index,
                    index_col: partition.index,
                    sort_col: range(len(partition)),
                }
            )

        df = map_on_partition_index(df, store_index_columns)

        return df, partition_col, index_col, sort_col
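map_on_partition_index is a dask-sql helper that is assumed here to hand each partition its partition number; a rough stand-alone equivalent with plain dask uses map_partitions, which passes a partition_info dict to functions that accept that keyword (the _partition, _original_index and _sort_order names below are made up for the sketch).

import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"x": range(6)}), npartitions=2)


def store_index_columns(partition, partition_info=None):
    # partition_info["number"] is the partition index at compute time
    partition_index = partition_info["number"] if partition_info else 0
    return partition.assign(
        _partition=partition_index,
        _original_index=partition.index,
        _sort_order=range(len(partition)),
    )


df = df.map_partitions(store_index_columns)
print(df.compute())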
Example #6
    def convert(
        self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
    ) -> DataContainer:
        # Get the input of the previous step
        (dc,) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # Collect all (new) columns
        named_projects = rel.getNamedProjects()

        column_names = []
        new_columns = {}
        new_mappings = {}
        for expr, key in named_projects:
            key = str(key)
            column_names.append(key)

            # shortcut: if we already have the column, there is no need to re-assign it;
            # this is only the case if the expr is a RexInputRef
            if isinstance(expr, org.apache.calcite.rex.RexInputRef):
                index = expr.getIndex()
                backend_column_name = cc.get_backend_by_frontend_index(index)
                logger.debug(
                    f"Not re-adding the same column {key} (but just referencing it)"
                )
                new_mappings[key] = backend_column_name
            else:
                random_name = new_temporary_column(df)
                new_columns[random_name] = RexConverter.convert(
                    expr, dc, context=context
                )
                logger.debug(f"Adding a new column {key} out of {expr}")
                new_mappings[key] = random_name

        # Actually add the new columns
        if new_columns:
            df = df.assign(**new_columns)

        # and the new mappings
        for key, backend_column_name in new_mappings.items():
            cc = cc.add(key, backend_column_name)

        # Make sure the order is correct
        cc = cc.limit_to(column_names)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #7
    def _extract_groupby(
        self,
        df: dd.DataFrame,
        window: org.apache.calcite.rex.RexWindow,
        dc: DataContainer,
        context: "dask_sql.Context",
    ) -> Tuple[dd.DataFrame, List[str]]:
        """Prepare grouping columns we can later use while applying the main function"""
        partition_keys = list(window.partitionKeys)
        if partition_keys:
            group_columns = [
                RexConverter.convert(o, dc, context=context) for o in partition_keys
            ]
            group_columns = get_groupby_with_nulls_cols(df, group_columns)
            group_columns = {
                new_temporary_column(df): group_col for group_col in group_columns
            }
        else:
            group_columns = {new_temporary_column(df): 1}

        df = df.assign(**group_columns)
        group_columns = list(group_columns.keys())

        return df, group_columns
Example #8
    def _do_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        dc: DataContainer,
        group_columns: List[str],
        context: "dask_sql.Context",
    ) -> Tuple[dd.DataFrame, List[str]]:
        """
        Main functionality: return the result dataframe
        and the output column order
        """
        df = dc.df
        cc = dc.column_container

        # We might need it later.
        # If not, let's hope that adding a single column is not
        # a huge problem...
        additional_column_name = new_temporary_column(df)
        df = df.assign(**{additional_column_name: 1})

        # Add an entry for every grouped column, as SQL wants them first
        output_column_order = group_columns.copy()

        # Collect all aggregations we need to do
        collected_aggregations, output_column_order = self._collect_aggregations(
            rel, df, cc, context, additional_column_name, output_column_order
        )

        if not collected_aggregations:
            return df[group_columns].drop_duplicates(), output_column_order

        # SQL needs to have a column with the grouped values as the first
        # output column.
        # As the values of the group columns
        # are the same within a single group anyway, we just use the first row
        for col in group_columns:
            collected_aggregations[None].append((col, col, "first"))

        # Now we can go ahead and use these grouped aggregations
        # to perform the actual aggregation
        # It is very important to start with the non-filtered entry.
        # Otherwise we might lose some entries in the grouped columns
        df_result = None
        key = None
        if key in collected_aggregations:
            aggregations = collected_aggregations.pop(key)
            df_result = self._perform_aggregation(
                df, None, aggregations, additional_column_name, group_columns,
            )

        # Now we can also do the rest
        for filter_column, aggregations in collected_aggregations.items():
            agg_result = self._perform_aggregation(
                df, filter_column, aggregations, additional_column_name, group_columns,
            )

            # ... and finally concat the new data with the already present columns
            if df_result is None:
                df_result = agg_result
            else:
                df_result = df_result.assign(
                    **{col: agg_result[col] for col in agg_result.columns}
                )

        return df_result, output_column_order
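A tiny pandas illustration (made-up data) of why the non-filtered entry has to come first: aggregating only the filtered rows silently drops every group whose rows were all filtered away, while assigning the filtered result onto the unfiltered one keeps those groups (with NaN for the filtered aggregate).

import pandas as pd

df = pd.DataFrame({"g": [1, 1, 2], "x": [1, 2, 3], "keep": [True, True, False]})

unfiltered = df.groupby("g").agg(sum_x=("x", "sum"))                # groups 1 and 2
filtered = df[df["keep"]].groupby("g").agg(sum_keep=("x", "sum"))   # group 1 only
result = unfiltered.assign(sum_keep=filtered["sum_keep"])           # group 2 survives
print(result)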
Example #9
    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        df: dd.DataFrame,
        cc: ColumnContainer,
        context: "dask_sql.Context",
        additional_column_name: str,
        output_column_order: List[str],
    ) -> Tuple[Dict[str, List[Tuple[str, str, Any]]], List[str],
               dd.DataFrame]:
        """
        Collect together all aggregations that share the same filter column,
        so that each filtered aggregation only needs to be done once.

        Returns the aggregations as a mapping filter_column -> list of aggregations,
        where each aggregation has the form (input_col, output_col, aggregation function (or string))
        """
        collected_aggregations = defaultdict(list)

        for agg_call in rel.getNamedAggCalls():
            expr = agg_call.getKey()
            # Find out which aggregation function to use
            schema_name, aggregation_name = context.fqn(
                expr.getAggregation().getNameAsId())
            aggregation_name = aggregation_name.lower()
            # Find out about the input column
            inputs = expr.getArgList()
            if aggregation_name == "regr_count":
                is_null = IsNullOperation()
                two_columns_proxy = new_temporary_column(df)
                if len(inputs) == 1:
                    # Calcite sometimes gives only one input column to regr_count;
                    # the other column is then the filter column
                    col1 = cc.get_backend_by_frontend_index(inputs[0])
                    df = df.assign(**{two_columns_proxy: (~is_null(df[col1]))})

                else:
                    col1 = cc.get_backend_by_frontend_index(inputs[0])
                    col2 = cc.get_backend_by_frontend_index(inputs[1])
                    # both columns must be non-null
                    df = df.assign(
                        **{
                            two_columns_proxy: (~is_null(df[col1])
                                                & (~is_null(df[col2])))
                        })
                input_col = two_columns_proxy
            elif len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                input_col = additional_column_name
            else:
                raise NotImplementedError(
                    "Can not cope with more than one input")

            # Extract flags (filtering/distinct)
            if expr.isDistinct():  # pragma: no cover
                raise ValueError("Apache Calcite should optimize them away!")

            filter_column = None
            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(
                    expr.filterArg)

            try:
                aggregation_function = self.AGGREGATION_MAPPING[
                    aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.schema[
                        schema_name].functions[aggregation_name]
                except KeyError:  # pragma: no cover
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    )
            if isinstance(aggregation_function, AggregationSpecification):
                aggregation_function = aggregation_function.get_supported_aggregation(
                    df[input_col])

            # Finally, extract the output column name
            output_col = str(agg_call.getValue())

            # Store the aggregation
            key = filter_column
            value = (input_col, output_col, aggregation_function)
            collected_aggregations[key].append(value)
            output_column_order.append(output_col)

        return collected_aggregations, output_column_order, df
Example #10
    def _apply_function_over(
        self,
        df: dd.DataFrame,
        f: Callable,
        operands: List[dd.Series],
        window: org.apache.calcite.rex.RexWindow,
        group_columns: List[str],
        sort_columns: List[str],
        sort_ascending: List[bool],
        sort_null_first: List[bool],
    ) -> Tuple[dd.DataFrame, str]:
        """Apply the given function over the dataframe, possibly grouped and sorted per group"""
        temporary_operand_columns = {
            new_temporary_column(df): operand for operand in operands
        }
        df = df.assign(**temporary_operand_columns)
        # Important: move as few bytes as possible to the pickled function,
        # which is evaluated on the workers
        temporary_operand_columns = temporary_operand_columns.keys()

        # Extract the window definition
        lower_bound = to_bound_description(window.getLowerBound())
        upper_bound = to_bound_description(window.getUpperBound())

        new_column_name = new_temporary_column(df)

        @make_pickable_without_dask_sql
        def map_on_each_group(partitioned_group):
            # Apply sorting
            if sort_columns:
                partitioned_group = sort_partition_func(
                    partitioned_group, sort_columns, sort_ascending, sort_null_first
                )

            if f is None:
                # This is the row_number operator.
                # We do not need to do any windowing
                column_result = range(1, len(partitioned_group) + 1)
            else:
                # In all other cases, apply the windowing operation
                if lower_bound.is_unbounded and (
                    upper_bound.is_current_row or upper_bound.offset == 0
                ):
                    windowed_group = partitioned_group.expanding(min_periods=0)
                elif lower_bound.is_preceding and (
                    upper_bound.is_current_row or upper_bound.offset == 0
                ):
                    windowed_group = partitioned_group.rolling(
                        window=lower_bound.offset + 1, min_periods=0,
                    )
                else:
                    lower_offset = (
                        lower_bound.offset if not lower_bound.is_current_row else 0
                    )
                    if lower_bound.is_preceding and lower_offset is not None:
                        lower_offset *= -1
                    upper_offset = (
                        upper_bound.offset if not upper_bound.is_current_row else 0
                    )
                    if upper_bound.is_preceding and upper_offset is not None:
                        upper_offset *= -1

                    indexer = Indexer(lower_offset, upper_offset)
                    windowed_group = partitioned_group.rolling(
                        window=indexer, min_periods=0
                    )

                column_result = f(windowed_group, *temporary_operand_columns)

            partitioned_group = partitioned_group.assign(
                **{new_column_name: column_result}
            )

            return partitioned_group

        # Currently, pandas will always return a float for windowing operations
        meta = df._meta_nonempty.assign(**{new_column_name: 0.0})

        df = df.groupby(group_columns).apply(map_on_each_group, meta=meta)

        return df, new_column_name
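The two fast paths inside map_on_each_group map SQL window frames onto pandas windows: UNBOUNDED PRECEDING .. CURRENT ROW becomes expanding(), and n PRECEDING .. CURRENT ROW becomes rolling(window=n + 1). A small pandas illustration of both:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])

# UNBOUNDED PRECEDING .. CURRENT ROW -> running aggregate
print(s.expanding(min_periods=0).sum().tolist())          # [1.0, 3.0, 6.0, 10.0]

# 1 PRECEDING .. CURRENT ROW -> window of size 2
print(s.rolling(window=2, min_periods=0).sum().tolist())  # [1.0, 3.0, 5.0, 7.0]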