Example #1
    def sql(self,
            sql: str,
            return_futures: bool = True) -> Union[dd.DataFrame, pd.DataFrame]:
        """
        Query the registered tables with the given SQL.
        The SQL follows approximately the PostgreSQL standard; however, not all
        operations are implemented yet.
        In general, only SELECT statements (no data manipulation) work.

        For more information, see :ref:`sql`.

        Example:
            In this example, a query is called
            using the registered tables and then
            executed using dask.

            .. code-block:: python

                result = c.sql("SELECT a, b FROM my_table")
                print(result.compute())

        Args:
            sql (:obj:`str`): The query string to execute
            return_futures (:obj:`bool`): If True (the default), return the lazy dask dataframe;
                if False, trigger the computation and return the data itself.

        Returns:
            :obj:`dask.dataframe.DataFrame` or :obj:`pandas.DataFrame`: the data frame created by this query.

        """
        rel, select_names, _ = self._get_ral(sql)

        dc = RelConverter.convert(rel, context=self)

        if dc is None:
            return

        if select_names:
            # Rename any columns named EXPR$* to a more human readable name
            cc = dc.column_container
            cc = cc.rename({
                df_col:
                df_col if not df_col.startswith("EXPR$") else select_name
                for df_col, select_name in zip(cc.columns, select_names)
            })
            dc = DataContainer(dc.df, cc)

        df = dc.assign()
        if not return_futures:
            df = df.compute()

        return df
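
A minimal usage sketch for this method, assuming these methods live on a dask_sql Context as the snippets suggest (registration via register_dask_table is shown later in Example #28); the table name and data are illustrative:

import pandas as pd
import dask.dataframe as dd
from dask_sql import Context  # assumed import path for the Context shown in these examples

c = Context()
ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=1)
c.register_dask_table(ddf, "my_table")

# return_futures=True (the default) returns the lazy dask dataframe
lazy_result = c.sql("SELECT a, b FROM my_table")
print(lazy_result.compute())

# return_futures=False computes eagerly and returns the data itself
eager_result = c.sql("SELECT a, b FROM my_table", return_futures=False)
print(eager_result)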
Example #2
    def sql(self, sql: str) -> dd.DataFrame:
        """
        Query the registered tables with the given SQL.
        The SQL follows approximately the PostgreSQL standard; however, not all
        operations are implemented yet.
        In general, only SELECT statements (no data manipulation) work.

        For more information, see :ref:`sql`.

        Example:
            In this example, a query is called
            using the registered tables and then
            executed using dask.

            .. code-block:: python

                result = c.sql("SELECT a, b FROM my_table")
                print(result.compute())

        Args:
            sql (:obj:`str`): The query string to execute

        Returns:
            :obj:`dask.dataframe.DataFrame`: the data frame created by this query.

        """
        try:
            rel, select_names = self._get_ral(sql)
            dc = RelConverter.convert(rel, context=self)
        except (ValidationException, SqlParseException) as e:
            logger.debug(f"Original exception raised by Java:\n {e}")
            # We do not want to re-raise an exception here
            # as this would print the full java stack trace
            # if debug is not set.
            # Instead, we raise a nice exception
            raise ParsingException(sql, str(e.message())) from None

        if dc is not None:
            if select_names:
                # Rename any columns named EXPR$* to a more human readable name
                cc = dc.column_container
                cc = cc.rename({
                    df_col:
                    df_col if not df_col.startswith("EXPR$") else select_name
                    for df_col, select_name in zip(cc.columns, select_names)
                })
                dc = DataContainer(dc.df, cc)

            return dc.assign()
Example #3
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier with having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(
            rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract which columns we need to group by
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        dc = DataContainer(df, cc)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            logger.debug("Performing full-table aggregation")

        # Do all aggregates
        df_result, output_column_order = self._do_aggregations(
            rel,
            dc,
            group_columns,
            context,
        )

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_result.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #4
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # There should not be any input. This is the first step.
        self.assert_inputs(rel, 0)

        rex_expression_rows = list(rel.getTuples())
        rows = []
        for rex_expression_row in rex_expression_rows:
            # We convert each of the cells in the row
            # using a RexConverter.
            # As we do not have any information on the
            # column headers, we just name them with
            # their index.
            rows.append({
                str(i): RexConverter.convert(rex_cell, None, context=context)
                for i, rex_cell in enumerate(rex_expression_row)
            })

        # TODO: we explicitly reference pandas and dask here -> might be worth making this more general
        # We assume here that when using the values plan, the resulting dataframe will be quite small
        if rows:
            df = pd.DataFrame(rows)
        else:
            field_names = [str(x) for x in rel.getRowType().getFieldNames()]
            df = pd.DataFrame(columns=field_names)

        df = dd.from_pandas(df, npartitions=1)
        cc = ColumnContainer(df.columns)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #5
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)

        # During optimization, some constants might end up in an internal
        # constant pool. We need to dereference them here, as they
        # are treated as "normal" columns.
        # Unfortunately they are only referenced by their index
        # (which comes after the real columns), so we always need
        # to subtract the number of real columns.
        constants = list(rel.getConstants())
        constant_count_offset = len(dc.column_container.columns)

        # Output to the right field names right away
        field_names = rel.getRowType().getFieldNames()

        for window in rel.getGroups():
            dc = self._apply_window(window, constants, constant_count_offset,
                                    dc, field_names, context)

        # Finally, fix the output schema if needed
        df = dc.df
        cc = dc.column_container

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())

        return dc
Example #6
    def fix_dtype_to_row_type(
        dc: DataContainer, row_type: "org.apache.calcite.rel.type.RelDataType"
    ):
        """
        Fix the dtype of the given data container (or: the df within it)
        to the data type given as argument.
        To prevent unneeded conversions, only convert if really needed;
        e.g. if the two types are "similar" enough, do not convert.
        Similarity means the same general type (int, float, string etc.),
        but not necessarily the same size (int64 and int32 are compatible)
        or nullability.
        TODO: we should check the nullability of the SQL type
        """
        df = dc.df
        cc = dc.column_container

        field_types = {
            int(field.getIndex()): str(field.getType())
            for field in row_type.getFieldList()
        }

        for index, field_type in field_types.items():
            expected_type = sql_to_python_type(field_type)
            field_name = cc.get_backend_by_frontend_index(index)

            df = cast_column_type(df, field_name, expected_type)

        return DataContainer(df, dc.column_container)
Example #7
def to_dc(
    input_item: InputType,
    file_format: str = None,
    persist: bool = True,
    hive_table_name: str = None,
    hive_schema_name: str = "default",
    **kwargs,
) -> DataContainer:
    """
    Turn possible input descriptions or formats (e.g. dask dataframes, pandas dataframes,
    locations as strings, hive tables) into loaded data containers,
    optionally persisting them to cluster memory first.
    """
    filled_get_dask_dataframe = lambda *args: _get_dask_dataframe(
        *args,
        file_format=file_format,
        persist=persist,
        hive_table_name=hive_table_name,
        hive_schema_name=hive_schema_name,
        **kwargs,
    )

    if isinstance(input_item, list):
        table = dd.concat(
            [filled_get_dask_dataframe(item) for item in input_item])
    else:
        table = filled_get_dask_dataframe(input_item)

    if persist:
        table = table.persist()

    return DataContainer(table.copy(), ColumnContainer(table.columns))
Example #8
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        components = list(map(str, sql.getTable().names))

        tableName = components[-1]

        if len(components) == 2:
            if components[0] != context.schema_name:
                raise AttributeError(f"Schema {components[0]} is not defined.")
        elif len(components) > 2:
            raise AttributeError(
                "Table specification must be in the form [schema.]table")

        dc = context.tables[tableName]
        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #9
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        components = list(map(str, sql.getTable().names))

        # some queries might also include the database
        # as we do not have such a concept, we just get rid of it
        components = components[-2:]
        tableName = components[-1]

        if len(components) == 2:
            if components[0] != context.schema_name:
                raise AttributeError(f"Schema {components[0]} is not defined.")

        try:
            dc = context.tables[tableName]
        except KeyError:  # pragma: no cover
            raise AttributeError(f"Table {tableName} is not defined.")

        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #10
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # There should not be any input. This is the first step.
        self.assert_inputs(rel, 0)

        # The table(s) we need to return
        table = rel.getTable()

        # The qualified table name is split by "."
        # We assume it always has the form something.something,
        # and the first something is fixed to "schema" by the context.
        # For us, it makes no difference anyway.
        table_names = [str(n) for n in table.getQualifiedName()]
        assert table_names[0] == context.schema_name
        assert len(table_names) == 2
        table_name = table_names[1]
        table_name = table_name.lower()

        dc = context.tables[table_name]
        df = dc.df
        cc = dc.column_container

        # Make sure we only return the requested columns
        row_type = table.getRowType()
        field_specifications = [str(f) for f in row_type.getFieldNames()]
        cc = cc.limit_to(field_specifications)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #11
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        df = pd.DataFrame({"Schema": [context.schema_name]})

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #12
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)
        df = dc.df
        cc = dc.column_container

        sort_collation = rel.getCollation().getFieldCollations()
        sort_columns = [
            cc.get_backend_by_frontend_index(int(x.getFieldIndex()))
            for x in sort_collation
        ]
        sort_ascending = [
            str(x.getDirection()) == "ASCENDING" for x in sort_collation
        ]

        offset = rel.offset
        if offset:
            offset = RexConverter.convert(offset, df, context=context)

        end = rel.fetch
        if end:
            end = RexConverter.convert(end, df, context=context)

            if offset:
                end += offset

        if sort_columns:
            df = self._apply_sort(df, sort_columns, sort_ascending)

        if offset is not None or end is not None:
            df = self._apply_offset(df, offset, end)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        # No column type has changed, so no need to cast again
        return DataContainer(df, cc)
Example #13
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)
        df = dc.df
        cc = dc.column_container

        sort_collation = rel.getCollation().getFieldCollations()
        sort_columns = [
            cc.get_backend_by_frontend_index(int(x.getFieldIndex()))
            for x in sort_collation
        ]

        ASCENDING = org.apache.calcite.rel.RelFieldCollation.Direction.ASCENDING
        FIRST = org.apache.calcite.rel.RelFieldCollation.NullDirection.FIRST
        sort_ascending = [
            x.getDirection() == ASCENDING for x in sort_collation
        ]
        sort_null_first = [x.nullDirection == FIRST for x in sort_collation]

        df = df.persist()
        df = apply_sort(df, sort_columns, sort_ascending, sort_null_first)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        # No column type has changed, so no need to cast again
        return DataContainer(df, cc)
Example #14
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)
        df = dc.df
        cc = dc.column_container

        parameters = rel.getSamplingParameters()
        is_bernoulli = parameters.isBernoulli()
        fraction = float(parameters.getSamplingPercentage())
        seed = parameters.getRepeatableSeed() if parameters.isRepeatable(
        ) else None

        if is_bernoulli:
            df = df.sample(frac=fraction, replace=False, random_state=seed)
        else:
            random_state = np.random.RandomState(seed)
            random_choice = random_state.choice(
                [True, False],
                size=df.npartitions,
                replace=True,
                p=[fraction, 1 - fraction],
            )

            if random_choice.any():
                df = df.partitions[random_choice]
            else:
                df = df.head(0, compute=False)

        return DataContainer(df, cc)
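
To illustrate the non-Bernoulli (system sampling) branch above in isolation: whole partitions are kept or dropped with probability equal to the sampling fraction, which is cheap but coarser than row-wise Bernoulli sampling. A minimal standalone sketch using only the dask/numpy calls that appear in the snippet (fraction and seed are illustrative values):

import numpy as np
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(100)}), npartitions=10)
fraction, seed = 0.3, 42

# Keep or drop each partition as a whole with probability `fraction`
random_state = np.random.RandomState(seed)
keep = random_state.choice(
    [True, False], size=ddf.npartitions, replace=True, p=[fraction, 1 - fraction]
)
sampled = ddf.partitions[keep] if keep.any() else ddf.head(0, compute=False)
print(len(sampled.compute()))  # a multiple of the partition size, depending on the seed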
Example #15
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema = str(sql.getSchema()).split(".")[-1]
        if schema != context.schema_name:
            raise AttributeError(f"Schema {schema} is not defined.")

        df = pd.DataFrame({"Table": list(context.tables.keys())})

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #16
    def convert(
        self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
    ) -> DataContainer:
        # Get the input of the previous step
        (dc,) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # Collect all (new) columns
        named_projects = rel.getNamedProjects()

        column_names = []
        new_columns = {}
        new_mappings = {}
        for expr, key in named_projects:
            key = str(key)
            column_names.append(key)

            # Shortcut: if we already have a column, there is no need to assign it again.
            # this is only the case if the expr is a RexInputRef
            if isinstance(expr, org.apache.calcite.rex.RexInputRef):
                index = expr.getIndex()
                backend_column_name = cc.get_backend_by_frontend_index(index)
                logger.debug(
                    f"Not re-adding the same column {key} (but just referencing it)"
                )
                new_mappings[key] = backend_column_name
            else:
                random_name = new_temporary_column(df)
                new_columns[random_name] = RexConverter.convert(
                    expr, dc, context=context
                )
                logger.debug(f"Adding a new column {key} out of {expr}")
                new_mappings[key] = random_name

        # Actually add the new columns
        if new_columns:
            df = df.assign(**new_columns)

        # and the new mappings
        for key, backend_column_name in new_mappings.items():
            cc = cc.add(key, backend_column_name)

        # Make sure the order is correct
        cc = cc.limit_to(column_names)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #17
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        # "information_schema" is a schema which is found in every presto database
        schema = context.schema_name
        df = pd.DataFrame({"Schema": [schema, "information_schema"]})

        # We currently do not use the passed additional parameter FROM.
        like = str(sql.like).strip("'")
        if like and like != "None":
            df = df[df.Schema == like]

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #18
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        first_dc, second_dc = self.assert_inputs(rel, 2, context)

        first_df = first_dc.df
        first_cc = first_dc.column_container

        second_df = second_dc.df
        second_cc = second_dc.column_container

        # For concatenating, they should have exactly the same fields
        output_field_names = [str(x) for x in rel.getRowType().getFieldNames()]
        assert len(first_cc.columns) == len(output_field_names)
        first_cc = first_cc.rename(
            columns={
                col: output_col
                for col, output_col in zip(first_cc.columns,
                                           output_field_names)
            })
        first_dc = DataContainer(first_df, first_cc)

        assert len(second_cc.columns) == len(output_field_names)
        second_cc = second_cc.rename(
            columns={
                col: output_col
                for col, output_col in zip(second_cc.columns,
                                           output_field_names)
            })
        second_dc = DataContainer(second_df, second_cc)

        # To concat the two dataframes, we need to make sure the
        # columns actually have the specified names in the
        # column containers.
        # Otherwise the concat won't work.
        first_df = first_dc.assign()
        second_df = second_dc.assign()

        self.check_columns_from_row_type(first_df,
                                         rel.getExpectedInputRowType(0))
        self.check_columns_from_row_type(second_df,
                                         rel.getExpectedInputRowType(1))

        df = dd.concat([first_df, second_df])

        if not rel.all:
            df = df.drop_duplicates()

        cc = ColumnContainer(df.columns)
        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #19
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)
        df = dc.df
        cc = dc.column_container

        # All the logic is handled in the RexConverter;
        # we just need to apply it here
        condition = rel.getCondition()
        df_condition = RexConverter.convert(condition, dc, context=context)
        df = filter_or_scalar(df, df_condition)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        # No column type has changed, so no need to convert again
        return DataContainer(df, cc)
Example #20
    def fix_dtype_to_row_type(
            dc: DataContainer,
            row_type: "org.apache.calcite.rel.type.RelDataType"):
        """
        Fix the dtype of the given data container (or: the df within it)
        to the data type given as argument.
        To prevent unneeded conversions, only convert if really needed;
        e.g. if the two types are "similar" enough, do not convert.
        Similarity means the same general type (int, float, string etc.),
        but not necessarily the same size (int64 and int32 are compatible)
        or nullability.
        TODO: we should check the nullability of the SQL type
        """
        df = dc.df
        cc = dc.column_container

        field_types = {
            int(field.getIndex()): str(field.getType())
            for field in row_type.getFieldList()
        }

        for index, field_type in field_types.items():
            expected_type = sql_to_python_type(field_type)
            field_name = cc.get_backend_by_frontend_index(index)
            current_type = df[field_name].dtype

            logger.debug(
                f"Column {field_name} has type {current_type}, expecting {expected_type}..."
            )

            if similar_type(current_type, expected_type):
                logger.debug("...not converting.")
                continue

            current_float = pd.api.types.is_float_dtype(current_type)
            expected_integer = pd.api.types.is_integer_dtype(expected_type)
            if current_float and expected_integer:
                logger.debug("...truncating...")
                df[field_name] = da.trunc(df[field_name])

            logger.debug(
                f"Need to cast {field_name} from {current_type} to {expected_type}"
            )
            df[field_name] = df[field_name].astype(expected_type)

        return DataContainer(df, dc.column_container)
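
The float-to-integer path above truncates before casting, so fractional values are cut off towards zero instead of raising or rounding. A small standalone sketch of that behavior, using the same dask calls as the method (the column name and values are illustrative):

import pandas as pd
import dask.array as da
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": [1.2, 2.7, -3.5]}), npartitions=1)

# Truncate towards zero first, then cast: 2.7 becomes 2 and -3.5 becomes -3
ddf["x"] = da.trunc(ddf["x"])
ddf["x"] = ddf["x"].astype("int64")
print(ddf.compute())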
Example #21
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema = sql.getSchema()
        if schema is not None:
            schema = str(schema).split(".")[-1]
        else:
            schema = context.DEFAULT_SCHEMA_NAME

        if schema not in context.schema:
            raise AttributeError(f"Schema {schema} is not defined.")

        df = pd.DataFrame(
            {"Table": list(context.schema[schema].tables.keys())})

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #22
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema_name, name = context.fqn(sql.getTable())
        dc = context.schema[schema_name].tables[name]

        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #23
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        select = sql.getSelect()
        distribute_list = [str(col) for col in sql.getDistributeList()]

        sql_select_query = context._to_sql_string(select)
        df = context.sql(sql_select_query)
        logger.debug(f"Extracted sub-dataframe as {LoggableDataFrame(df)}")

        logger.debug(f"Will now shuffle according to {distribute_list}")

        # Perform the distribute by operation via a Dask shuffle
        df = df.shuffle(distribute_list)

        cc = ColumnContainer(df.columns)
        dc = DataContainer(df, cc)

        return dc
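
A hedged sketch of the Dask shuffle this plugin relies on: after shuffling on the distribute columns, rows with equal key values end up in the same partition. This mirrors the df.shuffle call above but is not tied to the SQL layer; the data is illustrative:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"key": ["a", "b", "a", "c", "b"], "val": range(5)}),
    npartitions=2,
)

# Rows with the same "key" land in the same partition after the shuffle
shuffled = ddf.shuffle(["key"])
print(shuffled.compute())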
Example #24
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        components = list(map(str, sql.getTable().names))
        dc = get_table_from_compound_identifier(context, components)

        cols = dc.column_container.columns
        dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame(
            {
                "Column": cols,
                "Type": dtypes,
                "Extra": [""] * len(cols),
                "Comment": [""] * len(cols),
            }
        )

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #25
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema_name, model_name = context.fqn(
            sql.getModelName().getIdentifier())

        if model_name not in context.schema[schema_name].models:
            raise RuntimeError(
                f"A model with the name {model_name} is not present.")

        model, training_columns = context.schema[schema_name].models[
            model_name]

        model_params = model.get_params()
        model_params["training_columns"] = training_columns.tolist()

        df = pd.DataFrame.from_dict(model_params,
                                    orient="index",
                                    columns=["Params"])
        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #26
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)
        df = dc.df
        cc = dc.column_container

        offset = rel.getOffset()
        if offset:
            offset = RexConverter.convert(offset, df, context=context)

        end = rel.getFetch()
        if end:
            end = RexConverter.convert(end, df, context=context)

            if offset:
                end += offset

        df = self._apply_limit(df, offset, end)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        # No column type has changed, so no need to cast again
        return DataContainer(df, cc)
Example #27
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        components = list(map(str, sql.getTableName().names))
        dc = get_table_from_compound_identifier(context, components)
        columns = list(map(str, sql.getColumnList()))

        if not columns:
            columns = dc.column_container.columns

        # Define some useful shortcuts
        mapping = dc.column_container.get_backend_by_frontend_name
        df = dc.df

        # Calculate statistics
        statistics = dd.from_pandas(
            pd.DataFrame({col: [] for col in columns}), npartitions=1
        )
        statistics = statistics.append(df[[mapping(col) for col in columns]].describe())

        # Add additional information
        statistics = statistics.append(
            pd.Series(
                {
                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                    for col in columns
                },
                name="data_type",
            )
        )
        statistics = statistics.append(
            pd.Series({col: col for col in columns}, name="col_name",)
        )

        cc = ColumnContainer(statistics.columns)
        dc = DataContainer(statistics, cc)
        return dc
Example #28
    def register_dask_table(self, df: dd.DataFrame, name: str):
        """
        Registering a dask table makes it usable in SQL queries.
        The name you give here can be used as table name in the SQL later.

        Please note that the table is stored as it is now.
        If you change the table later, you need to re-register it.

        Example:
            This code registers a data frame as table "data"
            and then uses it in a query.

            .. code-block:: python

                c.register_dask_table(df, "data")
                df_result = c.sql("SELECT a, b FROM data")

        Args:
            df (:class:`dask.dataframe.DataFrame`): The data frame to register
            name (:obj:`str`): The name under which the new table will be addressable

        """
        self.tables[name.lower()] = DataContainer(df.copy(),
                                                  ColumnContainer(df.columns))
Example #29
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier with having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(
            rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract which columns we need to group by
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        # Always keep an additional column around for empty groups and aggregates
        additional_column_name = str(uuid.uuid4())

        # NOTE: it might be the case that
        # we do not need this additional
        # column, but hopefully adding a single
        # column of 1 is not so problematic...
        df = df.assign(**{additional_column_name: 1})
        cc = cc.add(additional_column_name)
        dc = DataContainer(df, cc)

        # Collect all aggregates
        filtered_aggregations, output_column_order = self._collect_aggregations(
            rel, dc, group_columns, additional_column_name, context)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            # It is important to do this after creating the aggregations,
            # as we do not want this additional column to be used anywhere
            group_columns = [additional_column_name]

            logger.debug("Performing full-table aggregation")

        # Now we can perform the aggregates
        # We iterate through all pairs of (possibly pre-filtered)
        # dataframes and the aggregations to perform on this data...
        df_agg = None
        for filtered_df_desc, aggregation in filtered_aggregations.items():
            filtered_column = filtered_df_desc.filtered_column
            if filtered_column:
                logger.debug(
                    f"Aggregating {dict(aggregation)} on the data filtered by {filtered_column}"
                )
            else:
                logger.debug(f"Aggregating {dict(aggregation)} on the data")

            # ... we perform the aggregations ...
            filtered_df = filtered_df_desc.df
            # TODO: we could use the type information for
            # pre-calculating the meta information
            filtered_df_agg = filtered_df.groupby(
                by=group_columns).agg(aggregation)

            # ... fix the column names to a single level ...
            filtered_df_agg.columns = filtered_df_agg.columns.get_level_values(
                -1)

            # ... and finally concat the new data with the already present columns
            if df_agg is None:
                df_agg = filtered_df_agg
            else:
                df_agg = df_agg.assign(**{
                    col: filtered_df_agg[col]
                    for col in filtered_df_agg.columns
                })

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_agg.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
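
The trick of grouping by an additional constant column (so a query without GROUP BY can reuse the same groupby code path) can be seen in isolation in this small sketch; the column name "_constant_group" is hypothetical, the real code above generates one via uuid4:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3, 4]}), npartitions=2)

# No GROUP BY in the query: group everything under one constant key instead
additional_column_name = "_constant_group"  # hypothetical name for illustration
ddf = ddf.assign(**{additional_column_name: 1})

result = ddf.groupby(by=[additional_column_name]).agg({"a": "sum"})
print(result.compute())  # a single row with the full-table sum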
Example #30
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # Joining is a bit more complicated, so lets do it in steps:

        # 1. We now have two inputs (from left and right), so we fetch them both
        dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
        cc_lhs = dc_lhs.column_container
        cc_rhs = dc_rhs.column_container

        # 2. dask's merge will do some smart things with columns which have the same name
        # on lhs and rhs (which also includes reordering).
        # However, that will confuse our column numbering in SQL.
        # So we make our life easier by converting the column names into unique names
        # We will convert back in the end
        cc_lhs_renamed = cc_lhs.make_unique("lhs")
        cc_rhs_renamed = cc_rhs.make_unique("rhs")

        dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
        dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

        df_lhs_renamed = dc_lhs_renamed.assign()
        df_rhs_renamed = dc_rhs_renamed.assign()

        join_type = rel.getJoinType()
        join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

        # 3. The join condition can have two forms that we can understand:
        # (a) a = b
        # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
        # The first case is very simple and we do not need any additional filter.
        # In the second case we do a merge on all the a = b,
        # and then apply a filter using the other expressions.
        # In all other cases, we need to do a full table cross join and filter afterwards.
        # This is probably prohibitive for large tables, but there is no other
        # known solution so far.
        join_condition = rel.getCondition()
        lhs_on, rhs_on, filter_condition = self._split_join_condition(
            join_condition)

        logger.debug(
            f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

        # lhs_on and rhs_on are the indices of the columns to merge on.
        # The given column indices are for the full, merged table which consists
        # of lhs and rhs put side-by-side (in this order)
        # We therefore need to normalize the rhs indices relative to the rhs table.
        rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

        # 4. dask can only merge on the same column names.
        # We therefore create new columns on purpose, which have a distinct name.
        assert len(lhs_on) == len(rhs_on)
        if lhs_on:
            # 5. Now we can finally merge on these columns
            # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
            # plus the added columns
            df = self._join_on_columns(
                df_lhs_renamed,
                df_rhs_renamed,
                lhs_on,
                rhs_on,
                join_type,
            )
        else:
            # 5. We are in the complex join case
            # where we have no column to merge on
            # This means we have no other chance than to merge
            # everything with everything...

            # TODO: we should implement a shortcut
            # for filter conditions that are always false

            def merge_single_partitions(lhs_partition, rhs_partition):
                # Do a cross join with the two partitions
                # TODO: it would be nice to apply the filter already here
                # problem: this would mean we need to ship the rex to the
                # workers (as this is executed on the workers),
                # which is definitely not possible (java dependency, JVM start...)
                lhs_partition = lhs_partition.assign(common=1)
                rhs_partition = rhs_partition.assign(common=1)

                return lhs_partition.merge(rhs_partition,
                                           on="common").drop(columns="common")

            # Iterate nested over all partitions from lhs and rhs and merge them
            name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
            dsk = {(name, i * df_rhs_renamed.npartitions + j): (
                merge_single_partitions,
                (df_lhs_renamed._name, i),
                (df_rhs_renamed._name, j),
            )
                   for i in range(df_lhs_renamed.npartitions)
                   for j in range(df_rhs_renamed.npartitions)}

            graph = HighLevelGraph.from_collections(
                name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed])

            meta = dd.dispatch.concat(
                [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty],
                axis=1)
            # TODO: Do we know the divisions in any way here?
            divisions = [None] * (len(dsk) + 1)
            df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

            warnings.warn(
                "Need to do a cross-join, which is typically very resource heavy",
                ResourceWarning,
            )

        # 6. So the next step is to make sure
        # we have the correct column order (and to remove the temporary join columns)
        correct_column_order = list(df_lhs_renamed.columns) + list(
            df_rhs_renamed.columns)
        cc = ColumnContainer(df.columns).limit_to(correct_column_order)

        # and to rename them like the rel specifies
        row_type = rel.getRowType()
        field_specifications = [str(f) for f in row_type.getFieldNames()]
        cc = cc.rename({
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        })
        cc = self.fix_column_to_row_type(cc, row_type)
        dc = DataContainer(df, cc)

        # 7. Last but not least we apply any filters by and-chaining together the filters
        if filter_condition:
            # This line is a bit of code duplication with RexCallPlugin - but I guess it is worth keeping it separate
            filter_condition = reduce(
                operator.and_,
                [
                    RexConverter.convert(rex, dc, context=context)
                    for rex in filter_condition
                ],
            )
            logger.debug(f"Additionally applying filter {filter_condition}")
            df = filter_or_scalar(df, filter_condition)
            dc = DataContainer(df, cc)

        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
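
The cross-join fallback in merge_single_partitions relies on a common trick: assign a constant key to both sides, merge on it, and drop it again. A minimal pandas-only illustration of that trick (the data is illustrative):

import pandas as pd

lhs = pd.DataFrame({"a": [1, 2]})
rhs = pd.DataFrame({"b": ["x", "y", "z"]})

# A constant "common" column on both sides turns the merge into a cross product
cross = (
    lhs.assign(common=1)
    .merge(rhs.assign(common=1), on="common")
    .drop(columns="common")
)
print(cross)  # 2 * 3 = 6 rows, every combination of a and b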