Example #1
def test_cc_init():
    c = ColumnContainer(["a", "b", "c"])

    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]

    c = ColumnContainer(["a", "b", "c"], {"a": "1", "b": "2", "c": "3"})

    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "1"), ("b", "2"), ("c", "3")]
Example #2
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        components = list(map(str, sql.getTable().names))

        # some queries might also include the database
        # as we do not have such a concept, we just get rid of it
        components = components[-2:]
        tableName = components[-1]

        if len(components) == 2:
            if components[0] != context.schema_name:
                raise AttributeError(f"Schema {components[0]} is not defined.")

        try:
            dc = context.tables[tableName]
        except KeyError:  # pragma: no cover
            raise AttributeError(f"Table {tableName} is not defined.")

        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #3
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        df = pd.DataFrame({"Schema": [context.schema_name]})

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #4
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        components = list(map(str, sql.getTable().names))

        tableName = components[-1]

        if len(components) == 2:
            if components[0] != context.schema_name:
                raise AttributeError(f"Schema {components[0]} is not defined.")
        elif len(components) > 2:
            raise AttributeError(
                "Table specification must be in the form [schema.]table")

        dc = context.tables[tableName]
        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #5
def to_dc(
    input_item: InputType,
    file_format: str = None,
    persist: bool = True,
    hive_table_name: str = None,
    hive_schema_name: str = "default",
    **kwargs,
) -> DataContainer:
    """
    Turn possible input descriptions or formats (e.g. dask dataframes, pandas dataframes,
    locations given as strings, hive tables) into loaded data containers,
    optionally persisting them to cluster memory beforehand.
    """
    filled_get_dask_dataframe = lambda *args: _get_dask_dataframe(
        *args,
        file_format=file_format,
        persist=persist,
        hive_table_name=hive_table_name,
        hive_schema_name=hive_schema_name,
        **kwargs,
    )

    if isinstance(input_item, list):
        table = dd.concat(
            [filled_get_dask_dataframe(item) for item in input_item])
    else:
        table = filled_get_dask_dataframe(input_item)

    if persist:
        table = table.persist()

    return DataContainer(table.copy(), ColumnContainer(table.columns))
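The docstring above describes to_dc as the generic entry point for turning supported inputs into a DataContainer. A minimal usage sketch, assuming to_dc is importable from dask_sql.input_utils and using a small made-up pandas frame (both the import location and the data are assumptions):

import pandas as pd

from dask_sql.input_utils import to_dc  # assumed import location

# A pandas DataFrame is one of the accepted input types; a path string
# combined with file_format would go through the same machinery.
pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
dc = to_dc(pdf, persist=False)

print(dc.column_container.columns)  # ['a', 'b']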
Example #6
def test_cc_add():
    c = ColumnContainer(["a", "b", "c"])

    c2 = c.add("d")

    assert c2.columns == ["a", "b", "c", "d"]
    assert c2.mapping() == [("a", "a"), ("b", "b"), ("c", "c"), ("d", "d")]
    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]

    c2 = c.add("d", "D")

    assert c2.columns == ["a", "b", "c", "d"]
    assert c2.mapping() == [("a", "a"), ("b", "b"), ("c", "c"), ("d", "D")]
    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]

    c2 = c.add("d", "a")

    assert c2.columns == ["a", "b", "c", "d"]
    assert c2.mapping() == [("a", "a"), ("b", "b"), ("c", "c"), ("d", "a")]
    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]

    c2 = c.add("a", "b")

    assert c2.columns == ["a", "b", "c"]
    assert c2.mapping() == [("a", "b"), ("b", "b"), ("c", "c")]
    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]
Example #7
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # There should not be any input. This is the first step.
        self.assert_inputs(rel, 0)

        rex_expression_rows = list(rel.getTuples())
        rows = []
        for rex_expression_row in rex_expression_rows:
            # We convert each of the cells in the row
            # using a RexConverter.
            # As we do not have any information on the
            # column headers, we just name them with
            # their index.
            rows.append({
                str(i): RexConverter.convert(rex_cell, None, context=context)
                for i, rex_cell in enumerate(rex_expression_row)
            })

        # TODO: we explicitly reference pandas and dask here -> might be worth making this more general
        # We assume here that when using the values plan, the resulting dataframe will be quite small
        if rows:
            df = pd.DataFrame(rows)
        else:
            field_names = [str(x) for x in rel.getRowType().getFieldNames()]
            df = pd.DataFrame(columns=field_names)

        df = dd.from_pandas(df, npartitions=1)
        cc = ColumnContainer(df.columns)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
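Because a VALUES clause carries no column headers, the plugin above names each cell after its stringified index. A minimal pandas sketch of that row construction (values are made up):

import pandas as pd

# Each row is a dict keyed by the stringified cell index ...
rows = [{"0": 1, "1": "a"}, {"0": 2, "1": "b"}]
df = pd.DataFrame(rows)

# ... so the resulting columns are simply named "0", "1", ...
print(list(df.columns))  # ['0', '1']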
Example #8
def test_cc_rename():
    c = ColumnContainer(["a", "b", "c"])

    c2 = c.rename({"a": "A", "b": "a"})

    assert c2.columns == ["A", "a", "c"]
    assert c2.mapping() == [("a", "b"), ("b", "b"), ("c", "c"), ("A", "a")]
    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]
Example #9
def test_cc_limit_to():
    c = ColumnContainer(["a", "b", "c"])

    c2 = c.limit_to(["c", "a"])

    assert c2.columns == ["c", "a"]
    assert c2.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]
    assert c.columns == ["a", "b", "c"]
    assert c.mapping() == [("a", "a"), ("b", "b"), ("c", "c")]
Example #10
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema = str(sql.getSchema()).split(".")[-1]
        if schema != context.schema_name:
            raise AttributeError(f"Schema {schema} is not defined.")

        df = pd.DataFrame({"Table": list(context.tables.keys())})

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #11
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        first_dc, second_dc = self.assert_inputs(rel, 2, context)

        first_df = first_dc.df
        first_cc = first_dc.column_container

        second_df = second_dc.df
        second_cc = second_dc.column_container

        # For concatenating, they should have exactly the same fields
        output_field_names = [str(x) for x in rel.getRowType().getFieldNames()]
        assert len(first_cc.columns) == len(output_field_names)
        first_cc = first_cc.rename(
            columns={
                col: output_col
                for col, output_col in zip(first_cc.columns,
                                           output_field_names)
            })
        first_dc = DataContainer(first_df, first_cc)

        assert len(second_cc.columns) == len(output_field_names)
        second_cc = second_cc.rename(
            columns={
                col: output_col
                for col, output_col in zip(second_cc.columns,
                                           output_field_names)
            })
        second_dc = DataContainer(second_df, second_cc)

        # To concat the two dataframes, we need to make sure the
        # columns actually have the specified names in the
        # column containers
        # Otherwise the concat won't work
        first_df = first_dc.assign()
        second_df = second_dc.assign()

        self.check_columns_from_row_type(first_df,
                                         rel.getExpectedInputRowType(0))
        self.check_columns_from_row_type(second_df,
                                         rel.getExpectedInputRowType(1))

        df = dd.concat([first_df, second_df])

        if not rel.all:
            df = df.drop_duplicates()

        cc = ColumnContainer(df.columns)
        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #12
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        # "information_schema" is a schema which is found in every presto database
        schema = context.schema_name
        df = pd.DataFrame({"Schema": [schema, "information_schema"]})

        # We currently do not use the passed additional parameter FROM.
        like = str(sql.like).strip("'")
        if like and like != "None":
            df = df[df.Schema == like]

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #13
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier by having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(
            rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract which columns we need to group by
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        dc = DataContainer(df, cc)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            logger.debug("Performing full-table aggregation")

        # Do all aggregates
        df_result, output_column_order = self._do_aggregations(
            rel,
            dc,
            group_columns,
            context,
        )

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_result.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
Example #14
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema = sql.getSchema()
        if schema is not None:
            schema = str(schema).split(".")[-1]
        else:
            schema = context.DEFAULT_SCHEMA_NAME

        if schema not in context.schema:
            raise AttributeError(f"Schema {schema} is not defined.")

        df = pd.DataFrame(
            {"Table": list(context.schema[schema].tables.keys())})

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #15
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        select = sql.getSelect()
        distribute_list = [str(col) for col in sql.getDistributeList()]

        sql_select_query = context._to_sql_string(select)
        df = context.sql(sql_select_query)
        logger.debug(f"Extracted sub-dataframe as {LoggableDataFrame(df)}")

        logger.debug(f"Will now shuffle according to {distribute_list}")

        # Perform the distribute by operation via a Dask shuffle
        df = df.shuffle(distribute_list)

        cc = ColumnContainer(df.columns)
        dc = DataContainer(df, cc)

        return dc
Example #16
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema_name, name = context.fqn(sql.getTable())
        dc = context.schema[schema_name].tables[name]

        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #17
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        components = list(map(str, sql.getTable().names))
        dc = get_table_from_compound_identifier(context, components)

        cols = dc.column_container.columns
        dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame(
            {
                "Column": cols,
                "Type": dtypes,
                "Extra": [""] * len(cols),
                "Comment": [""] * len(cols),
            }
        )

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #18
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema_name, model_name = context.fqn(
            sql.getModelName().getIdentifier())

        if model_name not in context.schema[schema_name].models:
            raise RuntimeError(
                f"A model with the name {model_name} is not present.")

        model, training_columns = context.schema[schema_name].models[
            model_name]

        model_params = model.get_params()
        model_params["training_columns"] = training_columns.tolist()

        df = pd.DataFrame.from_dict(model_params,
                                    orient="index",
                                    columns=["Params"])
        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Example #19
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        components = list(map(str, sql.getTableName().names))
        dc = get_table_from_compound_identifier(context, components)
        columns = list(map(str, sql.getColumnList()))

        if not columns:
            columns = dc.column_container.columns

        # Define some useful shortcuts
        mapping = dc.column_container.get_backend_by_frontend_name
        df = dc.df

        # Calculate statistics
        statistics = dd.from_pandas(
            pd.DataFrame({col: [] for col in columns}), npartitions=1
        )
        statistics = statistics.append(df[[mapping(col) for col in columns]].describe())

        # Add additional information
        statistics = statistics.append(
            pd.Series(
                {
                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                    for col in columns
                },
                name="data_type",
            )
        )
        statistics = statistics.append(
            pd.Series({col: col for col in columns}, name="col_name",)
        )

        cc = ColumnContainer(statistics.columns)
        dc = DataContainer(statistics, cc)
        return dc
Example #20
    def register_dask_table(self, df: dd.DataFrame, name: str):
        """
        Registering a dask table makes it usable in SQL queries.
        The name you give here can be used as the table name in SQL later.

        Please note that the table is stored as it is now.
        If you change the table later, you need to re-register it.

        Example:
            This code registers a data frame as table "data"
            and then uses it in a query.

            .. code-block:: python

                c.register_dask_table(df, "data")
                df_result = c.sql("SELECT a, b FROM data")

        Args:
            df (:class:`dask.dataframe.DataFrame`): The data frame to register
            name (:obj:`str`): The name under which the new table will be addressable

        """
        self.tables[name.lower()] = DataContainer(df.copy(),
                                                  ColumnContainer(df.columns))
Example #21
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        sql_select = sql.getSelect()
        model_name = str(sql.getModelName().getIdentifier())
        model_type = sql.getModelName().getIdentifierType()
        select_list = sql.getSelectList()

        logger.debug(
            f"Predicting from {model_name} and query {sql_select} to {list(select_list)}"
        )

        IdentifierType = com.dask.sql.parser.SqlModelIdentifier.IdentifierType

        if model_type == IdentifierType.REFERENCE:
            try:
                model, training_columns = context.models[model_name]
            except KeyError:
                raise KeyError(f"No model registered with name {model_name}")
        else:
            raise NotImplementedError(
                f"Do not understand model type {model_type}")

        sql_select_query = context._to_sql_string(sql_select)
        df = context.sql(sql_select_query)

        prediction = model.predict(df[training_columns])
        predicted_df = df.assign(target=prediction)

        # Create a temporary context, which includes the
        # new "table" so that we can use the normal
        # SQL-to-dask-code machinery
        while True:
            # Make sure to choose a name that is not already in use
            temporary_table = str(uuid.uuid4())
            if temporary_table not in context.tables:
                break
            else:  # pragma: no cover
                continue

        tmp_context = copy.deepcopy(context)
        tmp_context.create_table(temporary_table, predicted_df)

        sql_ns = org.apache.calcite.sql
        pos = sql.getParserPosition()
        from_column_list = java.util.ArrayList()
        from_column_list.add(temporary_table)
        from_clause = sql_ns.SqlIdentifier(from_column_list,
                                           pos)  # TODO: correct pos

        outer_select = sql_ns.SqlSelect(
            sql.getParserPosition(),
            None,  # keywordList,
            select_list,  # selectList,
            from_clause,  # from,
            None,  # where,
            None,  # groupBy,
            None,  # having,
            None,  # windowDecls,
            None,  # orderBy,
            None,  # offset,
            None,  # fetch,
            None,  # hints
        )

        sql_outer_query = tmp_context._to_sql_string(outer_select)
        df = tmp_context.sql(sql_outer_query)

        cc = ColumnContainer(df.columns)
        dc = DataContainer(df, cc)

        return dc
Example #22
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier by having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(
            rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract which columns we need to group by
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        # Always keep an additional column around for empty groups and aggregates
        additional_column_name = str(uuid.uuid4())

        # NOTE: it might be the case that
        # we do not need this additional
        # column, but hopefully adding a single
        # column of 1 is not so problematic...
        df = df.assign(**{additional_column_name: 1})
        cc = cc.add(additional_column_name)
        dc = DataContainer(df, cc)

        # Collect all aggregates
        filtered_aggregations, output_column_order = self._collect_aggregations(
            rel, dc, group_columns, additional_column_name, context)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            # It is important to do this after creating the aggregations,
            # as we do not want this additional column to be used anywhere
            group_columns = [additional_column_name]

            logger.debug("Performing full-table aggregation")

        # Now we can perform the aggregates
        # We iterate through all pairs of (possibly pre-filtered)
        # dataframes and the aggregations to perform on this data...
        df_agg = None
        for filtered_df_desc, aggregation in filtered_aggregations.items():
            filtered_column = filtered_df_desc.filtered_column
            if filtered_column:
                logger.debug(
                    f"Aggregating {dict(aggregation)} on the data filtered by {filtered_column}"
                )
            else:
                logger.debug(f"Aggregating {dict(aggregation)} on the data")

            # ... we perform the aggregations ...
            filtered_df = filtered_df_desc.df
            # TODO: we could use the type information for
            # pre-calculating the meta information
            filtered_df_agg = filtered_df.groupby(
                by=group_columns).agg(aggregation)

            # ... fix the column names to a single level ...
            filtered_df_agg.columns = filtered_df_agg.columns.get_level_values(
                -1)

            # ... and finally concat the new data with the already present columns
            if df_agg is None:
                df_agg = filtered_df_agg
            else:
                df_agg = df_agg.assign(**{
                    col: filtered_df_agg[col]
                    for col in filtered_df_agg.columns
                })

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_agg.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
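The constant helper column used above to emulate a full-table aggregation when no GROUP BY columns exist is a standard trick; a minimal pandas sketch (the frame and the _tmp column name are made up):

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3, 4]})

# Grouping by a column that holds the same value everywhere yields exactly
# one group, i.e. an aggregation over the full table.
full_table_sum = df.assign(_tmp=1).groupby("_tmp").agg({"x": "sum"})
print(full_table_sum)  # a single row with x = 10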
Example #23
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # Joining is a bit more complicated, so let's do it in steps:

        # 1. We now have two inputs (from left and right), so we fetch them both
        dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
        cc_lhs = dc_lhs.column_container
        cc_rhs = dc_rhs.column_container

        # 2. dask's merge will do some smart things with columns which have the same name
        # on lhs and rhs (which also includes reordering).
        # However, that will confuse our column numbering in SQL.
        # So we make our life easier by converting the column names into unique names
        # We will convert back in the end
        cc_lhs_renamed = cc_lhs.make_unique("lhs")
        cc_rhs_renamed = cc_rhs.make_unique("rhs")

        dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
        dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

        df_lhs_renamed = dc_lhs_renamed.assign()
        df_rhs_renamed = dc_rhs_renamed.assign()

        join_type = rel.getJoinType()
        join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

        # 3. The join condition can have two forms that we can understand:
        # (a) a = b
        # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
        # The first case is very simple and we do not need any additional filter.
        # In the second case we do a merge on all the a = b,
        # and then apply a filter using the other expressions.
        # In all other cases, we need to do a full-table cross join and filter afterwards.
        # This is probably nonsense for large tables, but there is no other
        # known solution so far.
        join_condition = rel.getCondition()
        lhs_on, rhs_on, filter_condition = self._split_join_condition(
            join_condition)

        logger.debug(
            f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

        # lhs_on and rhs_on are the indices of the columns to merge on.
        # The given column indices are for the full, merged table which consists
        # of lhs and rhs put side-by-side (in this order)
        # We therefore need to normalize the rhs indices relative to the rhs table.
        rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

        # 4. dask can only merge on the same column names.
        # We therefore create new columns on purpose, which have a distinct name.
        assert len(lhs_on) == len(rhs_on)
        if lhs_on:
            # 5. Now we can finally merge on these columns
            # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
            # plus the added columns
            df = self._join_on_columns(
                df_lhs_renamed,
                df_rhs_renamed,
                lhs_on,
                rhs_on,
                join_type,
            )
        else:
            # 5. We are in the complex join case
            # where we have no column to merge on
            # This means we have no other chance than to merge
            # everything with everything...

            # TODO: we should implement a shortcut
            # for filter conditions that are always false

            def merge_single_partitions(lhs_partition, rhs_partition):
                # Do a cross join with the two partitions
                # TODO: it would be nice to apply the filter already here
                # problem: this would mean we need to ship the rex to the
                # workers (as this is executed on the workers),
                # which is definitely not possible (java dependency, JVM start...)
                lhs_partition = lhs_partition.assign(common=1)
                rhs_partition = rhs_partition.assign(common=1)

                return lhs_partition.merge(rhs_partition,
                                           on="common").drop(columns="common")

            # Iterate nested over all partitions from lhs and rhs and merge them
            name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
            dsk = {(name, i * df_rhs_renamed.npartitions + j): (
                merge_single_partitions,
                (df_lhs_renamed._name, i),
                (df_rhs_renamed._name, j),
            )
                   for i in range(df_lhs_renamed.npartitions)
                   for j in range(df_rhs_renamed.npartitions)}

            graph = HighLevelGraph.from_collections(
                name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed])

            meta = dd.dispatch.concat(
                [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty],
                axis=1)
            # TODO: Do we know the divisions in any way here?
            divisions = [None] * (len(dsk) + 1)
            df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

            warnings.warn(
                "Need to do a cross-join, which is typically very resource heavy",
                ResourceWarning,
            )

        # 6. So the next step is to make sure
        # we have the correct column order (and to remove the temporary join columns)
        correct_column_order = list(df_lhs_renamed.columns) + list(
            df_rhs_renamed.columns)
        cc = ColumnContainer(df.columns).limit_to(correct_column_order)

        # and to rename them like the rel specifies
        row_type = rel.getRowType()
        field_specifications = [str(f) for f in row_type.getFieldNames()]
        cc = cc.rename({
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        })
        cc = self.fix_column_to_row_type(cc, row_type)
        dc = DataContainer(df, cc)

        # 7. Last but not least we apply any filters by and-chaining together the filters
        if filter_condition:
            # This line is a bit of code duplication with RexCallPlugin - but I guess it is worth keeping it separate
            filter_condition = reduce(
                operator.and_,
                [
                    RexConverter.convert(rex, dc, context=context)
                    for rex in filter_condition
                ],
            )
            logger.debug(f"Additionally applying filter {filter_condition}")
            df = filter_or_scalar(df, filter_condition)
            dc = DataContainer(df, cc)

        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
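The "merge on a = b, then filter with the remaining expressions" strategy described in step 3 above can be illustrated at the pandas level; a minimal sketch with made-up frames and columns:

import pandas as pd

lhs = pd.DataFrame({"a": [1, 2, 3], "x": [10, 20, 30]})
rhs = pd.DataFrame({"b": [1, 2, 3], "y": [1, 0, 1]})

# Case (b): merge on the equi-condition a = b ...
merged = lhs.merge(rhs, left_on="a", right_on="b")
# ... then apply the remaining predicates of the join condition as a filter.
result = merged[merged["y"] == 1]
print(result)  # keeps only the matched rows where y == 1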
Example #24
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # Joining is a bit more complicated, so let's do it in steps:

        # 1. We now have two inputs (from left and right), so we fetch them both
        dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
        cc_lhs = dc_lhs.column_container
        cc_rhs = dc_rhs.column_container

        # 2. dask's merge will do some smart things with columns which have the same name
        # on lhs and rhs (which also includes reordering).
        # However, that will confuse our column numbering in SQL.
        # So we make our life easier by converting the column names into unique names
        # We will convert back in the end
        cc_lhs_renamed = cc_lhs.make_unique("lhs")
        cc_rhs_renamed = cc_rhs.make_unique("rhs")

        dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
        dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

        df_lhs_renamed = dc_lhs_renamed.assign()
        df_rhs_renamed = dc_rhs_renamed.assign()

        join_type = rel.getJoinType()
        join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

        # 3. The join condition can have two forms that we can understand:
        # (a) a = b
        # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
        # The first case is very simple and we do not need any additional filter.
        # In the second case we do a merge on all the a = b,
        # and then apply a filter using the other expressions.
        # In all other cases, we need to do a full-table cross join and filter afterwards.
        # This is probably nonsense for large tables, but there is no other
        # known solution so far.
        join_condition = rel.getCondition()
        lhs_on, rhs_on, filter_condition = self._split_join_condition(
            join_condition)

        logger.debug(
            f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

        # lhs_on and rhs_on are the indices of the columns to merge on.
        # The given column indices are for the full, merged table which consists
        # of lhs and rhs put side-by-side (in this order)
        # We therefore need to normalize the rhs indices relative to the rhs table.
        rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

        # 4. dask can only merge on the same column names.
        # We therefore create new columns on purpose, which have a distinct name.
        assert len(lhs_on) == len(rhs_on)
        if lhs_on:
            lhs_columns_to_add = {
                f"common_{i}": df_lhs_renamed.iloc[:, index]
                for i, index in enumerate(lhs_on)
            }
            rhs_columns_to_add = {
                f"common_{i}": df_rhs_renamed.iloc[:, index]
                for i, index in enumerate(rhs_on)
            }
        else:
            # We are in the complex join case
            # where we have no column to merge on
            # This means we have no other chance than to merge
            # everything with everything...
            # We add a 1-column to merge on
            lhs_columns_to_add = {"common": 1}
            rhs_columns_to_add = {"common": 1}

            warnings.warn(
                "Need to do a cross-join, which is typically very resource heavy",
                ResourceWarning,
            )

        df_lhs_with_tmp = df_lhs_renamed.assign(**lhs_columns_to_add)
        df_rhs_with_tmp = df_rhs_renamed.assign(**rhs_columns_to_add)
        added_columns = list(lhs_columns_to_add.keys())

        # 5. Now we can finally merge on these columns
        # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
        # plus the added columns
        df = dd.merge(df_lhs_with_tmp,
                      df_rhs_with_tmp,
                      on=added_columns,
                      how=join_type)

        # 6. So the next step is to make sure
        # we have the correct column order (and to remove the temporary join columns)
        correct_column_order = list(df_lhs_renamed.columns) + list(
            df_rhs_renamed.columns)
        cc = ColumnContainer(df.columns).limit_to(correct_column_order)

        # and to rename them like the rel specifies
        row_type = rel.getRowType()
        field_specifications = [str(f) for f in row_type.getFieldNames()]
        cc = cc.rename({
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        })
        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df, cc)

        # 7. Last but not least we apply any filters by and-chaining together the filters
        if filter_condition:
            # This line is a bit of code duplication with RexCallPlugin - but I guess it is worth keeping it separate
            filter_condition = reduce(
                operator.and_,
                [
                    RexConverter.convert(rex, dc, context=context)
                    for rex in filter_condition
                ],
            )
            logger.debug(f"Additionally applying filter {filter_condition}")
            df = filter_or_scalar(df, filter_condition)
            dc = DataContainer(df, cc)

        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
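The cross-join fallback above (assigning a constant "common" column to both sides and merging on it) produces the full cross product; a minimal pandas sketch with made-up frames:

import pandas as pd

lhs = pd.DataFrame({"a": [1, 2]})
rhs = pd.DataFrame({"b": ["x", "y"]})

# Merging on a column that is constant on both sides yields every
# combination of lhs and rhs rows; the helper column is dropped afterwards.
cross = (
    lhs.assign(common=1)
    .merge(rhs.assign(common=1), on="common")
    .drop(columns="common")
)
print(cross)  # 4 rows: the cross product of lhs and rhs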
Example #25
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        select = sql.getSelect()
        schema_name, experiment_name = context.fqn(sql.getExperimentName())
        kwargs = convert_sql_kwargs(sql.getKwargs())

        if experiment_name in context.schema[schema_name].experiments:
            if sql.getIfNotExists():
                return
            elif not sql.getReplace():
                raise RuntimeError(
                    f"A experiment with the name {experiment_name} is already present."
                )

        logger.debug(
            f"Creating Experiment {experiment_name} from query {select} with options {kwargs}"
        )
        model_class = None
        automl_class = None
        experiment_class = None
        if "model_class" in kwargs:
            model_class = kwargs.pop("model_class")
            # when a model class is provided, an experiment_class must also be provided for tuning
            if "experiment_class" not in kwargs:
                raise ValueError(
                    f"Parameters must include a 'experiment_class' parameter for tuning {model_class}."
                )
            experiment_class = kwargs.pop("experiment_class")
        elif "automl_class" in kwargs:
            automl_class = kwargs.pop("automl_class")
        else:
            raise ValueError(
                "Parameters must include a 'model_class' or 'automl_class' parameter."
            )
        target_column = kwargs.pop("target_column", "")
        tune_fit_kwargs = kwargs.pop("tune_fit_kwargs", {})
        parameters = kwargs.pop("tune_parameters", {})
        experiment_kwargs = kwargs.pop("experiment_kwargs", {})
        automl_kwargs = kwargs.pop("automl_kwargs", {})
        logger.info(parameters)

        select_query = context._to_sql_string(select)
        training_df = context.sql(select_query)
        if not target_column:
            raise ValueError(
                "Unsupervised Algorithm cannot be tuned Automatically,"
                "Consider providing 'target column'")
        non_target_columns = [
            col for col in training_df.columns if col != target_column
        ]
        X = training_df[non_target_columns]
        y = training_df[target_column]

        if model_class and experiment_class:
            try:
                ModelClass = import_class(model_class)
            except ImportError:
                raise ValueError(
                    f"Can not import model {model_class}. Make sure you spelled it correctly and have installed all packages."
                )
            try:
                ExperimentClass = import_class(experiment_class)
            except ImportError:
                raise ValueError(
                    f"Can not import tuner {experiment_class}. Make sure you spelled it correctly and have installed all packages."
                )

            try:
                from dask_ml.wrappers import ParallelPostFit
            except ImportError:  # pragma: no cover
                raise ValueError(
                    "dask_ml must be installed to use automl and tune hyperparameters"
                )

            model = ModelClass()

            search = ExperimentClass(model, {**parameters},
                                     **experiment_kwargs)
            logger.info(tune_fit_kwargs)
            search.fit(X, y, **tune_fit_kwargs)
            df = pd.DataFrame(search.cv_results_)
            df["model_class"] = model_class

            context.register_model(
                experiment_name,
                ParallelPostFit(estimator=search.best_estimator_),
                X.columns,
                schema_name=schema_name,
            )

        if automl_class:

            try:
                AutoMLClass = import_class(automl_class)
            except ImportError:
                raise ValueError(
                    f"Can not import automl model {automl_class}. Make sure you spelled it correctly and have installed all packages."
                )

            try:
                from dask_ml.wrappers import ParallelPostFit
            except ImportError:  # pragma: no cover
                raise ValueError(
                    "dask_ml must be installed to use automl and tune hyperparameters"
                )

            automl = AutoMLClass(**automl_kwargs)
            # should be avoided if the data doesn't fit in memory
            automl.fit(X.compute(), y.compute())
            df = (pd.DataFrame(
                automl.evaluated_individuals_).T.reset_index().rename(
                    {"index": "models"}, axis=1))

            context.register_model(
                experiment_name,
                ParallelPostFit(estimator=automl.fitted_pipeline_),
                X.columns,
                schema_name=schema_name,
            )

        context.register_experiment(experiment_name,
                                    experiment_results=df,
                                    schema_name=schema_name)
        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc