Exemple #1
0
    def _get_ral(self, sql):
        """
        Parse ``sql`` and translate it into a relational algebra.

        Returns a 3-tuple ``(rel, select_names, rel_string)``:

        * ``rel`` is the optimized relational algebra, or the parsed SQL node
          itself when its Java class lives under ``com.dask.sql.parser.``
          (such nodes are neither validated nor optimized here).
        * ``select_names`` is the list of select expressions rendered as SQL
          strings if the (outer) statement is a SELECT, otherwise ``None``.
        * ``rel_string`` is the string form of the relational algebra
          (empty for ``com.dask.sql.parser.`` nodes).

        Raises a ``ParsingException`` if Java reports a validation or
        parse error.
        """
        # Build the generator on top of everything registered so far
        current_schema = self._prepare_schema()
        generator = RelationalAlgebraGenerator(current_schema)
        default_dialect = generator.getDialect()

        logger.debug(f"Using dialect: {get_java_class(default_dialect)}")

        try:
            sqlNode = generator.getSqlNode(sql)
            sqlNodeClass = get_java_class(sqlNode)

            if not sqlNodeClass.startswith("com.dask.sql.parser."):
                validated_node = generator.getValidatedNode(sqlNode)
                unoptimized_rel = generator.getRelationalAlgebra(
                    validated_node)
                rel = generator.getOptimizedRelationalAlgebra(unoptimized_rel)
                rel_string = str(generator.getRelationalAlgebraString(rel))
            else:
                rel, rel_string = sqlNode, ""
        except (ValidationException, SqlParseException) as java_error:
            logger.debug(f"Original exception raised by Java:\n {java_error}")
            # Deliberately drop the exception context ("from None"):
            # chaining would print the full java stack trace even
            # when debug output is disabled.
            raise ParsingException(sql, str(java_error.message())) from None

        # Calcite names intermediate results EXPR$N (N being a number),
        # which tells the user nothing. The select expressions are
        # therefore rendered back to SQL and used as column names.
        # This may break in edge cases where the outer SQL node is
        # not a select node - please raise an issue if you find one.
        if sqlNodeClass == "org.apache.calcite.sql.SqlOrderBy":
            # Look through the ORDER BY wrapper at the wrapped query
            sqlNode = sqlNode.query
            sqlNodeClass = get_java_class(sqlNode)

        if sqlNodeClass != "org.apache.calcite.sql.SqlSelect":
            logger.debug(
                "Not extracting output column names as the SQL is not a SELECT call"
            )
            select_names = None
        else:
            select_names = []
            for select_item in sqlNode.getSelectList():
                select_names.append(
                    self._to_sql_string(select_item,
                                        default_dialect=default_dialect))

        logger.debug(f"Extracted relational algebra:\n {rel_string}")
        return rel, select_names, rel_string
Exemple #2
0
    def convert(
        cls,
        rex: "org.apache.calcite.rex.RexNode",
        dc: DataContainer,
        context: "dask_sql.Context",
    ) -> Union[dd.DataFrame, Any]:
        """
        Convert the given rex (java instance)
        into a python expression (a dask dataframe
        or any other python object)
        using the stored plugins.

        The plugin is looked up via the java class name of ``rex``
        and its ``convert`` method is called with ``rex``, ``dc``
        and ``context``; whatever it returns is handed back unchanged.

        Raises a ``NotImplementedError`` if no plugin is registered
        for the java class of ``rex``.
        """
        class_name = get_java_class(rex)

        try:
            plugin_instance = cls.get_plugin(class_name)
        except KeyError as err:  # pragma: no cover
            # Chain explicitly, so the traceback shows the failed lookup as
            # the direct cause instead of an error "during handling" of it.
            raise NotImplementedError(
                f"No conversion for class {class_name} available (yet)."
            ) from err

        logger.debug(
            f"Processing REX {rex} using {plugin_instance.__class__.__name__}..."
        )

        df = plugin_instance.convert(rex, dc, context=context)
        logger.debug(f"Processed REX {rex} into {LoggableDataFrame(df)}")
        return df
Exemple #3
0
def check_special_operator(operator: "org.apache.calcite.sql.fun"):
    """
    Return the dedicated operation name for operator classes whose
    type/kind is shared with a different operator.

    eg: sqlDatetimeSubtractionOperator has the sqltype and kind of the `-` or `minus` operation,
    so it can only be told apart by its java class.

    Returns ``None`` if the java class of the operator is not one of the
    special cases listed below.
    """
    operator_class = get_java_class(operator)

    # java class name -> name of the operation to use instead
    overloaded_operators = {
        "org.apache.calcite.sql.fun.SqlDatetimeSubtractionOperator": "datetime_subtraction",
    }
    return overloaded_operators.get(operator_class)
Exemple #4
0
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        """
        Apply the named projects of ``rel`` to its single input.

        Projects that are plain input references are only re-mapped to the
        already existing backend column. Every other expression is converted
        via ``RexConverter`` and assigned to the dataframe as a new column.
        The resulting container is limited to the projected columns (in
        project order) and adjusted to the row type of ``rel``.
        """
        # There must be exactly one input from the previous step
        (dc, ) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        column_names = []
        new_columns = {}
        new_mappings = {}
        for expr, raw_name in rel.getNamedProjects():
            name = str(raw_name)
            column_names.append(name)

            if get_java_class(expr) != RexInputRefPlugin.class_name:
                # A real expression: calculate it on the input container
                # NOTE(review): the column is stored under its frontend name,
                # which presumably overwrites an existing backend column of
                # the same name on assign - confirm this is intended.
                new_columns[name] = RexConverter.convert(expr,
                                                         dc,
                                                         context=context)
                logger.debug(f"Adding a new column {name} out of {expr}")
                cc = cc.add(name, name)
                new_mappings[name] = name
                continue

            # Shortcut for a RexInputRef: the column exists already,
            # so pointing to its backend column is enough
            backend_column_name = cc.get_backend_by_frontend_index(
                expr.getIndex())
            logger.debug(
                f"Not re-adding the same column {name} (but just referencing it)"
            )
            new_mappings[name] = backend_column_name

        # Materialize the calculated columns in one go
        if new_columns:
            df = df.assign(**new_columns)

        # Register every frontend -> backend mapping
        for frontend_name, backend_name in new_mappings.items():
            cc = cc.add(frontend_name, backend_name)

        # Keep only the projected columns, in the order of the projects
        cc = cc.limit_to(column_names)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        return self.fix_dtype_to_row_type(DataContainer(df, cc),
                                          rel.getRowType())
Exemple #5
0
    def _get_ral(self, sql):
        """
        Helper function to turn the sql query into a relational algebra and resulting column names

        Returns a 2-tuple ``(rel, select_names)``:

        * For SQL nodes whose java class lives under ``com.dask.sql.parser.``,
          the parsed node itself is returned together with an empty list
          (no validation or optimization happens for them).
        * Otherwise ``rel`` is the optimized relational algebra and
          ``select_names`` is the list of select expressions rendered as
          SQL strings, or ``None`` if the statement is not a SELECT.
        """
        # get the schema of what we currently have registered
        schema = self._prepare_schema()

        # Now create a relational algebra from that
        generator = RelationalAlgebraGenerator(schema)

        sqlNode = generator.getSqlNode(sql)
        sqlNodeClass = get_java_class(sqlNode)

        if sqlNodeClass.startswith("com.dask.sql.parser."):
            return sqlNode, []

        validatedSqlNode = generator.getValidatedNode(sqlNode)
        nonOptimizedRelNode = generator.getRelationalAlgebra(validatedSqlNode)
        rel = generator.getOptimizedRelationalAlgebra(nonOptimizedRelNode)
        default_dialect = generator.getDialect()

        logger.debug(f"Using dialect: {get_java_class(default_dialect)}")

        # Internal, temporary results of calcite are sometimes
        # named EXPR$N (with N a number), which is not very helpful
        # to the user. We replace these cases therefore with
        # the actual query string. This logic probably fails in some
        # edge cases (if the outer SQLNode is not a select node),
        # but so far I did not find such a case.
        # So please raise an issue if you have found one!
        def toSqlString(s):
            """Render ``s`` in the default dialect, falling back to ``str(s)``."""
            try:
                return str(s.toSqlString(default_dialect))
            # Best-effort fallback, but do not use a bare "except":
            # it would also swallow KeyboardInterrupt and SystemExit.
            except Exception:  # pragma: no cover
                return str(s)

        if sqlNodeClass == "org.apache.calcite.sql.SqlSelect":
            select_names = [toSqlString(s) for s in sqlNode.getSelectList()]
        else:
            logger.debug(
                "Not extracting output column names as the SQL is not a SELECT call"
            )
            select_names = None

        logger.debug(
            f"Extracted relational algebra:\n {generator.getRelationalAlgebraString(rel)}"
        )

        return rel, select_names
Exemple #6
0
    def _get_ral(self, sql):
        """
        Parse ``sql`` and translate it into a relational algebra.

        Returns a 3-tuple ``(rel, select_names, rel_string)``:

        * ``rel`` is the optimized relational algebra, or the parsed SQL node
          itself when its java class lives under ``com.dask.sql.parser.``.
        * ``select_names`` are the output column names (``None`` for
          ``com.dask.sql.parser.`` nodes). For SELECT statements, names
          starting with ``EXPR$`` are replaced by the SQL string of the
          corresponding select expression.
        * ``rel_string`` is the string form of the relational algebra
          (empty for ``com.dask.sql.parser.`` nodes).

        Raises a ``ParsingException`` if Java reports a validation, parse
        or calcite context error.
        """
        # Everything we currently have registered
        registered_schemas = self._prepare_schemas()

        builder_class = (
            com.dask.sql.application.RelationalAlgebraGeneratorBuilder)

        # Whether identifiers in the SQL query are treated case sensitive
        case_sensitive = dask_config.get("sql.identifier.case_sensitive",
                                         default=True)

        builder = builder_class(self.schema_name, case_sensitive,
                                java.util.ArrayList())
        for registered_schema in registered_schemas:
            builder = builder.addSchema(registered_schema)
        generator = builder.build()
        default_dialect = generator.getDialect()

        logger.debug(f"Using dialect: {get_java_class(default_dialect)}")

        # Java exceptions which are translated into a ParsingException
        translated_errors = (
            org.apache.calcite.tools.ValidationException,
            org.apache.calcite.sql.parser.SqlParseException,
            org.apache.calcite.runtime.CalciteContextException,
        )

        try:
            sqlNode = generator.getSqlNode(sql)
            sqlNodeClass = get_java_class(sqlNode)

            if sqlNodeClass.startswith("com.dask.sql.parser."):
                # Handed back as it is: no names and no algebra string
                select_names = None
                rel = sqlNode
                rel_string = ""
            else:
                unoptimized_rel = generator.getRelationalAlgebra(sqlNode)
                # Read the names before optimizing, as the optimization
                # might remove some alias projects.
                field_names = unoptimized_rel.getRowType().getFieldNames()
                select_names = [str(field_name) for field_name in field_names]
                rel = generator.getOptimizedRelationalAlgebra(unoptimized_rel)
                rel_string = str(generator.getRelationalAlgebraString(rel))
        except translated_errors as java_error:
            logger.debug(f"Original exception raised by Java:\n {java_error}")
            # Deliberately drop the exception context ("from None"):
            # chaining would print the full java stack trace even
            # when debug output is disabled.
            raise ParsingException(sql, str(java_error.message())) from None

        # Calcite names intermediate results EXPR$N (N being a number),
        # which tells the user nothing. Those names are replaced by
        # the SQL string of the select expression.
        # This may break in edge cases where the outer SQL node is
        # not a select node - please raise an issue if you find one.
        if sqlNodeClass == "org.apache.calcite.sql.SqlOrderBy":
            # Look through the ORDER BY wrapper at the wrapped query
            sqlNode = sqlNode.query
            sqlNodeClass = get_java_class(sqlNode)

        if sqlNodeClass == "org.apache.calcite.sql.SqlSelect":
            readable_names = []
            for select_item, current_name in zip(sqlNode.getSelectList(),
                                                 select_names):
                if current_name.startswith("EXPR$"):
                    readable_names.append(
                        self._to_sql_string(select_item,
                                            default_dialect=default_dialect))
                else:
                    readable_names.append(current_name)
            select_names = readable_names
        else:
            logger.debug(
                "Not extracting output column names as the SQL is not a SELECT call"
            )

        logger.debug(f"Extracted relational algebra:\n {rel_string}")
        return rel, select_names, rel_string