Code example #1
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        components = list(map(str, sql.getTable().names))

        # some queries might also include the database
        # as we do not have such a concept, we just get rid of it
        components = components[-2:]
        tableName = components[-1]

        if len(components) == 2:
            if components[0] != context.schema_name:
                raise AttributeError(f"Schema {components[0]} is not defined.")

        try:
            dc = context.tables[tableName]
        except KeyError:  # pragma: no cover
            raise AttributeError(f"Table {tableName} is not defined.")

        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Code example #2
File: columns.py  Project: raybellwaves/dask-sql
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        components = list(map(str, sql.getTable().names))

        tableName = components[-1]

        if len(components) == 2:
            if components[0] != context.schema_name:
                raise AttributeError(f"Schema {components[0]} is not defined.")
        elif len(components) > 2:
            raise AttributeError(
                "Table specification must be in the form [schema.]table")

        dc = context.tables[tableName]
        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Code example #3
    def _add_parameters_from_description(function_description, dask_function):
        for parameter in function_description.parameters:
            param_name, param_type = parameter
            sql_param_type = python_to_sql_type(param_type)

            dask_function.addParameter(param_name, sql_param_type, False)

        return dask_function
Code example #4
    def _prepare_schema(self):
        """
        Create a schema filled with the dataframes
        and functions we have currently in our list
        """
        schema = DaskSchema(self.schema_name)

        if not self.tables:
            logger.warning("No tables are registered.")

        for name, dc in self.tables.items():
            table = DaskTable(name)
            df = dc.df
            logger.debug(
                f"Adding table '{name}' to schema with columns: {list(df.columns)}"
            )
            for column in df.columns:
                data_type = df[column].dtype
                sql_data_type = python_to_sql_type(data_type)

                table.addColumn(column, sql_data_type)

            schema.addTable(table)

        if not self.functions:
            logger.debug("No custom functions defined.")

        for function_description in self.function_list:
            name = function_description.name
            sql_return_type = python_to_sql_type(
                function_description.return_type)
            if function_description.aggregation:
                logger.debug(
                    f"Adding function '{name}' to schema as aggregation.")
                dask_function = DaskAggregateFunction(name, sql_return_type)
            else:
                logger.debug(
                    f"Adding function '{name}' to schema as scalar function.")
                dask_function = DaskScalarFunction(name, sql_return_type)

            dask_function = self._add_parameters_from_description(
                function_description, dask_function)

            schema.addFunction(dask_function)

        return schema
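
For context, _prepare_schema only exposes what has already been registered on the Context. A rough usage sketch, assuming dask-sql's public create_table/register_function API (argument order hedged, not verified against this exact version):

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask_sql import Context

c = Context()

# Register a table; _prepare_schema later translates every column dtype
# with python_to_sql_type when it builds the DaskTable.
c.create_table("my_table",
               dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1))

# Register a scalar function; its parameter and return types also go through
# python_to_sql_type (parameter list assumed to be (name, type) pairs).
c.register_function(lambda x: x + 1, "add_one", [("x", np.int64)], np.int64)

# Running a query triggers the schema preparation internally:
# c.sql("SELECT add_one(a) FROM my_table")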
Code example #5
File: responses.py  Project: raybellwaves/dask-sql
def get_column_description(df):
    sql_types = [str(python_to_sql_type(t)) for t in df.dtypes]
    column_names = df.columns
    return [{
        "name": column_name,
        "type": sql_type.lower(),
        "typeSignature": {
            "rawType": sql_type.lower(),
            "arguments": []
        },
    } for column_name, sql_type in zip(column_names, sql_types)]
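
A rough usage sketch of the helper above (get_column_description and python_to_sql_type are assumed to be in scope, e.g. inside dask-sql's responses module; the int32 -> INTEGER mapping comes from the tests further below):

import numpy as np
import pandas as pd

# Hypothetical frame with a single int32 column.
df = pd.DataFrame({"a": np.array([1, 2, 3], dtype="int32")})
print(get_column_description(df))
# With str(python_to_sql_type(np.dtype("int32"))) == "INTEGER" lower-cased:
# [{'name': 'a', 'type': 'integer',
#   'typeSignature': {'rawType': 'integer', 'arguments': []}}]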
Code example #6
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        schema_name, name = context.fqn(sql.getTable())
        dc = context.schema[schema_name].tables[name]

        cols = dc.column_container.columns
        dtypes = list(
            map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame({
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        })

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Code example #7
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        components = list(map(str, sql.getTable().names))
        dc = get_table_from_compound_identifier(context, components)

        cols = dc.column_container.columns
        dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
        df = pd.DataFrame(
            {
                "Column": cols,
                "Type": dtypes,
                "Extra": [""] * len(cols),
                "Comment": [""] * len(cols),
            }
        )

        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
Code example #8
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        components = list(map(str, sql.getTableName().names))
        dc = get_table_from_compound_identifier(context, components)
        columns = list(map(str, sql.getColumnList()))

        if not columns:
            columns = dc.column_container.columns

        # Define some useful shortcuts
        mapping = dc.column_container.get_backend_by_frontend_name
        df = dc.df

        # Calculate statistics
        statistics = dd.from_pandas(
            pd.DataFrame({col: [] for col in columns}), npartitions=1
        )
        statistics = statistics.append(df[[mapping(col) for col in columns]].describe())

        # Add additional information
        statistics = statistics.append(
            pd.Series(
                {
                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                    for col in columns
                },
                name="data_type",
            )
        )
        statistics = statistics.append(
            pd.Series({col: col for col in columns}, name="col_name",)
        )

        cc = ColumnContainer(statistics.columns)
        dc = DataContainer(statistics, cc)
        return dc
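
A minimal pandas-only sketch of the frame this conversion produces: describe() statistics stacked with the extra data_type and col_name rows (column name and the float64 -> double mapping are hypothetical; pd.concat stands in here for the distributed append used above):

import pandas as pd

# Hypothetical single analyzed column "a" of dtype float64.
df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})

statistics = df.describe()  # rows: count, mean, std, min, 25%, 50%, 75%, max
statistics = pd.concat([
    statistics,
    pd.DataFrame({"a": ["double"]}, index=["data_type"]),  # assumed DOUBLE for float64
    pd.DataFrame({"a": ["a"]}, index=["col_name"]),
])
print(statistics)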
Code example #9
def test_python_to_sql():
    assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER"
    assert str(python_to_sql_type(np.dtype(">M8[ns]"))) == "TIMESTAMP"
Code example #10
def test_python_to_sql_to_python():
    assert (type(
        sql_to_python_value(str(python_to_sql_type(np.dtype("int64"))),
                            54)) == np.int64)
Code example #11
File: test_mapping.py  Project: th3architect/dask-sql
def test_python_to_sql():
    assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER"
    assert str(python_to_sql_type(np.dtype(">M8[ns]"))) == "TIMESTAMP"
    assert (str(python_to_sql_type(pd.DatetimeTZDtype(
        unit="ns", tz="UTC"))) == "TIMESTAMP_WITH_LOCAL_TIME_ZONE")
Code example #12
    def _prepare_schemas(self):
        """
        Create a list of schemas filled with the dataframes
        and functions we have currently in our schema list
        """
        schema_list = []

        DaskTable = com.dask.sql.schema.DaskTable
        DaskAggregateFunction = com.dask.sql.schema.DaskAggregateFunction
        DaskScalarFunction = com.dask.sql.schema.DaskScalarFunction
        DaskSchema = com.dask.sql.schema.DaskSchema

        for schema_name, schema in self.schema.items():
            java_schema = DaskSchema(schema_name)

            if not schema.tables:
                logger.warning("No tables are registered.")

            for name, dc in schema.tables.items():
                row_count = (schema.statistics[name].row_count
                             if name in schema.statistics else None)
                if row_count is not None:
                    row_count = float(row_count)
                table = DaskTable(name, row_count)
                df = dc.df
                logger.debug(
                    f"Adding table '{name}' to schema with columns: {list(df.columns)}"
                )
                for column in df.columns:
                    data_type = df[column].dtype
                    sql_data_type = python_to_sql_type(data_type)

                    table.addColumn(column, sql_data_type)

                java_schema.addTable(table)

            if not schema.functions:
                logger.debug("No custom functions defined.")
            for function_description in schema.function_lists:
                name = function_description.name
                sql_return_type = python_to_sql_type(
                    function_description.return_type)
                if function_description.aggregation:
                    logger.debug(
                        f"Adding function '{name}' to schema as aggregation.")
                    dask_function = DaskAggregateFunction(
                        name, sql_return_type)
                else:
                    logger.debug(
                        f"Adding function '{name}' to schema as scalar function."
                    )
                    dask_function = DaskScalarFunction(name, sql_return_type)

                dask_function = self._add_parameters_from_description(
                    function_description, dask_function)

                java_schema.addFunction(dask_function)

            schema_list.append(java_schema)

        return schema_list