def sql(
    self, sql: str, return_futures: bool = True
) -> Union[dd.DataFrame, pd.DataFrame]:
    """
    Query the registered tables with the given SQL.
    The SQL follows approximately the PostgreSQL standard - however, not all
    operations are implemented yet.
    In general, only select statements (no data manipulation) work.

    For more information, see :ref:`sql`.

    Example:
        In this example, a query is called
        using the registered tables and then
        executed using dask.

        .. code-block:: python

            result = c.sql("SELECT a, b FROM my_table")
            print(result.compute())

    Args:
        sql (:obj:`str`): The query string to execute
        return_futures (:obj:`bool`): Return the unexecuted dask dataframe or the data itself.
            Defaults to returning the dask dataframe.

    Returns:
        :obj:`dask.dataframe.DataFrame`: the created data frame of this query.

    """
    rel, select_names, _ = self._get_ral(sql)
    dc = RelConverter.convert(rel, context=self)

    if dc is None:
        return

    if select_names:
        # Rename any columns named EXPR$* to a more human readable name
        cc = dc.column_container
        cc = cc.rename(
            {
                df_col: df_col if not df_col.startswith("EXPR$") else select_name
                for df_col, select_name in zip(cc.columns, select_names)
            }
        )
        dc = DataContainer(dc.df, cc)

    df = dc.assign()
    if not return_futures:
        df = df.compute()

    return df
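# A minimal usage sketch (not part of the library code) for the ``return_futures``
# flag documented above: with the default a lazy dask dataframe is returned, with
# ``return_futures=False`` the query is computed and a pandas dataframe comes back.
# The table and column names are made up; ``register_dask_table`` is the
# registration method shown further below.
import dask.dataframe as dd
import pandas as pd
from dask_sql import Context

c = Context()
c.register_dask_table(
    dd.from_pandas(pd.DataFrame({"a": [1, 2], "b": [3, 4]}), npartitions=1),
    "my_table",
)

lazy_df = c.sql("SELECT a, b FROM my_table")  # dask.dataframe.DataFrame
eager_df = c.sql("SELECT a, b FROM my_table", return_futures=False)  # pandas.DataFrame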
def sql(self, sql: str) -> dd.DataFrame:
    """
    Query the registered tables with the given SQL.
    The SQL follows approximately the PostgreSQL standard - however, not all
    operations are implemented yet.
    In general, only select statements (no data manipulation) work.

    For more information, see :ref:`sql`.

    Example:
        In this example, a query is called
        using the registered tables and then
        executed using dask.

        .. code-block:: python

            result = c.sql("SELECT a, b FROM my_table")
            print(result.compute())

    Args:
        sql (:obj:`str`): The query string to execute

    Returns:
        :obj:`dask.dataframe.DataFrame`: the created data frame of this query.

    """
    try:
        rel, select_names = self._get_ral(sql)
        dc = RelConverter.convert(rel, context=self)
    except (ValidationException, SqlParseException) as e:
        logger.debug(f"Original exception raised by Java:\n {e}")
        # We do not want to re-raise an exception here
        # as this would print the full java stack trace
        # if debug is not set.
        # Instead, we raise a nice exception
        raise ParsingException(sql, str(e.message())) from None

    if dc is not None:
        if select_names:
            # Rename any columns named EXPR$* to a more human readable name
            cc = dc.column_container
            cc = cc.rename(
                {
                    df_col: df_col
                    if not df_col.startswith("EXPR$")
                    else select_name
                    for df_col, select_name in zip(cc.columns, select_names)
                }
            )
            dc = DataContainer(dc.df, cc)

        return dc.assign()
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)

    df = dc.df
    cc = dc.column_container

    # We make our life easier by having unique column names
    cc = cc.make_unique()

    # I have no idea what that is, but so far it was always of length 1
    assert len(rel.getGroupSets()) == 1, "Do not know how to handle this case!"

    # Extract which columns we need to group by
    group_column_indices = [int(i) for i in rel.getGroupSet()]
    group_columns = [
        cc.get_backend_by_frontend_index(i) for i in group_column_indices
    ]

    dc = DataContainer(df, cc)

    if not group_columns:
        # There was actually no GROUP BY specified in the SQL.
        # Still, this plan can also be used if we need to aggregate something over the full
        # data sample.
        # To reuse the code, we just create a new column at the end with a single value.
        logger.debug("Performing full-table aggregation")

    # Do all aggregates
    df_result, output_column_order = self._do_aggregations(
        rel,
        dc,
        group_columns,
        context,
    )

    # SQL does not care about the index, but we do not want to have any multiindices
    df_agg = df_result.reset_index(drop=True)

    # Fix the column names and their order, as this was messed with during the aggregations
    df_agg.columns = df_agg.columns.get_level_values(-1)
    cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df_agg, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    # There should not be any input. This is the first step.
    self.assert_inputs(rel, 0)

    rex_expression_rows = list(rel.getTuples())
    rows = []
    for rex_expression_row in rex_expression_rows:
        # We convert each of the cells in the row
        # using a RexConverter.
        # As we do not have any information on the
        # column headers, we just name them with
        # their index.
        rows.append(
            {
                str(i): RexConverter.convert(rex_cell, None, context=context)
                for i, rex_cell in enumerate(rex_expression_row)
            }
        )

    # TODO: we explicitly reference pandas and dask here -> might be worth making this more general
    # We assume here that when using the values plan, the resulting dataframe will be quite small
    if rows:
        df = pd.DataFrame(rows)
    else:
        field_names = [str(x) for x in rel.getRowType().getFieldNames()]
        df = pd.DataFrame(columns=field_names)

    df = dd.from_pandas(df, npartitions=1)
    cc = ColumnContainer(df.columns)
    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)

    # During optimization, some constants might end up in an internal
    # constant pool. We need to dereference them here, as they
    # are treated as "normal" columns.
    # Unfortunately they are only referenced by their index
    # (which comes after the real columns), so we need
    # to always subtract the number of real columns.
    constants = list(rel.getConstants())
    constant_count_offset = len(dc.column_container.columns)

    # Output to the right field names right away
    field_names = rel.getRowType().getFieldNames()

    for window in rel.getGroups():
        dc = self._apply_window(
            window, constants, constant_count_offset, dc, field_names, context
        )

    # Finally, fix the output schema if needed
    df = dc.df
    cc = dc.column_container

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def fix_dtype_to_row_type(
    dc: DataContainer, row_type: "org.apache.calcite.rel.type.RelDataType"
):
    """
    Fix the dtype of the given data container (or: the df within it)
    to the data type given as argument.
    To prevent unneeded conversions, only convert if really needed,
    e.g. if the two types are "similar" enough, do not convert.
    Similarity involves the same general type (int, float, string etc.)
    but not necessarily the size (int64 and int32 are compatible)
    or the nullability.
    TODO: we should check the nullability of the SQL type
    """
    df = dc.df
    cc = dc.column_container

    field_types = {
        int(field.getIndex()): str(field.getType())
        for field in row_type.getFieldList()
    }

    for index, field_type in field_types.items():
        expected_type = sql_to_python_type(field_type)
        field_name = cc.get_backend_by_frontend_index(index)

        df = cast_column_type(df, field_name, expected_type)

    return DataContainer(df, dc.column_container)
def to_dc(
    input_item: InputType,
    file_format: str = None,
    persist: bool = True,
    hive_table_name: str = None,
    hive_schema_name: str = "default",
    **kwargs,
) -> DataContainer:
    """
    Turn possible input descriptions or formats (e.g. dask dataframes, pandas dataframes,
    locations as string, hive tables) into loaded data containers,
    optionally persisting them to cluster memory first.
    """
    filled_get_dask_dataframe = lambda *args: _get_dask_dataframe(
        *args,
        file_format=file_format,
        persist=persist,
        hive_table_name=hive_table_name,
        hive_schema_name=hive_schema_name,
        **kwargs,
    )

    if isinstance(input_item, list):
        table = dd.concat([filled_get_dask_dataframe(item) for item in input_item])
    else:
        table = filled_get_dask_dataframe(input_item)

    if persist:
        table = table.persist()

    return DataContainer(table.copy(), ColumnContainer(table.columns))
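# An illustrative sketch of how ``to_dc`` might be called, based only on its
# signature and docstring above: both a ready pandas/dask dataframe and a file
# location string are accepted. The path "data/*.csv" is made up and assumed to
# exist; the sketch is not part of the library itself.
import pandas as pd

dc_from_frame = to_dc(pd.DataFrame({"a": [1, 2, 3]}), persist=False)
dc_from_files = to_dc("data/*.csv", file_format="csv", persist=False)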
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    components = list(map(str, sql.getTable().names))
    tableName = components[-1]

    if len(components) == 2:
        if components[0] != context.schema_name:
            raise AttributeError(f"Schema {components[0]} is not defined.")
    elif len(components) > 2:
        raise AttributeError(
            "Table specification must be in the form [schema.]table"
        )

    dc = context.tables[tableName]
    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    components = list(map(str, sql.getTable().names))

    # Some queries might also include the database.
    # As we do not have such a concept, we just get rid of it.
    components = components[-2:]
    tableName = components[-1]

    if len(components) == 2:
        if components[0] != context.schema_name:
            raise AttributeError(f"Schema {components[0]} is not defined.")

    try:
        dc = context.tables[tableName]
    except KeyError:  # pragma: no cover
        raise AttributeError(f"Table {tableName} is not defined.")

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
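# A usage sketch for the statement handled above, assuming a Context ``c`` with a
# registered table "my_table" (e.g. the one built in the earlier sketch): the
# result is a small dataframe with the columns Column, Type, Extra and Comment,
# one row per table column. The exact SQL types shown are an assumption.
info = c.sql("SHOW COLUMNS FROM my_table")
print(info.compute())
#   Column    Type Extra Comment
# 0      a  bigint
# 1      b  bigint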
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    # There should not be any input. This is the first step.
    self.assert_inputs(rel, 0)

    # The table(s) we need to return
    table = rel.getTable()

    # The table names are all names split by ".".
    # We assume to always have the form something.something,
    # and the first something is fixed to "schema" by the context.
    # For us, it makes no difference anyway.
    table_names = [str(n) for n in table.getQualifiedName()]
    assert table_names[0] == context.schema_name
    assert len(table_names) == 2
    table_name = table_names[1]
    table_name = table_name.lower()

    dc = context.tables[table_name]
    df = dc.df
    cc = dc.column_container

    # Make sure we only return the requested columns
    row_type = table.getRowType()
    field_specifications = [str(f) for f in row_type.getFieldNames()]
    cc = cc.limit_to(field_specifications)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    df = pd.DataFrame({"Schema": [context.schema_name]})

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    sort_collation = rel.getCollation().getFieldCollations()
    sort_columns = [
        cc.get_backend_by_frontend_index(int(x.getFieldIndex()))
        for x in sort_collation
    ]
    sort_ascending = [str(x.getDirection()) == "ASCENDING" for x in sort_collation]

    offset = rel.offset
    if offset:
        offset = RexConverter.convert(offset, df, context=context)

    end = rel.fetch
    if end:
        end = RexConverter.convert(end, df, context=context)

        if offset:
            end += offset

    if sort_columns:
        df = self._apply_sort(df, sort_columns, sort_ascending)

    if offset is not None or end is not None:
        df = self._apply_offset(df, offset, end)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to cast again
    return DataContainer(df, cc)
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    sort_collation = rel.getCollation().getFieldCollations()
    sort_columns = [
        cc.get_backend_by_frontend_index(int(x.getFieldIndex()))
        for x in sort_collation
    ]

    ASCENDING = org.apache.calcite.rel.RelFieldCollation.Direction.ASCENDING
    FIRST = org.apache.calcite.rel.RelFieldCollation.NullDirection.FIRST
    sort_ascending = [x.getDirection() == ASCENDING for x in sort_collation]
    sort_null_first = [x.nullDirection == FIRST for x in sort_collation]

    df = df.persist()
    df = apply_sort(df, sort_columns, sort_ascending, sort_null_first)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to cast again
    return DataContainer(df, cc)
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    parameters = rel.getSamplingParameters()
    is_bernoulli = parameters.isBernoulli()
    fraction = float(parameters.getSamplingPercentage())
    seed = parameters.getRepeatableSeed() if parameters.isRepeatable() else None

    if is_bernoulli:
        df = df.sample(frac=fraction, replace=False, random_state=seed)
    else:
        random_state = np.random.RandomState(seed)
        random_choice = random_state.choice(
            [True, False],
            size=df.npartitions,
            replace=True,
            p=[fraction, 1 - fraction],
        )

        if random_choice.any():
            df = df.partitions[random_choice]
        else:
            df = df.head(0, compute=False)

    return DataContainer(df, cc)
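# A self-contained sketch (not the plugin itself) contrasting the two sampling
# modes implemented above on a plain dask dataframe: BERNOULLI samples individual
# rows, while SYSTEM keeps or drops whole partitions. The fraction and seed are
# arbitrary example values.
import numpy as np
import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(pd.DataFrame({"a": range(100)}), npartitions=10)
fraction, seed = 0.3, 42

# BERNOULLI: row-level sampling
bernoulli = df.sample(frac=fraction, replace=False, random_state=seed)

# SYSTEM: partition-level sampling
random_state = np.random.RandomState(seed)
keep = random_state.choice(
    [True, False], size=df.npartitions, p=[fraction, 1 - fraction]
)
system = df.partitions[keep] if keep.any() else df.head(0, compute=False)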
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    schema = str(sql.getSchema()).split(".")[-1]

    if schema != context.schema_name:
        raise AttributeError(f"Schema {schema} is not defined.")

    df = pd.DataFrame({"Table": list(context.tables.keys())})

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    # Get the input of the previous step
    (dc,) = self.assert_inputs(rel, 1, context)

    df = dc.df
    cc = dc.column_container

    # Collect all (new) columns
    named_projects = rel.getNamedProjects()

    column_names = []
    new_columns = {}
    new_mappings = {}
    for expr, key in named_projects:
        key = str(key)
        column_names.append(key)

        # Shortcut: if we have the column already, there is no need to re-assign it;
        # this is only the case if the expr is a RexInputRef
        if isinstance(expr, org.apache.calcite.rex.RexInputRef):
            index = expr.getIndex()
            backend_column_name = cc.get_backend_by_frontend_index(index)
            logger.debug(
                f"Not re-adding the same column {key} (but just referencing it)"
            )
            new_mappings[key] = backend_column_name
        else:
            random_name = new_temporary_column(df)
            new_columns[random_name] = RexConverter.convert(
                expr, dc, context=context
            )
            logger.debug(f"Adding a new column {key} out of {expr}")
            new_mappings[key] = random_name

    # Actually add the new columns
    if new_columns:
        df = df.assign(**new_columns)

    # and the new mappings
    for key, backend_column_name in new_mappings.items():
        cc = cc.add(key, backend_column_name)

    # Make sure the order is correct
    cc = cc.limit_to(column_names)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    # "information_schema" is a schema which is found in every presto database
    schema = context.schema_name
    df = pd.DataFrame({"Schema": [schema, "information_schema"]})

    # We currently do not use the passed additional parameter FROM.
    like = str(sql.like).strip("'")
    if like and like != "None":
        df = df[df.Schema == like]

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    first_dc, second_dc = self.assert_inputs(rel, 2, context)

    first_df = first_dc.df
    first_cc = first_dc.column_container

    second_df = second_dc.df
    second_cc = second_dc.column_container

    # For concatenating, they should have exactly the same fields
    output_field_names = [str(x) for x in rel.getRowType().getFieldNames()]

    assert len(first_cc.columns) == len(output_field_names)
    first_cc = first_cc.rename(
        columns={
            col: output_col
            for col, output_col in zip(first_cc.columns, output_field_names)
        }
    )
    first_dc = DataContainer(first_df, first_cc)

    assert len(second_cc.columns) == len(output_field_names)
    second_cc = second_cc.rename(
        columns={
            col: output_col
            for col, output_col in zip(second_cc.columns, output_field_names)
        }
    )
    second_dc = DataContainer(second_df, second_cc)

    # To concat the two dataframes, we need to make sure the
    # columns actually have the specified names in the
    # column containers.
    # Otherwise the concat won't work.
    first_df = first_dc.assign()
    second_df = second_dc.assign()

    self.check_columns_from_row_type(first_df, rel.getExpectedInputRowType(0))
    self.check_columns_from_row_type(second_df, rel.getExpectedInputRowType(1))

    df = dd.concat([first_df, second_df])

    if not rel.all:
        df = df.drop_duplicates()

    cc = ColumnContainer(df.columns)
    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    # All the logic is handled in the RexConverter;
    # we just need to apply it here
    condition = rel.getCondition()
    df_condition = RexConverter.convert(condition, dc, context=context)
    df = filter_or_scalar(df, df_condition)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to convert again
    return DataContainer(df, cc)
def fix_dtype_to_row_type(
    dc: DataContainer, row_type: "org.apache.calcite.rel.type.RelDataType"
):
    """
    Fix the dtype of the given data container (or: the df within it)
    to the data type given as argument.
    To prevent unneeded conversions, only convert if really needed,
    e.g. if the two types are "similar" enough, do not convert.
    Similarity involves the same general type (int, float, string etc.)
    but not necessarily the size (int64 and int32 are compatible)
    or the nullability.
    TODO: we should check the nullability of the SQL type
    """
    df = dc.df
    cc = dc.column_container

    field_types = {
        int(field.getIndex()): str(field.getType())
        for field in row_type.getFieldList()
    }

    for index, field_type in field_types.items():
        expected_type = sql_to_python_type(field_type)
        field_name = cc.get_backend_by_frontend_index(index)
        current_type = df[field_name].dtype

        logger.debug(
            f"Column {field_name} has type {current_type}, expecting {expected_type}..."
        )

        if similar_type(current_type, expected_type):
            logger.debug("...not converting.")
            continue

        current_float = pd.api.types.is_float_dtype(current_type)
        expected_integer = pd.api.types.is_integer_dtype(expected_type)
        if current_float and expected_integer:
            logger.debug("...truncating...")
            df[field_name] = da.trunc(df[field_name])

        logger.debug(
            f"Need to cast {field_name} from {current_type} to {expected_type}"
        )
        df[field_name] = df[field_name].astype(expected_type)

    return DataContainer(df, dc.column_container)
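# A minimal sketch (not part of the library) of the float -> int path above: when
# the SQL type is an integer but the dask column is a float, the values are
# truncated towards zero before casting, mirroring SQL semantics. This assumes
# that ``da.trunc`` dispatches on dask Series as it does in the code above.
import dask.array as da
import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(pd.Series([1.7, -2.3, 3.0]), npartitions=1)
truncated = da.trunc(s).astype("int64")  # -> 1, -2, 3
print(truncated.compute())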
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    schema = sql.getSchema()
    if schema is not None:
        schema = str(schema).split(".")[-1]
    else:
        schema = context.DEFAULT_SCHEMA_NAME

    if schema not in context.schema:
        raise AttributeError(f"Schema {schema} is not defined.")

    df = pd.DataFrame({"Table": list(context.schema[schema].tables.keys())})

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
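# A usage sketch for the SHOW TABLES handling above, assuming a Context ``c`` with
# some tables registered: when no FROM clause is given, the code falls back to the
# default schema. The result is a one-column dataframe named "Table".
tables = c.sql("SHOW TABLES")
print(tables.compute())  # one row per registered table in the default schema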
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    schema_name, name = context.fqn(sql.getTable())
    dc = context.schema[schema_name].tables[name]

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    select = sql.getSelect()
    distribute_list = [str(col) for col in sql.getDistributeList()]

    sql_select_query = context._to_sql_string(select)
    df = context.sql(sql_select_query)

    logger.debug(f"Extracted sub-dataframe as {LoggableDataFrame(df)}")
    logger.debug(f"Will now shuffle according to {distribute_list}")

    # Perform the distribute by operation via a Dask shuffle
    df = df.shuffle(distribute_list)

    cc = ColumnContainer(df.columns)
    dc = DataContainer(df, cc)
    return dc
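# A rough sketch of what the DISTRIBUTE BY handling above boils down to, assuming
# a Context ``c`` with a registered table "my_table" and a column "a": the inner
# SELECT is executed and the resulting dask dataframe is shuffled so that equal
# values of the distribute column land in the same partition.
distributed = c.sql("SELECT * FROM my_table DISTRIBUTE BY a")

# ... which is roughly equivalent to shuffling the plain query result by hand:
manually_shuffled = c.sql("SELECT * FROM my_table").shuffle(["a"])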
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    components = list(map(str, sql.getTable().names))
    dc = get_table_from_compound_identifier(context, components)

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    schema_name, model_name = context.fqn(sql.getModelName().getIdentifier())
    if model_name not in context.schema[schema_name].models:
        raise RuntimeError(f"A model with the name {model_name} is not present.")

    model, training_columns = context.schema[schema_name].models[model_name]

    model_params = model.get_params()
    model_params["training_columns"] = training_columns.tolist()

    df = pd.DataFrame.from_dict(model_params, orient="index", columns=["Params"])

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    offset = rel.getOffset()
    if offset:
        offset = RexConverter.convert(offset, df, context=context)

    end = rel.getFetch()
    if end:
        end = RexConverter.convert(end, df, context=context)

        if offset:
            end += offset

    df = self._apply_limit(df, offset, end)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to cast again
    return DataContainer(df, cc)
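# A usage sketch of how OFFSET/FETCH map onto the code above, assuming a Context
# ``c`` with a table "my_table": for LIMIT 10 OFFSET 5 the plan carries
# offset = 5 and fetch = 10, and ``end`` becomes fetch + offset = 15, i.e. the
# rows [5, 15) of the ordered result are kept.
limited = c.sql("SELECT * FROM my_table ORDER BY a LIMIT 10 OFFSET 5")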
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    components = list(map(str, sql.getTableName().names))
    dc = get_table_from_compound_identifier(context, components)
    columns = list(map(str, sql.getColumnList()))

    if not columns:
        columns = dc.column_container.columns

    # Define some useful shortcuts
    mapping = dc.column_container.get_backend_by_frontend_name
    df = dc.df

    # Calculate statistics
    statistics = dd.from_pandas(
        pd.DataFrame({col: [] for col in columns}), npartitions=1
    )
    statistics = statistics.append(df[[mapping(col) for col in columns]].describe())

    # Add additional information
    statistics = statistics.append(
        pd.Series(
            {
                col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                for col in columns
            },
            name="data_type",
        )
    )
    statistics = statistics.append(
        pd.Series({col: col for col in columns}, name="col_name")
    )

    cc = ColumnContainer(statistics.columns)
    dc = DataContainer(statistics, cc)
    return dc
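# A usage sketch for the ANALYZE TABLE handling above; the exact SQL syntax is an
# assumption about what the parser accepts, and a Context ``c`` with a table
# "my_table" and columns "a", "b" is required. The result holds the
# ``describe()`` statistics plus the appended "data_type" and "col_name" rows.
stats = c.sql("ANALYZE TABLE my_table COMPUTE STATISTICS FOR COLUMNS a, b")
print(stats.compute())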
def register_dask_table(self, df: dd.DataFrame, name: str):
    """
    Registering a dask table makes it usable in SQL queries.
    The name you give here can be used as table name in the SQL later.

    Please note that the table is stored as it is now.
    If you change the table later, you need to re-register.

    Example:
        This code registers a data frame as table "data"
        and then uses it in a query.

        .. code-block:: python

            c.register_dask_table(df, "data")
            df_result = c.sql("SELECT a, b FROM data")

    Args:
        df (:class:`dask.dataframe.DataFrame`): The data frame to register
        name: (:obj:`str`): Under which name should the new table be addressable

    """
    self.tables[name.lower()] = DataContainer(df.copy(), ColumnContainer(df.columns))
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)

    df = dc.df
    cc = dc.column_container

    # We make our life easier by having unique column names
    cc = cc.make_unique()

    # I have no idea what that is, but so far it was always of length 1
    assert len(rel.getGroupSets()) == 1, "Do not know how to handle this case!"

    # Extract which columns we need to group by
    group_column_indices = [int(i) for i in rel.getGroupSet()]
    group_columns = [
        cc.get_backend_by_frontend_index(i) for i in group_column_indices
    ]

    # Always keep an additional column around for empty groups and aggregates
    additional_column_name = str(uuid.uuid4())

    # NOTE: it might be the case that
    # we do not need this additional
    # column, but hopefully adding a single
    # column of 1 is not so problematic...
    df = df.assign(**{additional_column_name: 1})
    cc = cc.add(additional_column_name)
    dc = DataContainer(df, cc)

    # Collect all aggregates
    filtered_aggregations, output_column_order = self._collect_aggregations(
        rel, dc, group_columns, additional_column_name, context
    )

    if not group_columns:
        # There was actually no GROUP BY specified in the SQL.
        # Still, this plan can also be used if we need to aggregate something over the full
        # data sample.
        # To reuse the code, we just group by the additional column with a single value.
        # It is important to do this after creating the aggregations,
        # as we do not want this additional column to be used anywhere.
        group_columns = [additional_column_name]

        logger.debug("Performing full-table aggregation")

    # Now we can perform the aggregates.
    # We iterate through all pairs of (possibly pre-filtered)
    # dataframes and the aggregations to perform on this data...
    df_agg = None
    for filtered_df_desc, aggregation in filtered_aggregations.items():
        filtered_column = filtered_df_desc.filtered_column
        if filtered_column:
            logger.debug(
                f"Aggregating {dict(aggregation)} on the data filtered by {filtered_column}"
            )
        else:
            logger.debug(f"Aggregating {dict(aggregation)} on the data")

        # ... we perform the aggregations ...
        filtered_df = filtered_df_desc.df
        # TODO: we could use the type information for
        # pre-calculating the meta information
        filtered_df_agg = filtered_df.groupby(by=group_columns).agg(aggregation)

        # ... fix the column names to a single level ...
        filtered_df_agg.columns = filtered_df_agg.columns.get_level_values(-1)

        # ... and finally concat the new data with the already present columns
        if df_agg is None:
            df_agg = filtered_df_agg
        else:
            df_agg = df_agg.assign(
                **{col: filtered_df_agg[col] for col in filtered_df_agg.columns}
            )

    # SQL does not care about the index, but we do not want to have any multiindices
    df_agg = df_agg.reset_index(drop=True)

    # Fix the column names and their order, as this was messed with during the aggregations
    df_agg.columns = df_agg.columns.get_level_values(-1)
    cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df_agg, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    # Joining is a bit more complicated, so lets do it in steps:

    # 1. We now have two inputs (from left and right), so we fetch them both
    dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
    cc_lhs = dc_lhs.column_container
    cc_rhs = dc_rhs.column_container

    # 2. dask's merge will do some smart things with columns, which have the same name
    # on lhs and rhs (which also includes reordering).
    # However, that will confuse our column numbering in SQL.
    # So we make our life easier by converting the column names into unique names.
    # We will convert back in the end.
    cc_lhs_renamed = cc_lhs.make_unique("lhs")
    cc_rhs_renamed = cc_rhs.make_unique("rhs")

    dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
    dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

    df_lhs_renamed = dc_lhs_renamed.assign()
    df_rhs_renamed = dc_rhs_renamed.assign()

    join_type = rel.getJoinType()
    join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

    # 3. The join condition can have two forms that we can understand:
    # (a) a = b
    # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
    # The first case is very simple and we do not need any additional filter.
    # In the second case we do a merge on all the a = b,
    # and then apply a filter using the other expressions.
    # In all other cases, we need to do a full table cross join and filter afterwards.
    # This is probably nonsense for large tables, but there is no other
    # known solution so far.
    join_condition = rel.getCondition()
    lhs_on, rhs_on, filter_condition = self._split_join_condition(join_condition)

    logger.debug(f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

    # lhs_on and rhs_on are the indices of the columns to merge on.
    # The given column indices are for the full, merged table which consists
    # of lhs and rhs put side-by-side (in this order).
    # We therefore need to normalize the rhs indices relative to the rhs table.
    rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

    # 4. dask can only merge on the same column names.
    # We therefore create new columns on purpose, which have a distinct name.
    assert len(lhs_on) == len(rhs_on)
    if lhs_on:
        # 5. Now we can finally merge on these columns.
        # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
        # plus the added columns.
        df = self._join_on_columns(
            df_lhs_renamed, df_rhs_renamed, lhs_on, rhs_on, join_type,
        )
    else:
        # 5. We are in the complex join case
        # where we have no column to merge on.
        # This means we have no other chance than to merge
        # everything with everything...

        # TODO: we should implement a shortcut
        # for filter conditions that are always false

        def merge_single_partitions(lhs_partition, rhs_partition):
            # Do a cross join with the two partitions
            # TODO: it would be nice to apply the filter already here
            # problem: this would mean we need to ship the rex to the
            # workers (as this is executed on the workers),
            # which is definitely not possible (java dependency, JVM start...)
            lhs_partition = lhs_partition.assign(common=1)
            rhs_partition = rhs_partition.assign(common=1)

            return lhs_partition.merge(rhs_partition, on="common").drop(
                columns="common"
            )

        # Iterate nested over all partitions from lhs and rhs and merge them
        name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
        dsk = {
            (name, i * df_rhs_renamed.npartitions + j): (
                merge_single_partitions,
                (df_lhs_renamed._name, i),
                (df_rhs_renamed._name, j),
            )
            for i in range(df_lhs_renamed.npartitions)
            for j in range(df_rhs_renamed.npartitions)
        }

        graph = HighLevelGraph.from_collections(
            name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed]
        )

        meta = dd.dispatch.concat(
            [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty], axis=1
        )
        # TODO: Do we know the divisions in any way here?
        divisions = [None] * (len(dsk) + 1)
        df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

        warnings.warn(
            "Need to do a cross-join, which is typically very resource heavy",
            ResourceWarning,
        )

    # 6. So the next step is to make sure
    # we have the correct column order (and to remove the temporary join columns)
    correct_column_order = list(df_lhs_renamed.columns) + list(
        df_rhs_renamed.columns
    )
    cc = ColumnContainer(df.columns).limit_to(correct_column_order)

    # and to rename them like the rel specifies
    row_type = rel.getRowType()
    field_specifications = [str(f) for f in row_type.getFieldNames()]
    cc = cc.rename(
        {
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        }
    )
    cc = self.fix_column_to_row_type(cc, row_type)
    dc = DataContainer(df, cc)

    # 7. Last but not least we apply any filters by and-chaining together the filters
    if filter_condition:
        # This line is a bit of code duplication with RexCallPlugin - but I guess
        # it is worth keeping it separate
        filter_condition = reduce(
            operator.and_,
            [
                RexConverter.convert(rex, dc, context=context)
                for rex in filter_condition
            ],
        )
        logger.debug(f"Additionally applying filter {filter_condition}")
        df = filter_or_scalar(df, filter_condition)
        dc = DataContainer(df, cc)

    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
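# A minimal, self-contained sketch (pandas only, not part of the library) of the
# cross-join trick used in ``merge_single_partitions`` above: adding a constant
# "common" column to both sides turns an ordinary merge into a cartesian product.
# The column names are made up.
import pandas as pd

lhs = pd.DataFrame({"a": [1, 2]})
rhs = pd.DataFrame({"b": ["x", "y", "z"]})

cross = (
    lhs.assign(common=1)
    .merge(rhs.assign(common=1), on="common")
    .drop(columns="common")
)
print(cross)  # 2 * 3 = 6 rows, every combination of a and b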