def disambiguate_integration_column_identifier(identifier, integration_name, table,
                                               initial_name_as_alias=False):
    """Removes integration name from column if it's present, adds table path if it's absent"""
    column_table_ref = [table.alias.to_string(alias=False)] if table.alias else table.parts
    parts = list(identifier.parts)

    if len(parts) > 1:
        if parts[0] == integration_name:
            parts = parts[1:]

    if len(parts) > 1:
        if (len(parts) <= len(column_table_ref)
                or parts[:len(column_table_ref)] != column_table_ref):
            raise PlanningException(
                f'Tried to query column {identifier.to_tree()} from integration {integration_name} table {column_table_ref}, but a different table name has been specified.')
    elif len(parts) == 1:
        # if parts[0] != column_table_ref:
        parts = column_table_ref + parts

    new_identifier = Identifier(parts=parts)
    if identifier.alias:
        new_identifier.alias = identifier.alias
    elif initial_name_as_alias:
        new_identifier.alias = Identifier(parts[-1])

    return new_identifier
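# Illustrative usage sketch only (not part of the original module); column and table
# names below are hypothetical. A bare column gets the table path prepended, and an
# explicit integration prefix gets stripped:
#
#     col = Identifier(parts=['int1', 'tab1', 'col1'])
#     tab = Identifier(parts=['tab1'])
#     out = disambiguate_integration_column_identifier(col, 'int1', tab)
#     assert out.parts == ['tab1', 'col1']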
def test_where_and_or_precedence(self, dialect):
    sql = "SELECT col1 FROM tab WHERE col1 AND col2 OR col3"
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Identifier.from_path_str('col1')],
        from_table=Identifier.from_path_str('tab'),
        where=BinaryOperation(
            op='or',
            args=(
                BinaryOperation(
                    op='and',
                    args=(
                        Identifier.from_path_str('col1'),
                        Identifier.from_path_str('col2'),
                    )),
                Identifier.from_path_str('col3'),
            )))

    assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()

    sql = "SELECT col1 FROM tab WHERE col1 = 1 AND col2 = 1 OR col3 = 1"
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Identifier.from_path_str('col1')],
        from_table=Identifier.from_path_str('tab'),
        where=BinaryOperation(
            op='or',
            args=(
                BinaryOperation(
                    op='and',
                    args=(
                        BinaryOperation(op='=',
                                        args=(Identifier.from_path_str('col1'), Constant(1))),
                        BinaryOperation(op='=',
                                        args=(Identifier.from_path_str('col2'), Constant(1))),
                    )),
                BinaryOperation(op='=',
                                args=(Identifier.from_path_str('col3'), Constant(1))),
            )))

    assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()
def test_select_status(self, dialect):
    sql = 'select status from mindsdb.predictors'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Identifier.from_path_str("status")],
        from_table=Identifier.from_path_str('mindsdb.predictors'))

    assert ast.to_tree() == expected_ast.to_tree()
    # assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
def test_not_in(self, dialect):
    sql = "SELECT column1 NOT IN column2"
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(targets=[
        BinaryOperation(op='not in',
                        args=(Identifier.from_path_str("column1"),
                              Identifier.from_path_str("column2")))
    ])

    assert ast.to_tree() == expected_ast.to_tree()
    assert str(ast) == str(expected_ast)
def test_is_false(self, dialect):
    sql = "SELECT col1 FROM t1 WHERE col1 IS FALSE"
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Identifier.from_path_str("col1")],
        from_table=Identifier.from_path_str('t1'),
        where=BinaryOperation('is',
                              args=(Identifier.from_path_str('col1'), Constant(False))))

    assert str(ast).lower() == sql.lower()
    assert ast.to_tree() == expected_ast.to_tree()
    assert str(ast) == str(expected_ast)
def test_operation_converts_to_lowercase(self, dialect):
    sql = 'SELECT column1 IS column2 FROM tab'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[
            BinaryOperation(op='is',
                            args=(Identifier.from_path_str('column1'),
                                  Identifier.from_path_str('column2'))),
        ],
        from_table=Identifier.from_path_str('tab'))

    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()
def test_between(self, dialect):
    sql = "SELECT col1 FROM t1 WHERE col1 BETWEEN a AND b"
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Identifier.from_path_str("col1")],
        from_table=Identifier.from_path_str('t1'),
        where=BetweenOperation(args=(Identifier.from_path_str('col1'),
                                     Identifier.from_path_str('a'),
                                     Identifier.from_path_str('b'))))

    assert str(ast).lower() == sql.lower()
    assert ast.to_tree() == expected_ast.to_tree()
    assert str(ast) == str(expected_ast)
def test_select_from_engines(self, dialect):
    sql = 'select * from engines'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(targets=[Star()],
                          from_table=Identifier.from_path_str('engines'))

    assert ast.to_tree() == expected_ast.to_tree()
    assert str(ast) == str(expected_ast)
def test_select_dquote_alias(self, dialect):
    sql = """
        select a as "database" from information_schema.tables "database"
    """
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Identifier('a', alias=Identifier('database'))],
        from_table=Identifier(parts=['information_schema', 'tables'],
                              alias=Identifier('database')),
    )

    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()
def test_select_function_one_arg(self, dialect):
    funcs = ['sum', 'min', 'max', 'some_custom_function']
    for func in funcs:
        sql = f'SELECT {func}(column) FROM tab'
        ast = parse_sql(sql, dialect=dialect)
        expected_ast = Select(
            targets=[Function(op=func, args=(Identifier.from_path_str('column'),))],
            from_table=Identifier.from_path_str('tab'),
        )

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
def plan_join_two_tables(self, join):
    select_left_step = self.plan_integration_select(
        Select(targets=[Star()], from_table=join.left))
    select_right_step = self.plan_integration_select(
        Select(targets=[Star()], from_table=join.right))

    left_integration_name, left_table = self.get_integration_path_from_identifier_or_error(join.left)
    right_integration_name, right_table = self.get_integration_path_from_identifier_or_error(join.right)

    left_table_path = left_table.to_string(alias=False)
    right_table_path = right_table.to_string(alias=False)

    new_condition_args = []
    for arg in join.condition.args:
        if isinstance(arg, Identifier):
            if left_table_path in arg.parts:
                new_condition_args.append(
                    disambiguate_integration_column_identifier(arg, left_integration_name, left_table))
            elif right_table_path in arg.parts:
                new_condition_args.append(
                    disambiguate_integration_column_identifier(arg, right_integration_name, right_table))
            else:
                raise PlanningException(
                    f'Wrong table or no source table in join condition for column: {str(arg)}')
        else:
            new_condition_args.append(arg)

    new_join = copy.deepcopy(join)
    new_join.condition.args = new_condition_args
    new_join.left = Identifier(left_table_path, alias=left_table.alias)
    new_join.right = Identifier(right_table_path, alias=right_table.alias)

    # FIXME: INFORMATION_SCHEMA with condition
    # clear join condition for INFORMATION_SCHEMA
    if right_integration_name == 'INFORMATION_SCHEMA':
        new_join.condition = None

    return self.plan.add_step(
        JoinStep(left=select_left_step.result,
                 right=select_right_step.result,
                 query=new_join))
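# Illustrative note only (not part of the original planner), with hypothetical names:
# for a query such as
#
#     SELECT * FROM int1.tab1 AS t1 JOIN int1.tab2 AS t2 ON t1.id = t2.id
#
# each side of the join becomes its own integration select step, and the ON-clause
# columns are rewritten by disambiguate_integration_column_identifier so the JoinStep
# query references them as 't1.id' / 't2.id' without the integration prefix.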
def test_show_index(self):
    sql = "SHOW INDEX FROM predictors"
    ast = parse_sql(sql, dialect='mysql')
    expected_ast = Show(category='INDEX', from_table=Identifier('predictors'))

    assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()
def test_select_from_view_kw(self, dialect):
    for table in ['view.t', 'views.t']:
        sql = f'select * from {table}'
        ast = parse_sql(sql, dialect=dialect)
        expected_ast = Select(targets=[Star()],
                              from_table=Identifier.from_path_str(table))

        assert ast.to_tree() == expected_ast.to_tree()
        assert str(ast) == str(expected_ast)
def test_operator_chained_and(self, dialect):
    sql = "SELECT column1 AND column2 AND column3"
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(targets=[
        BinaryOperation(op='AND',
                        args=(
                            BinaryOperation(op='and',
                                            args=(Identifier.from_path_str("column1"),
                                                  Identifier.from_path_str("column2"))),
                            Identifier.from_path_str("column3"),
                        ))
    ])

    assert str(ast).lower() == sql.lower()
    assert ast.to_tree() == expected_ast.to_tree()
def test_select_binary_operations(self, dialect):
    for op in ['+', '-', '/', '*', '%', '=', '!=', '>', '<', '>=', '<=',
               'is', 'IS NOT', 'like', 'in', 'and', 'or', '||']:
        sql = f'SELECT column1 {op.upper()} column2 FROM tab'
        ast = parse_sql(sql, dialect=dialect)
        expected_ast = Select(
            targets=[
                BinaryOperation(op=op,
                                args=(Identifier.from_path_str('column1'),
                                      Identifier.from_path_str('column2'))),
            ],
            from_table=Identifier.from_path_str('tab'))

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
def disambiguate_predictor_column_identifier(identifier, predictor):
    """Removes the predictor name (or its alias) prefix from a column identifier if it's present"""
    table_ref = predictor.alias.parts_to_str() if predictor.alias else predictor.parts_to_str()
    parts = list(identifier.parts)
    if parts[0] == table_ref:
        parts = parts[1:]

    new_identifier = Identifier(parts=parts)
    return new_identifier
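# Illustrative usage sketch only (hypothetical predictor and column names): given a
# predictor aliased as 'tb', 'tb.price' becomes just 'price' so it can be matched
# against the predictor's output columns:
#
#     pred = Identifier(parts=['mindsdb', 'home_rentals'], alias=Identifier('tb'))
#     out = disambiguate_predictor_column_identifier(Identifier(parts=['tb', 'price']), pred)
#     assert out.parts == ['price']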
def test_operator_precedence_sum_mult_parentheses(self, dialect):
    sql = 'SELECT (column1 + column2) * column3'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(targets=[
        BinaryOperation(
            op='*',
            args=(
                BinaryOperation(op='+',
                                args=(Identifier.from_path_str('column1'),
                                      Identifier.from_path_str('column2')),
                                parentheses=True),
                Identifier.from_path_str('column3'),
            ),
        )
    ])

    assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()
def test_select_function_no_args(self, dialect):
    sql = 'SELECT database() FROM tab'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(
        targets=[Function(op='database', args=[])],
        from_table=Identifier.from_path_str('tab'),
    )

    assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
    assert ast.to_tree() == expected_ast.to_tree()
def plan_fetch_timeseries_partitions(self, query, table, predictor_group_by_names):
    targets = [Identifier(column) for column in predictor_group_by_names]

    query = Select(
        distinct=True,
        targets=targets,
        from_table=table,
        where=query.where,
    )
    select_step = self.plan_integration_select(query)
    return select_step
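# Illustrative note only (not part of the original planner): for hypothetical
# group-by columns ['state'] this plans an integration select roughly equivalent to
#
#     SELECT DISTINCT state FROM <table> WHERE <original where>
#
# whose result rows later feed the MapReduceStep as partition values.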
def add_order_not_null(condition):
    order_field_not_null = BinaryOperation(
        op='is not',
        args=[Identifier(parts=[predictor_time_column_name]), NullConstant()])
    if condition is not None:
        condition = BinaryOperation(op='and',
                                    args=[condition, order_field_not_null])
    else:
        condition = order_field_not_null
    return condition
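# Illustrative note only (hypothetical column name): with predictor_time_column_name
# set to 'time', an existing condition c becomes
#
#     BinaryOperation('and', args=[c, <time IS NOT NULL>])
#
# and a missing condition (None) becomes just the "time IS NOT NULL" check, so the
# integration selects never fetch rows with an empty order-by value.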
def test_select_variable_complex(self):
    sql = """SELECT * FROM tab1 WHERE column1 in (SELECT column2 + @variable FROM t2)"""
    ast = parse_sql(sql, dialect='mysql')
    expected_ast = Select(
        targets=[Star()],
        from_table=Identifier('tab1'),
        where=BinaryOperation(
            op='in',
            args=(
                Identifier('column1'),
                Select(targets=[
                    BinaryOperation(op='+',
                                    args=[Identifier('column2'), Variable('variable')])
                ],
                       from_table=Identifier('t2'),
                       parentheses=True))))

    assert ast.to_tree() == expected_ast.to_tree()
    assert str(ast).lower() == sql.lower()
    assert str(ast) == str(expected_ast)
def test_unary_is_special_values(self, dialect):
    args = [('NULL', NullConstant()),
            ('TRUE', Constant(value=True)),
            ('FALSE', Constant(value=False))]
    for sql_arg, python_obj in args:
        sql = f"""SELECT column1 IS {sql_arg}"""
        ast = parse_sql(sql, dialect=dialect)
        expected_ast = Select(targets=[
            BinaryOperation(op='IS',
                            args=(Identifier.from_path_str("column1"), python_obj))
        ])

        assert str(ast).lower() == sql.lower()
        assert ast.to_tree() == expected_ast.to_tree()
def test_select_in_operation(self, dialect):
    sql = """SELECT * FROM t1 WHERE col1 IN ("a", "b")"""
    ast = parse_sql(sql, dialect=dialect)

    assert isinstance(ast, Select)
    assert ast.where
    expected_where = BinaryOperation(op='IN', args=[
        Identifier.from_path_str('col1'),
        Tuple(items=[Constant('a'), Constant("b")]),
    ])
    assert ast.where.to_tree() == expected_where.to_tree()
    assert ast.where == expected_where
def plan_project(self, query, dataframe, ignore_doubles=False):
    out_identifiers = []
    for target in query.targets:
        if isinstance(target, (Identifier, Star, Function, Constant)):
            out_identifiers.append(target)
        else:
            new_identifier = Identifier(str(target.to_string(alias=False)),
                                        alias=target.alias)
            out_identifiers.append(new_identifier)
    return self.plan.add_step(
        ProjectStep(dataframe=dataframe,
                    columns=out_identifiers,
                    ignore_doubles=ignore_doubles))
def test_operator_precedence_or_and(self, dialect):
    sql = 'SELECT column1 OR column2 AND column3'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(targets=[
        BinaryOperation(op='or',
                        args=(
                            Identifier.from_path_str('column1'),
                            BinaryOperation(op='and',
                                            args=(Identifier.from_path_str('column2'),
                                                  Identifier.from_path_str('column3'))),
                        ))
    ])

    assert str(ast).lower() == sql.lower()
    assert ast == expected_ast
    assert ast.to_tree() == expected_ast.to_tree()

    sql = 'SELECT column1 AND column2 OR column3'
    ast = parse_sql(sql, dialect=dialect)
    expected_ast = Select(targets=[
        BinaryOperation(op='or',
                        args=(
                            BinaryOperation(op='and',
                                            args=(Identifier.from_path_str('column1'),
                                                  Identifier.from_path_str('column2'))),
                            Identifier.from_path_str('column3'),
                        ))
    ])

    assert str(ast).lower() == sql.lower()
    assert ast == expected_ast
    assert ast.to_tree() == expected_ast.to_tree()
def plan_join(self, query, integration=None):
    join = query.from_table
    join_left = join.left
    join_right = join.right

    if isinstance(join_left, Select):
        # dbt query.
        # TODO support complex query. Only one table is supported at the moment.
        if not isinstance(join_left.from_table, Identifier):
            raise PlanningException(f'Statement not supported: {query.to_string()}')

        # move properties to upper query
        query = join_left

        if query.from_table.alias is not None:
            table_alias = [query.from_table.alias.parts[0]]
        else:
            table_alias = query.from_table.parts

        def add_aliases(node, is_table, **kwargs):
            if not is_table and isinstance(node, Identifier):
                if len(node.parts) == 1:
                    # add table alias to field
                    node.parts = table_alias + node.parts

        query_traversal(query.where, add_aliases)

        if isinstance(query.from_table, Identifier):
            # DBT workaround: allow use tables without integration.
            # if table.part[0] not in integration - take integration name from create table command
            if (integration is not None
                    and query.from_table.parts[0] not in self.integrations):
                # add integration name to table
                query.from_table.parts.insert(0, integration)

        join_left = join_left.from_table

    aliased_fields = self.get_aliased_fields(query.targets)

    recursively_check_join_identifiers_for_ambiguity(query.where)
    recursively_check_join_identifiers_for_ambiguity(query.group_by, aliased_fields=aliased_fields)
    recursively_check_join_identifiers_for_ambiguity(query.having)
    recursively_check_join_identifiers_for_ambiguity(query.order_by, aliased_fields=aliased_fields)

    if isinstance(join_left, Identifier) and isinstance(join_right, Identifier):
        if self.is_predictor(join_left) and self.is_predictor(join_right):
            raise PlanningException(
                f'Can\'t join two predictors {str(join_left.parts[-1])} and {str(join_right.parts[-1])}')

        predictor_namespace = None
        predictor = None
        table = None
        predictor_is_left = False
        if self.is_predictor(join_left):
            predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                join_left, self.default_namespace)
            predictor_is_left = True
        else:
            table = join_left

        if self.is_predictor(join_right):
            predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                join_right, self.default_namespace)
        else:
            table = join_right

        last_step = None
        if predictor:
            # One argument is a table, another is a predictor
            # Apply mindsdb model to result of last dataframe fetch
            # Then join results of applying mindsdb with table
            predictor_name = self.predictor_names[predictor.to_string(alias=False).lower()]
            if self.predictor_metadata[predictor_name].get('timeseries'):
                predictor_steps = self.plan_timeseries_predictor(query, table, predictor_namespace, predictor)
            else:
                predictor_steps = self.plan_predictor(query, table, predictor_namespace, predictor)

            # add join
            # Update reference
            _, table = self.get_integration_path_from_identifier_or_error(table)
            table_alias = table.alias or Identifier(table.to_string(alias=False).replace('.', '_'))

            left = Identifier(predictor_steps['predictor'].result.ref_name,
                              alias=predictor.alias or Identifier(predictor.to_string(alias=False)))
            right = Identifier(predictor_steps['data'].result.ref_name, alias=table_alias)

            if not predictor_is_left:
                # swap join
                left, right = right, left
            new_join = Join(left=left, right=right, join_type=join.join_type)

            left = predictor_steps['predictor'].result
            right = predictor_steps['data'].result
            if not predictor_is_left:
                # swap join
                left, right = right, left

            last_step = self.plan.add_step(JoinStep(left=left, right=right, query=new_join))

            # limit from timeseries
            if predictor_steps.get('saved_limit'):
                last_step = self.plan.add_step(
                    LimitOffsetStep(dataframe=last_step.result,
                                    limit=predictor_steps['saved_limit']))
        else:
            # Both arguments are tables, join results of 2 dataframe fetches
            join_step = self.plan_join_two_tables(join)
            last_step = join_step

            if query.where:
                # FIXME: INFORMATION_SCHEMA with Where
                right_integration_name, _ = self.get_integration_path_from_identifier_or_error(join.right)
                if right_integration_name == 'INFORMATION_SCHEMA':
                    ...
                else:
                    last_step = self.plan.add_step(
                        FilterStep(dataframe=last_step.result, query=query.where))

            if query.group_by:
                group_by_targets = []
                for t in query.targets:
                    target_copy = copy.deepcopy(t)
                    target_copy.alias = None
                    group_by_targets.append(target_copy)
                last_step = self.plan.add_step(
                    GroupByStep(dataframe=last_step.result,
                                columns=query.group_by,
                                targets=group_by_targets))

            if query.having:
                last_step = self.plan.add_step(
                    FilterStep(dataframe=last_step.result, query=query.having))

            if query.order_by:
                last_step = self.plan.add_step(
                    OrderByStep(dataframe=last_step.result, order_by=query.order_by))

            if query.limit is not None or query.offset is not None:
                limit = query.limit.value if query.limit is not None else None
                offset = query.offset.value if query.offset is not None else None
                last_step = self.plan.add_step(
                    LimitOffsetStep(dataframe=last_step.result, limit=limit, offset=offset))
    else:
        raise PlanningException(
            'Join of unsupported objects, currently only tables and predictors can be joined.')

    return self.plan_project(query, last_step.result)
def execute_step(self, step, steps_data):
    if type(step) == GetPredictorColumns:
        predictor_name = step.predictor.parts[-1]
        dn = self.datahub.get(self.mindsdb_database_name)
        columns = dn.get_table_columns(predictor_name)
        columns = [(column_name, column_name) for column_name in columns]
        data = {
            'values': [],
            'columns': {
                (self.mindsdb_database_name, predictor_name, predictor_name): columns
            },
            'tables': [(self.mindsdb_database_name, predictor_name, predictor_name)]
        }
    elif type(step) == GetTableColumns:
        table = step.table
        dn = self.datahub.get(step.namespace)
        ds_query = Select(from_table=Identifier(table), targets=[Star()])
        dso, _ = dn.data_store.create_datasource(dn.integration_name,
                                                 {'query': ds_query.to_string()})
        columns = dso.get_columns()
        cols = []
        for col in columns:
            if not isinstance(col, dict):
                col = {'name': col, 'type': 'str'}
            cols.append(col)
        table_alias = (self.database, table, table)
        data = {
            'values': [],
            'columns': {
                table_alias: cols
            },
            'tables': [table_alias]
        }
    elif type(step) == FetchDataframeStep:
        data = self._fetch_dataframe_step(step)
    elif type(step) == UnionStep:
        raise ErNotSupportedYet('Union step is not implemented')
        # TODO add union support
        # left_data = steps_data[step.left.step_num]
        # right_data = steps_data[step.right.step_num]
        # data = left_data + right_data
    elif type(step) == MapReduceStep:
        try:
            if step.reduce != 'union':
                raise Exception(f'Unknown MapReduceStep type: {step.reduce}')

            step_data = steps_data[step.values.step_num]
            vars = []
            step_data_values = step_data['values']
            for row in step_data_values:
                var_group = {}
                vars.append(var_group)
                for row_data in row.values():
                    for name, value in row_data.items():
                        if name[0] != '__mindsdb_row_id':
                            var_group[name[1] or name[0]] = value

            data = {'values': [], 'columns': {}, 'tables': []}

            substep = step.step
            if type(substep) == FetchDataframeStep:
                query = substep.query
                for var_group in vars:
                    markQueryVar(query.where)
                    for name, value in var_group.items():
                        replaceQueryVar(query.where, value, name)
                    sub_data = self._fetch_dataframe_step(substep)
                    if len(data['columns']) == 0:
                        data['columns'] = sub_data['columns']
                    if len(data['tables']) == 0:
                        data['tables'] = sub_data['tables']
                    data['values'].extend(sub_data['values'])
                    unmarkQueryVar(query.where)
            elif type(substep) == MultipleSteps:
                data = self._multiple_steps_reduce(substep, vars)
            else:
                raise Exception(f'Unknown step type: {step.step}')
        except Exception as e:
            raise SqlApiException(f'error in map reduce step: {e}') from e
    elif type(step) == MultipleSteps:
        if step.reduce != 'union':
            raise Exception(
                f"Only MultipleSteps with type = 'union' is supported. Got '{step.reduce}'")
        data = None
        for substep in step.steps:
            subdata = self.execute_step(substep, steps_data)
            if data is None:
                data = subdata
            else:
                data['values'].extend(subdata['values'])
    elif type(step) == ApplyPredictorRowStep:
        try:
            predictor = '.'.join(step.predictor.parts)
            dn = self.datahub.get(self.mindsdb_database_name)
            where_data = step.row_dict

            data = dn.select(
                table=predictor,
                columns=None,
                where_data=where_data,
                integration_name=self.session.integration,
                integration_type=self.session.integration_type)

            data = [{(key, key): value for key, value in row.items()} for row in data]

            table_name = get_preditor_alias(step, self.database)
            values = [{table_name: x} for x in data]
            columns = {table_name: []}
            if len(data) > 0:
                row = data[0]
                columns[table_name] = list(row.keys())
            # TODO else

            data = {
                'values': values,
                'columns': columns,
                'tables': [table_name]
            }
        except Exception as e:
            raise SqlApiException(f'error in apply predictor row step: {e}') from e
    elif type(step) in (ApplyPredictorStep, ApplyTimeseriesPredictorStep):
        try:
            dn = self.datahub.get(self.mindsdb_database_name)
            predictor = '.'.join(step.predictor.parts)

            where_data = []
            for row in steps_data[step.dataframe.step_num]['values']:
                new_row = {}
                for table_name in row:
                    keys_intersection = set(new_row) & set(row[table_name])
                    if len(keys_intersection) > 0:
                        raise Exception(
                            f'The predictor got two identical keys from different datasources: {keys_intersection}')
                    new_row.update(row[table_name])
                where_data.append(new_row)

            where_data = [{key[1]: value for key, value in row.items()} for row in where_data]

            is_timeseries = self.planner.predictor_metadata[predictor]['timeseries']
            _mdb_make_predictions = None
            if is_timeseries:
                if 'LATEST' in self.query_str:
                    _mdb_make_predictions = False
                else:
                    _mdb_make_predictions = True
                for row in where_data:
                    if '__mdb_make_predictions' not in row:
                        row['__mdb_make_predictions'] = _mdb_make_predictions

            for row in where_data:
                for key in row:
                    if isinstance(row[key], datetime.date):
                        row[key] = str(row[key])

            data = dn.select(
                table=predictor,
                columns=None,
                where_data=where_data,
                integration_name=self.session.integration,
                integration_type=self.session.integration_type)

            # if is_timeseries:
            #     if 'LATEST' not in self.raw:
            #         # remove additional records from predictor results:
            #         # first 'window_size' and last 'horizon' records
            #         # otherwise there are many unexpected rows in prediction result:
            #         # ----------------------------------------------------------------------------------------
            #         # mysql> SELECT tb.time, tb.state, tb.pnew_case, tb.new_case from
            #         # MYSQL_LOCAL.test_data.covid AS
            #         # ta JOIN mindsdb.covid_hor3 AS tb
            #         # WHERE ta.state = "CA" AND ta.time BETWEEN "2020-10-19" AND "2020-10-20";
            #         # ----------------------------------------------------------------------------------------
            #         # +------------+-------+-----------+----------+
            #         # | time       | state | pnew_case | new_case |
            #         # +------------+-------+-----------+----------+
            #         # | 2020-10-09 | CA    | 0         | 2862     |
            #         # | 2020-10-10 | CA    | 0         | 2979     |
            #         # | 2020-10-11 | CA    | 0         | 3075     |
            #         # | 2020-10-12 | CA    | 0         | 3329     |
            #         # | 2020-10-13 | CA    | 0         | 2666     |
            #         # | 2020-10-14 | CA    | 0         | 2378     |
            #         # | 2020-10-15 | CA    | 0         | 3449     |
            #         # | 2020-10-16 | CA    | 0         | 3803     |
            #         # | 2020-10-17 | CA    | 0         | 4170     |
            #         # | 2020-10-18 | CA    | 0         | 3806     |
            #         # | 2020-10-19 | CA    | 0         | 3286     |
            #         # | 2020-10-20 | CA    | 0         | 3474     |
            #         # | 2020-10-21 | CA    | 0         | 3474     |
            #         # | 2020-10-22 | CA    | 0         | 3474     |
            #         # +------------+-------+-----------+----------+
            #         # 14 rows in set (2.52 sec)
            #         window_size = predictor_metadata[predictor]['window']
            #         horizon = predictor_metadata[predictor]['horizon']
            #         if len(data) >= (window_size + horizon):
            #             data = data[window_size:]
            #         if len(data) > horizon and horizon > 1:
            #             data = data[:-horizon + 1]

            data = [{(key, key): value for key, value in row.items()} for row in data]

            table_name = get_preditor_alias(step, self.database)
            values = [{table_name: x} for x in data]
            columns = {table_name: []}
            if len(data) > 0:
                row = data[0]
                columns[table_name] = list(row.keys())
            # TODO else

            data = {
                'values': values,
                'columns': columns,
                'tables': [table_name]
            }
        except Exception as e:
            raise SqlApiException(f'error in apply predictor step: {e}') from e
    elif type(step) == JoinStep:
        try:
            left_data = steps_data[step.left.step_num]
            right_data = steps_data[step.right.step_num]

            # FIXME https://github.com/mindsdb/mindsdb_sql/issues/136
            # is_timeseries = False
            # if True in [type(step) == ApplyTimeseriesPredictorStep for step in plan.steps]:
            #     right_data = steps_data[step.left.step_num]
            #     left_data = steps_data[step.right.step_num]
            #     is_timeseries = True

            if step.query.condition is not None:
                raise Exception('At this moment supported only JOIN without condition')
            if step.query.join_type.upper() not in ('LEFT JOIN', 'JOIN'):
                raise Exception('At this moment supported only JOIN and LEFT JOIN')
            if (len(left_data['tables']) != 1 or len(right_data['tables']) != 1
                    or left_data['tables'][0] == right_data['tables'][0]):
                raise Exception('At this moment supported only JOIN of two different tables')

            data = {
                'values': [],
                'columns': {},
                'tables': list(set(left_data['tables'] + right_data['tables']))
            }

            for data_part in [left_data, right_data]:
                for table_name in data_part['columns']:
                    if table_name not in data['columns']:
                        data['columns'][table_name] = data_part['columns'][table_name]
                    else:
                        data['columns'][table_name].extend(data_part['columns'][table_name])
            for table_name in data['columns']:
                data['columns'][table_name] = list(set(data['columns'][table_name]))

            left_key = left_data['tables'][0]
            right_key = right_data['tables'][0]

            left_columns_map = {}
            left_columns_map_reverse = {}
            for i, column_name in enumerate(left_data['columns'][left_key]):
                left_columns_map[f'a{i}'] = column_name
                left_columns_map_reverse[column_name] = f'a{i}'

            right_columns_map = {}
            right_columns_map_reverse = {}
            for i, column_name in enumerate(right_data['columns'][right_key]):
                right_columns_map[f'b{i}'] = column_name
                right_columns_map_reverse[column_name] = f'b{i}'

            left_df_data = []
            for row in left_data['values']:
                row = row[left_key]
                left_df_data.append({left_columns_map_reverse[key]: value
                                     for key, value in row.items()})

            right_df_data = []
            for row in right_data['values']:
                row = row[right_key]
                right_df_data.append({right_columns_map_reverse[key]: value
                                      for key, value in row.items()})

            df_a = pd.DataFrame(left_df_data)
            df_b = pd.DataFrame(right_df_data)

            a_name = f'a{round(time.time() * 1000)}'
            b_name = f'b{round(time.time() * 1000)}'
            con = duckdb.connect(database=':memory:')
            con.register(a_name, df_a)
            con.register(b_name, df_b)
            resp_df = con.execute(f"""
                SELECT * FROM {a_name} as ta full join {b_name} as tb
                ON ta.{left_columns_map_reverse[('__mindsdb_row_id', '__mindsdb_row_id')]}
                 = tb.{right_columns_map_reverse[('__mindsdb_row_id', '__mindsdb_row_id')]}
            """).fetchdf()
            con.unregister(a_name)
            con.unregister(b_name)
            con.close()
            resp_df = resp_df.where(pd.notnull(resp_df), None)
            resp_dict = resp_df.to_dict(orient='records')

            for row in resp_dict:
                new_row = {left_key: {}, right_key: {}}
                for key, value in row.items():
                    if key.startswith('a'):
                        new_row[left_key][left_columns_map[key]] = value
                    else:
                        new_row[right_key][right_columns_map[key]] = value
                data['values'].append(new_row)

            # remove all records with empty data from predictor from join result
            # otherwise there are empty records in the final result:
            # +------------+------------+-------+-----------+----------+
            # | time       | time       | state | pnew_case | new_case |
            # +------------+------------+-------+-----------+----------+
            # | 2020-10-21 | 2020-10-24 | CA    | 0.0       | 5945.0   |
            # | 2020-10-22 | 2020-10-23 | CA    | 0.0       | 6141.0   |
            # | 2020-10-23 | 2020-10-22 | CA    | 0.0       | 2940.0   |
            # | 2020-10-24 | 2020-10-21 | CA    | 0.0       | 3707.0   |
            # | NULL       | 2020-10-20 | NULL  | nan       | nan      |
            # | NULL       | 2020-10-19 | NULL  | nan       | nan      |
            # | NULL       | 2020-10-18 | NULL  | nan       | nan      |
            # | NULL       | 2020-10-17 | NULL  | nan       | nan      |
            # | NULL       | 2020-10-16 | NULL  | nan       | nan      |
            # +------------+------------+-------+-----------+----------+
            # 9 rows in set (2.07 sec)

            # if is_timeseries:
            #     data_values = []
            #     for row in data['values']:
            #         for key in row:
            #             if 'mindsdb' in key:
            #                 if not is_empty_prediction_row(row[key]):
            #                     data_values.append(row)
            #                     break
            #     data['values'] = data_values
        except Exception as e:
            raise SqlApiException(f'error in join step: {e}') from e
    elif type(step) == FilterStep:
        raise ErNotSupportedYet('FilterStep is not implemented')
    # elif type(step) == ApplyTimeseriesPredictorStep:
    #     raise Exception('ApplyTimeseriesPredictorStep is not implemented')
    elif type(step) == LimitOffsetStep:
        try:
            step_data = steps_data[step.dataframe.step_num]
            data = {
                'values': step_data['values'].copy(),
                'columns': step_data['columns'].copy(),
                'tables': step_data['tables'].copy()
            }
            if isinstance(step.offset, Constant) and isinstance(step.offset.value, int):
                data['values'] = data['values'][step.offset.value:]
            if isinstance(step.limit, Constant) and isinstance(step.limit.value, int):
                data['values'] = data['values'][:step.limit.value]
        except Exception as e:
            raise SqlApiException(f'error in limit offset step: {e}') from e
    elif type(step) == ProjectStep:
        try:
            step_data = steps_data[step.dataframe.step_num]
            columns_list = []
            for column_identifier in step.columns:
                table_name = None
                if type(column_identifier) == Star:
                    for table_name, table_columns_list in step_data['columns'].items():
                        for column in table_columns_list:
                            columns_list.append(
                                Column(database=table_name[0],
                                       table_name=table_name[1],
                                       table_alias=table_name[2],
                                       name=column[0],
                                       alias=column[1]))
                elif type(column_identifier) == Identifier:
                    column_name_parts = column_identifier.parts
                    column_alias = None if column_identifier.alias is None \
                        else '.'.join(column_identifier.alias.parts)
                    if len(column_name_parts) > 2:
                        raise Exception(
                            f'Column name must contain no more than 2 parts. Got name: {column_identifier}')
                    elif len(column_name_parts) == 1:
                        column_name = column_name_parts[0]

                        appropriate_table = None
                        if len(step_data['tables']) == 1:
                            appropriate_table = step_data['tables'][0]
                        else:
                            for table_name, table_columns in step_data['columns'].items():
                                table_column_names_list = [x[1] or x[0] for x in table_columns]
                                column_exists = get_column_in_case(table_column_names_list, column_name)
                                if column_exists:
                                    if appropriate_table is not None:
                                        raise Exception(
                                            f'Found multiple appropriate tables for column {column_name}')
                                    else:
                                        appropriate_table = table_name
                        if appropriate_table is None:
                            # it is probably a constant
                            # FIXME https://github.com/mindsdb/mindsdb_sql/issues/133
                            # column_name = column_name.strip("'")
                            # name_or_alias = column_alias or column_name
                            # column_alias = name_or_alias
                            # for row in step_data['values']:
                            #     for table in row:
                            #         row[table][(column_name, name_or_alias)] = row[table][(column_name, column_name)]
                            # appropriate_table = step_data['tables'][0]
                            # FIXME: must be exception
                            columns_list.append(
                                Column(database=appropriate_table[0],
                                       table_name=appropriate_table[1],
                                       table_alias=appropriate_table[2],
                                       name=column_alias))
                        else:
                            columns_list.append(
                                Column(database=appropriate_table[0],
                                       table_name=appropriate_table[1],
                                       table_alias=appropriate_table[2],
                                       name=column_name,
                                       alias=column_alias))  # column_name
                    elif len(column_name_parts) == 2:
                        table_name_or_alias = column_name_parts[0]
                        column_name = column_name_parts[1]

                        appropriate_table = None
                        for table_name, table_columns in step_data['columns'].items():
                            table_column_names_list = [x[1] or x[0] for x in table_columns]
                            checking_table_name_or_alias = table_name[2] or table_name[1]
                            if table_name_or_alias.lower() == checking_table_name_or_alias.lower():
                                column_exists = get_column_in_case(table_column_names_list, column_name)
                                if column_exists:
                                    appropriate_table = table_name
                                    break
                                else:
                                    raise Exception(
                                        f'Can not find column "{column_name}" in table "{table_name}"')
                        if appropriate_table is None:
                            raise Exception(
                                f'Can not find appropriate table for column {column_name}')

                        columns_to_copy = None
                        table_column_names_list = [x[1] or x[0] for x in table_columns]
                        checking_name = get_column_in_case(table_column_names_list, column_name)
                        for column in step_data['columns'][appropriate_table]:
                            if column[0] == checking_name and (
                                    column[1] is None or column[1] == checking_name):
                                columns_to_copy = column
                                break
                        else:
                            raise Exception(
                                f'Can not find appropriate column in data: {(column_name, column_alias)}')

                        for row in step_data['values']:
                            row[appropriate_table][(column_name, column_alias)] = \
                                row[appropriate_table][columns_to_copy]

                        columns_list.append(
                            Column(database=appropriate_table[0],
                                   table_name=appropriate_table[1],
                                   table_alias=appropriate_table[2],
                                   name=column_name,
                                   alias=column_alias))
                    else:
                        raise Exception('Undefined column name')
                else:
                    raise Exception(f'Unexpected column name type: {column_identifier}')

            self.columns_list = columns_list
            data = step_data
        except Exception as e:
            raise SqlApiException(f'error on project step: {e}') from e
    else:
        raise SqlApiException(f'Unknown planner step: {step}')
    return data
def plan_timeseries_predictor(self, query, table, predictor_namespace, predictor):
    predictor_name = predictor.to_string(alias=False).lower()
    # to original case
    predictor_name = self.predictor_names[predictor_name]

    predictor_time_column_name = self.predictor_metadata[predictor_name]['order_by_column']
    predictor_group_by_names = self.predictor_metadata[predictor_name]['group_by_columns']
    if predictor_group_by_names is None:
        predictor_group_by_names = []
    predictor_window = self.predictor_metadata[predictor_name]['window']

    if query.order_by:
        raise PlanningException(
            f'Can\'t provide ORDER BY to time series predictor, it will be taken from predictor settings. Found: {query.order_by}')

    saved_limit = query.limit

    if query.group_by or query.having or query.offset:
        raise PlanningException(f'Unsupported query to timeseries predictor: {str(query)}')

    allowed_columns = [predictor_time_column_name.lower()]
    if len(predictor_group_by_names) > 0:
        allowed_columns += [i.lower() for i in predictor_group_by_names]
    validate_ts_where_condition(query.where, allowed_columns=allowed_columns)

    time_filter = find_time_filter(query.where, time_column_name=predictor_time_column_name)

    order_by = [OrderBy(Identifier(parts=[predictor_time_column_name]), direction='DESC')]

    preparation_where = copy.deepcopy(query.where)

    # add {order_by_field} is not null
    def add_order_not_null(condition):
        order_field_not_null = BinaryOperation(
            op='is not',
            args=[Identifier(parts=[predictor_time_column_name]), NullConstant()])
        if condition is not None:
            condition = BinaryOperation(op='and',
                                        args=[condition, order_field_not_null])
        else:
            condition = order_field_not_null
        return condition

    preparation_where2 = copy.deepcopy(preparation_where)
    preparation_where = add_order_not_null(preparation_where)

    # Obtain integration selects
    if isinstance(time_filter, BetweenOperation):
        between_from = time_filter.args[1]
        preparation_time_filter = BinaryOperation(
            '<', args=[Identifier(predictor_time_column_name), between_from])
        preparation_where2 = replace_time_filter(preparation_where2, time_filter,
                                                 preparation_time_filter)
        integration_select_1 = Select(targets=[Star()],
                                      from_table=table,
                                      where=add_order_not_null(preparation_where2),
                                      order_by=order_by,
                                      limit=Constant(predictor_window))

        integration_select_2 = Select(targets=[Star()],
                                      from_table=table,
                                      where=preparation_where,
                                      order_by=order_by)

        integration_selects = [integration_select_1, integration_select_2]
    elif isinstance(time_filter, BinaryOperation) and time_filter.op == '>' \
            and time_filter.args[1] == Latest():
        integration_select = Select(
            targets=[Star()],
            from_table=table,
            where=preparation_where,
            order_by=order_by,
            limit=Constant(predictor_window),
        )
        integration_select.where = find_and_remove_time_filter(integration_select.where, time_filter)
        integration_selects = [integration_select]
    elif isinstance(time_filter, BinaryOperation) and time_filter.op in ('>', '>='):
        time_filter_date = time_filter.args[1]
        preparation_time_filter_op = {'>': '<=', '>=': '<'}[time_filter.op]

        preparation_time_filter = BinaryOperation(
            preparation_time_filter_op,
            args=[Identifier(predictor_time_column_name), time_filter_date])
        preparation_where2 = replace_time_filter(preparation_where2, time_filter,
                                                 preparation_time_filter)
        integration_select_1 = Select(targets=[Star()],
                                      from_table=table,
                                      where=add_order_not_null(preparation_where2),
                                      order_by=order_by,
                                      limit=Constant(predictor_window))

        integration_select_2 = Select(targets=[Star()],
                                      from_table=table,
                                      where=preparation_where,
                                      order_by=order_by)

        integration_selects = [integration_select_1, integration_select_2]
    else:
        integration_select = Select(
            targets=[Star()],
            from_table=table,
            where=preparation_where,
            order_by=order_by,
        )
        integration_selects = [integration_select]

    if len(predictor_group_by_names) == 0:
        # ts query without grouping
        # one or multistep
        if len(integration_selects) == 1:
            select_partition_step = self.get_integration_select_step(integration_selects[0])
        else:
            select_partition_step = MultipleSteps(
                steps=[self.get_integration_select_step(s) for s in integration_selects],
                reduce='union')

        # fetch data step
        data_step = self.plan.add_step(select_partition_step)
    else:
        # inject $var to queries
        for integration_select in integration_selects:
            condition = integration_select.where
            for num, column in enumerate(predictor_group_by_names):
                cond = BinaryOperation('=',
                                       args=[Identifier(column), Constant(f'$var[{column}]')])

                # join to main condition
                if condition is None:
                    condition = cond
                else:
                    condition = BinaryOperation('and', args=[condition, cond])

            integration_select.where = condition

        # one or multistep
        if len(integration_selects) == 1:
            select_partition_step = self.get_integration_select_step(integration_selects[0])
        else:
            select_partition_step = MultipleSteps(
                steps=[self.get_integration_select_step(s) for s in integration_selects],
                reduce='union')

        # get grouping values
        no_time_filter_query = copy.deepcopy(query)
        no_time_filter_query.where = find_and_remove_time_filter(no_time_filter_query.where,
                                                                 time_filter)
        select_partitions_step = self.plan_fetch_timeseries_partitions(
            no_time_filter_query, table, predictor_group_by_names)

        # sub-query for every grouping value
        map_reduce_step = self.plan.add_step(
            MapReduceStep(values=select_partitions_step.result,
                          reduce='union',
                          step=select_partition_step))
        data_step = map_reduce_step

    predictor_step = self.plan.add_step(
        ApplyTimeseriesPredictorStep(
            output_time_filter=time_filter,
            namespace=predictor_namespace,
            dataframe=data_step.result,
            predictor=predictor,
        ))

    return {
        'predictor': predictor_step,
        'data': data_step,
        'saved_limit': saved_limit,
    }
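# Illustrative summary only (not part of the original module): for a grouped
# time-series predictor the plan built above is roughly
#
#     FetchDataframe(distinct group-by values)
#         -> MapReduce(reduce='union', step=per-partition select(s))
#         -> ApplyTimeseriesPredictor
#
# and the dict returned here lets plan_join() add the JoinStep afterwards and
# re-apply the user's LIMIT via 'saved_limit'.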