Example 1
def disambiguate_integration_column_identifier(identifier,
                                               integration_name,
                                               table,
                                               initial_name_as_alias=False):
    """Removes integration name from column if it's present, adds table path if it's absent"""
    column_table_ref = [table.alias.to_string(
        alias=False)] if table.alias else table.parts
    parts = list(identifier.parts)

    if len(parts) > 1:
        if parts[0] == integration_name:
            parts = parts[1:]

    if len(parts) > 1:
        if (len(parts) <= len(column_table_ref)
                or parts[:len(column_table_ref)] != column_table_ref):
            raise PlanningException(
                f'Tried to query column {identifier.to_tree()} from integration {integration_name} table {column_table_ref}, but a different table name has been specified.'
            )
    elif len(parts) == 1:
        # if parts[0] != column_table_ref:
        parts = column_table_ref + parts

    new_identifier = Identifier(parts=parts)
    if identifier.alias:
        new_identifier.alias = identifier.alias
    elif initial_name_as_alias:
        new_identifier.alias = Identifier(parts[-1])

    return new_identifier
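A quick usage sketch (assuming mindsdb_sql's AST classes; the integration and table names are illustrative):

from mindsdb_sql.parser.ast import Identifier

table = Identifier(parts=['tab1'])

# 'int1.tab1.col1' -> 'tab1.col1': the integration prefix is stripped
qualified = Identifier(parts=['int1', 'tab1', 'col1'])
assert disambiguate_integration_column_identifier(
    qualified, 'int1', table).parts == ['tab1', 'col1']

# 'col1' -> 'tab1.col1': a bare column gets the table path prepended
bare = Identifier(parts=['col1'])
assert disambiguate_integration_column_identifier(
    bare, 'int1', table).parts == ['tab1', 'col1']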
    def test_where_and_or_precedence(self, dialect):
        sql = "SELECT col1 FROM tab WHERE col1 AND col2 OR col3"
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[Identifier.from_path_str('col1')],
                              from_table=Identifier.from_path_str('tab'),
                              where=BinaryOperation(
                                  op='or',
                                  args=(
                                      BinaryOperation(
                                          op='and',
                                          args=(
                                              Identifier.from_path_str('col1'),
                                              Identifier.from_path_str('col2'),
                                          )),
                                      Identifier.from_path_str('col3'),
                                  )))

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()

        sql = "SELECT col1 FROM tab WHERE col1 = 1 AND col2 = 1 OR col3 = 1"
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(
            targets=[Identifier.from_path_str('col1')],
            from_table=Identifier.from_path_str('tab'),
            where=BinaryOperation(
                op='or',
                args=(
                    BinaryOperation(
                        op='and',
                        args=(
                            BinaryOperation(
                                op='=',
                                args=(
                                    Identifier.from_path_str('col1'),
                                    Constant(1),
                                )),
                            BinaryOperation(
                                op='=',
                                args=(
                                    Identifier.from_path_str('col2'),
                                    Constant(1),
                                )),
                        )),
                    BinaryOperation(op='=',
                                    args=(
                                        Identifier.from_path_str('col3'),
                                        Constant(1),
                                    )),
                )))

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
    def test_select_status(self, dialect):
        sql = 'select status from mindsdb.predictors'
        ast = parse_sql(sql, dialect=dialect)
        expected_ast = Select(
            targets=[Identifier.from_path_str("status")],
            from_table=Identifier.from_path_str('mindsdb.predictors'))
        assert ast.to_tree() == expected_ast.to_tree()
        # assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
    def test_not_in(self, dialect):
        sql = f"""SELECT column1 NOT   IN column2"""
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[
            BinaryOperation(op='not in',
                            args=(Identifier.from_path_str("column1"),
                                  Identifier.from_path_str("column2")))
        ], )

        assert ast.to_tree() == expected_ast.to_tree()
        assert str(ast) == str(expected_ast)
    def test_is_false(self, dialect):
        sql = "SELECT col1 FROM t1 WHERE col1 IS FALSE"
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[Identifier.from_path_str("col1")],
                              from_table=Identifier.from_path_str('t1'),
                              where=BinaryOperation(
                                  'is',
                                  args=(Identifier.from_path_str('col1'),
                                        Constant(False))))
        assert str(ast).lower() == sql.lower()
        assert ast.to_tree() == expected_ast.to_tree()
        assert str(ast) == str(expected_ast)
    def test_operation_converts_to_lowercase(self, dialect):
        sql = f'SELECT column1 IS column2 FROM tab'
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[
            BinaryOperation(op='is',
                            args=(Identifier.from_path_str('column1'),
                                  Identifier.from_path_str('column2'))),
        ],
                              from_table=Identifier.from_path_str('tab'))

        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
    def test_between(self, dialect):
        sql = "SELECT col1 FROM t1 WHERE col1 BETWEEN a AND b"
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(
            targets=[Identifier.from_path_str("col1")],
            from_table=Identifier.from_path_str('t1'),
            where=BetweenOperation(args=(Identifier.from_path_str('col1'),
                                         Identifier.from_path_str('a'),
                                         Identifier.from_path_str('b'))))

        assert str(ast).lower() == sql.lower()
        assert ast.to_tree() == expected_ast.to_tree()
        assert str(ast) == str(expected_ast)
    def test_select_from_engines(self, dialect):
        sql = 'select * from engines'
        ast = parse_sql(sql, dialect=dialect)
        expected_ast = Select(targets=[Star()],
                              from_table=Identifier.from_path_str('engines'))
        assert ast.to_tree() == expected_ast.to_tree()
        assert str(ast) == str(expected_ast)
    def test_select_dquote_alias(self, dialect):
        sql = """
            select
              a as "database"      
            from information_schema.tables "database"
        """
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(
            targets=[Identifier('a', alias=Identifier('database'))],
            from_table=Identifier(parts=['information_schema', 'tables'],
                                  alias=Identifier('database')),
        )

        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
    def test_select_function_one_arg(self, dialect):
        funcs = ['sum', 'min', 'max', 'some_custom_function']
        for func in funcs:
            sql = f'SELECT {func}(column) FROM tab'
            ast = parse_sql(sql, dialect=dialect)

            expected_ast = Select(
                targets=[
                    Function(op=func,
                             args=(Identifier.from_path_str('column'), ))
                ],
                from_table=Identifier.from_path_str('tab'),
            )

            assert str(ast).lower() == sql.lower()
            assert str(ast) == str(expected_ast)
            assert ast.to_tree() == expected_ast.to_tree()
Example 11
    def plan_join_two_tables(self, join):
        select_left_step = self.plan_integration_select(
            Select(targets=[Star()], from_table=join.left))
        select_right_step = self.plan_integration_select(
            Select(targets=[Star()], from_table=join.right))

        left_integration_name, left_table = self.get_integration_path_from_identifier_or_error(
            join.left)
        right_integration_name, right_table = self.get_integration_path_from_identifier_or_error(
            join.right)

        left_table_path = left_table.to_string(alias=False)
        right_table_path = right_table.to_string(alias=False)

        new_condition_args = []
        for arg in join.condition.args:
            if isinstance(arg, Identifier):
                if left_table_path in arg.parts:
                    new_condition_args.append(
                        disambiguate_integration_column_identifier(
                            arg, left_integration_name, left_table))
                elif right_table_path in arg.parts:
                    new_condition_args.append(
                        disambiguate_integration_column_identifier(
                            arg, right_integration_name, right_table))
                else:
                    raise PlanningException(
                        f'Wrong table or no source table in join condition for column: {str(arg)}'
                    )
            else:
                new_condition_args.append(arg)
        new_join = copy.deepcopy(join)
        new_join.condition.args = new_condition_args
        new_join.left = Identifier(left_table_path, alias=left_table.alias)
        new_join.right = Identifier(right_table_path, alias=right_table.alias)

        # FIXME: INFORMATION_SCHEMA with condition
        # clear join condition for INFORMATION_SCHEMA
        if right_integration_name == 'INFORMATION_SCHEMA':
            new_join.condition = None

        return self.plan.add_step(
            JoinStep(left=select_left_step.result,
                     right=select_right_step.result,
                     query=new_join))
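The condition rewrite above is the Example 1 helper applied to each side of the ON clause. A hedged walk-through with illustrative names:

from mindsdb_sql.parser.ast import BinaryOperation, Identifier

left_table = Identifier(parts=['tab1'])
right_table = Identifier(parts=['tab2'])

# ON int1.tab1.col1 = int2.tab2.col2
condition = BinaryOperation(op='=', args=[
    Identifier(parts=['int1', 'tab1', 'col1']),
    Identifier(parts=['int2', 'tab2', 'col2']),
])

# each argument is rewritten against its own integration and table
new_args = [
    disambiguate_integration_column_identifier(condition.args[0], 'int1', left_table),
    disambiguate_integration_column_identifier(condition.args[1], 'int2', right_table),
]
assert [a.parts for a in new_args] == [['tab1', 'col1'], ['tab2', 'col2']]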
Example 12
    def test_show_index(self):
        sql = "SHOW INDEX FROM predictors"
        ast = parse_sql(sql, dialect='mysql')
        expected_ast = Show(category='INDEX',
                            from_table=Identifier('predictors'))

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
    def test_select_from_view_kw(self, dialect):
        for table in ['view.t', 'views.t']:
            sql = f'select * from {table}'

            ast = parse_sql(sql, dialect=dialect)
            expected_ast = Select(targets=[Star()],
                                  from_table=Identifier.from_path_str(table))
            assert ast.to_tree() == expected_ast.to_tree()
            assert str(ast) == str(expected_ast)
    def test_operator_chained_and(self, dialect):
        sql = f"""SELECT column1 AND column2 AND column3"""
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[
            BinaryOperation(op='AND',
                            args=(
                                BinaryOperation(
                                    op='and',
                                    args=(
                                        Identifier.from_path_str("column1"),
                                        Identifier.from_path_str("column2"))),
                                Identifier.from_path_str("column3"),
                            ))
        ])

        assert str(ast).lower() == sql.lower()
        assert ast.to_tree() == expected_ast.to_tree()
    def test_select_binary_operations(self, dialect):
        for op in [
                '+', '-', '/', '*', '%', '=', '!=', '>', '<', '>=', '<=', 'is',
                'IS NOT', 'like', 'in', 'and', 'or', '||'
        ]:
            sql = f'SELECT column1 {op.upper()} column2 FROM tab'
            ast = parse_sql(sql, dialect=dialect)

            expected_ast = Select(targets=[
                BinaryOperation(op=op,
                                args=(Identifier.from_path_str('column1'),
                                      Identifier.from_path_str('column2'))),
            ],
                                  from_table=Identifier.from_path_str('tab'))

            assert str(ast).lower() == sql.lower()
            assert str(ast) == str(expected_ast)
            assert ast.to_tree() == expected_ast.to_tree()
Example 16
def disambiguate_predictor_column_identifier(identifier, predictor):
    """Removes integration name from column if it's present, adds table path if it's absent"""
    table_ref = predictor.alias.parts_to_str(
    ) if predictor.alias else predictor.parts_to_str()
    parts = list(identifier.parts)
    if parts[0] == table_ref:
        parts = parts[1:]

    new_identifier = Identifier(parts=parts)
    return new_identifier
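A minimal behavior sketch (same assumed AST classes; names illustrative):

from mindsdb_sql.parser.ast import Identifier

predictor = Identifier(parts=['mindsdb', 'pred'], alias=Identifier('p'))

# 'p.price' -> 'price': the alias prefix is dropped
assert disambiguate_predictor_column_identifier(
    Identifier(parts=['p', 'price']), predictor).parts == ['price']

# an unqualified column passes through unchanged
assert disambiguate_predictor_column_identifier(
    Identifier(parts=['price']), predictor).parts == ['price']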
    def test_operator_precedence_sum_mult_parentheses(self, dialect):
        sql = f'SELECT (column1 + column2) * column3'
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[
            BinaryOperation(
                op='*',
                args=(
                    BinaryOperation(op='+',
                                    args=(Identifier.from_path_str('column1'),
                                          Identifier.from_path_str('column2')),
                                    parentheses=True),
                    Identifier.from_path_str('column3'),
                ),
            )
        ])

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
    def test_select_function_no_args(self, dialect):
        sql = f'SELECT database() FROM tab'
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(
            targets=[Function(op='database', args=[])],
            from_table=Identifier.from_path_str('tab'),
        )

        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
        assert ast.to_tree() == expected_ast.to_tree()
Example 19
    def plan_fetch_timeseries_partitions(self, query, table,
                                         predictor_group_by_names):
        targets = [Identifier(column) for column in predictor_group_by_names]

        query = Select(
            distinct=True,
            targets=targets,
            from_table=table,
            where=query.where,
        )
        select_step = self.plan_integration_select(query)
        return select_step
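The query built here is a DISTINCT projection of the grouping columns that reuses the outer query's WHERE; the planner later substitutes each returned value into a $var placeholder (see Example 28). Roughly, with illustrative names:

from mindsdb_sql.parser.ast import Identifier, Select

partitions_query = Select(
    distinct=True,
    targets=[Identifier('state')],  # one target per GROUP BY column
    from_table=Identifier(parts=['test_data', 'covid']),
)
# roughly: SELECT DISTINCT state FROM test_data.covid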
Example 20
def add_order_not_null(condition):
    order_field_not_null = BinaryOperation(
        op='is not',
        args=[
            Identifier(parts=[predictor_time_column_name]),
            NullConstant()
        ])
    if condition is not None:
        condition = BinaryOperation(
            op='and', args=[condition, order_field_not_null])
    else:
        condition = order_field_not_null
    return condition
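In context (see Example 28) this helper is nested inside plan_timeseries_predictor, and predictor_time_column_name comes from that enclosing scope. A hedged sketch of the conditions it builds:

from mindsdb_sql.parser.ast import (BinaryOperation, Constant, Identifier,
                                    NullConstant)

predictor_time_column_name = 'time'  # illustrative; normally from the enclosing scope

# roughly: state = 'CA' AND time IS NOT NULL
cond = add_order_not_null(
    BinaryOperation(op='=', args=[Identifier('state'), Constant('CA')]))

# with no prior condition, only the null check remains: time IS NOT NULL
cond = add_order_not_null(None)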
Example 21
    def test_select_variable_complex(self):
        sql = f"""SELECT * FROM tab1 WHERE column1 in (SELECT column2 + @variable FROM t2)"""
        ast = parse_sql(sql, dialect='mysql')
        expected_ast = Select(targets=[Star()],
                              from_table=Identifier('tab1'),
                              where=BinaryOperation(
                                  op='in',
                                  args=(Identifier('column1'),
                                        Select(targets=[
                                            BinaryOperation(
                                                op='+',
                                                args=[
                                                    Identifier('column2'),
                                                    Variable('variable')
                                                ])
                                        ],
                                               from_table=Identifier('t2'),
                                               parentheses=True))))

        assert ast.to_tree() == expected_ast.to_tree()
        assert str(ast).lower() == sql.lower()
        assert str(ast) == str(expected_ast)
    def test_unary_is_special_values(self, dialect):
        args = [('NULL', NullConstant()), ('TRUE', Constant(value=True)),
                ('FALSE', Constant(value=False))]
        for sql_arg, python_obj in args:
            sql = f"""SELECT column1 IS {sql_arg}"""
            ast = parse_sql(sql, dialect=dialect)

            expected_ast = Select(targets=[
                BinaryOperation(op='IS',
                                args=(Identifier.from_path_str("column1"),
                                      python_obj))
            ], )

            assert str(ast).lower() == sql.lower()
            assert ast.to_tree() == expected_ast.to_tree()
    def test_select_in_operation(self, dialect):
        sql = """SELECT * FROM t1 WHERE col1 IN ("a", "b")"""

        ast = parse_sql(sql, dialect=dialect)

        assert isinstance(ast, Select)
        assert ast.where

        expected_where = BinaryOperation(
            op='IN',
            args=[
                Identifier.from_path_str('col1'),
                Tuple(items=[Constant('a'), Constant("b")]),
            ])

        assert ast.where.to_tree() == expected_where.to_tree()
        assert ast.where == expected_where
Example 24
    def plan_project(self, query, dataframe, ignore_doubles=False):
        out_identifiers = []

        for target in query.targets:
            if isinstance(target, Identifier) \
                    or isinstance(target, Star) \
                    or isinstance(target, Function) \
                    or isinstance(target, Constant):
                out_identifiers.append(target)
            else:
                new_identifier = Identifier(str(target.to_string(alias=False)),
                                            alias=target.alias)
                out_identifiers.append(new_identifier)
        return self.plan.add_step(
            ProjectStep(dataframe=dataframe,
                        columns=out_identifiers,
                        ignore_doubles=ignore_doubles))
    def test_operator_precedence_or_and(self, dialect):
        sql = f'SELECT column1 OR column2 AND column3'
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[
            BinaryOperation(op='or',
                            args=(
                                Identifier.from_path_str('column1'),
                                BinaryOperation(
                                    op='and',
                                    args=(
                                        Identifier.from_path_str('column2'),
                                        Identifier.from_path_str('column3')))))
        ])

        assert str(ast).lower() == sql.lower()
        assert ast == expected_ast
        assert ast.to_tree() == expected_ast.to_tree()

        sql = f'SELECT column1 AND column2 OR column3'
        ast = parse_sql(sql, dialect=dialect)

        expected_ast = Select(targets=[
            BinaryOperation(op='or',
                            args=(
                                BinaryOperation(
                                    op='and',
                                    args=(
                                        Identifier.from_path_str('column1'),
                                        Identifier.from_path_str('column2'))),
                                Identifier.from_path_str('column3'),
                            ))
        ])

        assert str(ast).lower() == sql.lower()
        assert ast == expected_ast
        assert ast.to_tree() == expected_ast.to_tree()
Example 26
    def plan_join(self, query, integration=None):
        join = query.from_table
        join_left = join.left
        join_right = join.right

        if isinstance(join_left, Select):
            # dbt query.
            # TODO support complex query. Only one table is supported at the moment.
            if not isinstance(join_left.from_table, Identifier):
                raise PlanningException(
                    f'Statement not supported: {query.to_string()}')

            # move properties to upper query
            query = join_left

            if query.from_table.alias is not None:
                table_alias = [query.from_table.alias.parts[0]]
            else:
                table_alias = query.from_table.parts

            def add_aliases(node, is_table, **kwargs):
                if not is_table and isinstance(node, Identifier):
                    if len(node.parts) == 1:
                        # add table alias to field
                        node.parts = table_alias + node.parts

            query_traversal(query.where, add_aliases)

            if isinstance(query.from_table, Identifier):
                # DBT workaround: allow using tables without an integration prefix.
                #   if table.parts[0] is not a known integration, take the integration name from the CREATE TABLE command
                if (integration is not None and query.from_table.parts[0]
                        not in self.integrations):
                    # add integration name to table
                    query.from_table.parts.insert(0, integration)

            join_left = join_left.from_table

        aliased_fields = self.get_aliased_fields(query.targets)

        recursively_check_join_identifiers_for_ambiguity(query.where)
        recursively_check_join_identifiers_for_ambiguity(
            query.group_by, aliased_fields=aliased_fields)
        recursively_check_join_identifiers_for_ambiguity(query.having)
        recursively_check_join_identifiers_for_ambiguity(
            query.order_by, aliased_fields=aliased_fields)

        if isinstance(join_left, Identifier) and isinstance(
                join_right, Identifier):
            if self.is_predictor(join_left) and self.is_predictor(join_right):
                raise PlanningException(
                    f'Can\'t join two predictors {str(join_left.parts[-1])} and {str(join_right.parts[-1])}'
                )

            predictor_namespace = None
            predictor = None
            table = None
            predictor_is_left = False
            if self.is_predictor(join_left):
                predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                    join_left, self.default_namespace)
                predictor_is_left = True
            else:
                table = join_left

            if self.is_predictor(join_right):
                predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                    join_right, self.default_namespace)
            else:
                table = join_right

            last_step = None
            if predictor:
                # One argument is a table, another is a predictor
                # Apply mindsdb model to result of last dataframe fetch
                # Then join results of applying mindsdb with table

                predictor_name = self.predictor_names[predictor.to_string(
                    alias=False).lower()]
                if self.predictor_metadata[predictor_name].get('timeseries'):
                    predictor_steps = self.plan_timeseries_predictor(
                        query, table, predictor_namespace, predictor)
                else:
                    predictor_steps = self.plan_predictor(
                        query, table, predictor_namespace, predictor)

                # add join
                # Update reference
                _, table = self.get_integration_path_from_identifier_or_error(
                    table)
                table_alias = table.alias or Identifier(
                    table.to_string(alias=False).replace('.', '_'))

                left = Identifier(
                    predictor_steps['predictor'].result.ref_name,
                    alias=predictor.alias
                    or Identifier(predictor.to_string(alias=False)))
                right = Identifier(predictor_steps['data'].result.ref_name,
                                   alias=table_alias)

                if not predictor_is_left:
                    # swap join
                    left, right = right, left
                new_join = Join(left=left,
                                right=right,
                                join_type=join.join_type)

                left = predictor_steps['predictor'].result
                right = predictor_steps['data'].result
                if not predictor_is_left:
                    # swap join
                    left, right = right, left

                last_step = self.plan.add_step(
                    JoinStep(left=left, right=right, query=new_join))

                # limit from timeseries
                if predictor_steps.get('saved_limit'):
                    last_step = self.plan.add_step(
                        LimitOffsetStep(dataframe=last_step.result,
                                        limit=predictor_steps['saved_limit']))

            else:
                # Both arguments are tables, join results of 2 dataframe fetches

                join_step = self.plan_join_two_tables(join)
                last_step = join_step
                if query.where:
                    # FIXME: INFORMATION_SCHEMA with Where
                    right_integration_name, _ = self.get_integration_path_from_identifier_or_error(
                        join.right)
                    if right_integration_name == 'INFORMATION_SCHEMA':
                        ...
                    else:
                        last_step = self.plan.add_step(
                            FilterStep(dataframe=last_step.result,
                                       query=query.where))

                if query.group_by:
                    group_by_targets = []
                    for t in query.targets:
                        target_copy = copy.deepcopy(t)
                        target_copy.alias = None
                        group_by_targets.append(target_copy)
                    last_step = self.plan.add_step(
                        GroupByStep(dataframe=last_step.result,
                                    columns=query.group_by,
                                    targets=group_by_targets))

                if query.having:
                    last_step = self.plan.add_step(
                        FilterStep(dataframe=last_step.result,
                                   query=query.having))

                if query.order_by:
                    last_step = self.plan.add_step(
                        OrderByStep(dataframe=last_step.result,
                                    order_by=query.order_by))

                if query.limit is not None or query.offset is not None:
                    limit = query.limit.value if query.limit is not None else None
                    offset = query.offset.value if query.offset is not None else None
                    last_step = self.plan.add_step(
                        LimitOffsetStep(dataframe=last_step.result,
                                        limit=limit,
                                        offset=offset))

        else:
            raise PlanningException(
                'Join of unsupported objects: currently only tables and predictors can be joined.'
            )
        return self.plan_project(query, last_step.result)
Example 27
    def execute_step(self, step, steps_data):
        if type(step) == GetPredictorColumns:
            predictor_name = step.predictor.parts[-1]
            dn = self.datahub.get(self.mindsdb_database_name)
            columns = dn.get_table_columns(predictor_name)
            columns = [(column_name, column_name) for column_name in columns]
            data = {
                'values': [],
                'columns': {
                    (self.mindsdb_database_name, predictor_name, predictor_name):
                    columns
                },
                'tables':
                [(self.mindsdb_database_name, predictor_name, predictor_name)]
            }
        elif type(step) == GetTableColumns:
            table = step.table
            dn = self.datahub.get(step.namespace)
            ds_query = Select(from_table=Identifier(table), targets=[Star()])
            dso, _ = dn.data_store.create_datasource(
                dn.integration_name, {'query': ds_query.to_string()})

            columns = dso.get_columns()
            cols = []
            for col in columns:
                if not isinstance(col, dict):
                    col = {'name': col, 'type': 'str'}
                cols.append(col)

            table_alias = (self.database, table, table)

            data = {
                'values': [],
                'columns': {
                    table_alias: cols
                },
                'tables': [table_alias]
            }
        elif type(step) == FetchDataframeStep:
            data = self._fetch_dataframe_step(step)
        elif type(step) == UnionStep:
            raise ErNotSupportedYet('Union step is not implemented')
            # TODO add union support
            # left_data = steps_data[step.left.step_num]
            # right_data = steps_data[step.right.step_num]
            # data = left_data + right_data
        elif type(step) == MapReduceStep:
            try:
                if step.reduce != 'union':
                    raise Exception(
                        f'Unknown MapReduceStep type: {step.reduce}')

                step_data = steps_data[step.values.step_num]
                vars = []
                step_data_values = step_data['values']
                for row in step_data_values:
                    var_group = {}
                    vars.append(var_group)
                    for row_data in row.values():
                        for name, value in row_data.items():
                            if name[0] != '__mindsdb_row_id':
                                var_group[name[1] or name[0]] = value

                data = {'values': [], 'columns': {}, 'tables': []}
                substep = step.step
                if type(substep) == FetchDataframeStep:
                    query = substep.query
                    for var_group in vars:
                        markQueryVar(query.where)
                        for name, value in var_group.items():
                            replaceQueryVar(query.where, value, name)
                        sub_data = self._fetch_dataframe_step(substep)
                        if len(data['columns']) == 0:
                            data['columns'] = sub_data['columns']
                        if len(data['tables']) == 0:
                            data['tables'] = sub_data['tables']
                        data['values'].extend(sub_data['values'])
                        unmarkQueryVar(query.where)
                elif type(substep) == MultipleSteps:
                    data = self._multiple_steps_reduce(substep, vars)
                else:
                    raise Exception(f'Unknown step type: {step.step}')
            except Exception as e:
                raise SqlApiException(f'error in map reduce step: {e}') from e
        elif type(step) == MultipleSteps:
            if step.reduce != 'union':
                raise Exception(
                    f"Only MultipleSteps with type = 'union' is supported. Got '{step.type}'"
                )
            data = None
            for substep in step.steps:
                subdata = self.execute_step(substep, steps_data)
                if data is None:
                    data = subdata
                else:
                    data['values'].extend(subdata['values'])
        elif type(step) == ApplyPredictorRowStep:
            try:
                predictor = '.'.join(step.predictor.parts)
                dn = self.datahub.get(self.mindsdb_database_name)
                where_data = step.row_dict

                data = dn.select(
                    table=predictor,
                    columns=None,
                    where_data=where_data,
                    integration_name=self.session.integration,
                    integration_type=self.session.integration_type)

                data = [{(key, key): value
                         for key, value in row.items()} for row in data]

                table_name = get_preditor_alias(step, self.database)
                values = [{table_name: x} for x in data]
                columns = {table_name: []}
                if len(data) > 0:
                    row = data[0]
                    columns[table_name] = list(row.keys())
                # TODO else

                data = {
                    'values': values,
                    'columns': columns,
                    'tables': [table_name]
                }
            except Exception as e:
                raise SqlApiException(
                    f'error in apply predictor row step: {e}') from e
        elif type(step) in (ApplyPredictorStep, ApplyTimeseriesPredictorStep):
            try:
                dn = self.datahub.get(self.mindsdb_database_name)
                predictor = '.'.join(step.predictor.parts)
                where_data = []
                for row in steps_data[step.dataframe.step_num]['values']:
                    new_row = {}
                    for table_name in row:
                        keys_intersection = set(new_row) & set(row[table_name])
                        if len(keys_intersection) > 0:
                            raise Exception(
                                f'The predictor got two identical keys from different datasources: {keys_intersection}'
                            )
                        new_row.update(row[table_name])
                    where_data.append(new_row)

                where_data = [{key[1]: value
                               for key, value in row.items()}
                              for row in where_data]

                is_timeseries = self.planner.predictor_metadata[predictor][
                    'timeseries']
                _mdb_make_predictions = None
                if is_timeseries:
                    if 'LATEST' in self.query_str:
                        _mdb_make_predictions = False
                    else:
                        _mdb_make_predictions = True
                    for row in where_data:
                        if '__mdb_make_predictions' not in row:
                            row['__mdb_make_predictions'] = _mdb_make_predictions

                for row in where_data:
                    for key in row:
                        if isinstance(row[key], datetime.date):
                            row[key] = str(row[key])

                data = dn.select(
                    table=predictor,
                    columns=None,
                    where_data=where_data,
                    integration_name=self.session.integration,
                    integration_type=self.session.integration_type)

                # if is_timeseries:
                #     if 'LATEST' not in self.raw:
                #         # remove additional records from predictor results:
                #         # first 'window_size' and last 'horizon' records
                #         # otherwise there are many unexpected rows in the prediction result:
                #         # ----------------------------------------------------------------------------------------
                #         # mysql> SELECT tb.time, tb.state, tb.pnew_case, tb.new_case from
                #         # MYSQL_LOCAL.test_data.covid AS
                #         # ta JOIN mindsdb.covid_hor3 AS tb
                #         # WHERE ta.state = "CA" AND ta.time BETWEEN "2020-10-19" AND "2020-10-20";
                #         # ----------------------------------------------------------------------------------------
                #         # +------------+-------+-----------+----------+
                #         # | time       | state | pnew_case | new_case |
                #         # +------------+-------+-----------+----------+
                #         # | 2020-10-09 | CA    | 0         | 2862     |
                #         # | 2020-10-10 | CA    | 0         | 2979     |
                #         # | 2020-10-11 | CA    | 0         | 3075     |
                #         # | 2020-10-12 | CA    | 0         | 3329     |
                #         # | 2020-10-13 | CA    | 0         | 2666     |
                #         # | 2020-10-14 | CA    | 0         | 2378     |
                #         # | 2020-10-15 | CA    | 0         | 3449     |
                #         # | 2020-10-16 | CA    | 0         | 3803     |
                #         # | 2020-10-17 | CA    | 0         | 4170     |
                #         # | 2020-10-18 | CA    | 0         | 3806     |
                #         # | 2020-10-19 | CA    | 0         | 3286     |
                #         # | 2020-10-20 | CA    | 0         | 3474     |
                #         # | 2020-10-21 | CA    | 0         | 3474     |
                #         # | 2020-10-22 | CA    | 0         | 3474     |
                #         # +------------+-------+-----------+----------+
                #         # 14 rows in set (2.52 sec)

                #         window_size = predictor_metadata[predictor]['window']
                #         horizon = predictor_metadata[predictor]['horizon']
                #         if len(data) >= (window_size + horizon):
                #             data = data[window_size:]
                #             if len(data) > horizon and horizon > 1:
                #                 data = data[:-horizon + 1]
                data = [{(key, key): value
                         for key, value in row.items()} for row in data]

                table_name = get_preditor_alias(step, self.database)
                values = [{table_name: x} for x in data]
                columns = {table_name: []}
                if len(data) > 0:
                    row = data[0]
                    columns[table_name] = list(row.keys())
                # TODO else

                data = {
                    'values': values,
                    'columns': columns,
                    'tables': [table_name]
                }
            except Exception as e:
                raise SqlApiException(
                    f'error in apply predictor step: {e}') from e
        elif type(step) == JoinStep:
            try:
                left_data = steps_data[step.left.step_num]
                right_data = steps_data[step.right.step_num]

                # FIXME https://github.com/mindsdb/mindsdb_sql/issues/136
                # is_timeseries = False
                # if True in [type(step) == ApplyTimeseriesPredictorStep for step in plan.steps]:
                #     right_data = steps_data[step.left.step_num]
                #     left_data = steps_data[step.right.step_num]
                #     is_timeseries = True

                if step.query.condition is not None:
                    raise Exception(
                        'Only JOIN without a condition is supported at the moment')
                if step.query.join_type.upper() not in ('LEFT JOIN', 'JOIN'):
                    raise Exception(
                        'Only JOIN and LEFT JOIN are supported at the moment')
                if (len(left_data['tables']) != 1
                        or len(right_data['tables']) != 1
                        or left_data['tables'][0] == right_data['tables'][0]):
                    raise Exception(
                        'Only a JOIN of two different tables is supported at the moment'
                    )

                data = {
                    'values': [],
                    'columns': {},
                    'tables':
                    list(set(left_data['tables'] + right_data['tables']))
                }

                for data_part in [left_data, right_data]:
                    for table_name in data_part['columns']:
                        if table_name not in data['columns']:
                            data['columns'][table_name] = data_part['columns'][
                                table_name]
                        else:
                            data['columns'][table_name].extend(
                                data_part['columns'][table_name])
                for table_name in data['columns']:
                    data['columns'][table_name] = list(
                        set(data['columns'][table_name]))

                left_key = left_data['tables'][0]
                right_key = right_data['tables'][0]

                left_columns_map = {}
                left_columns_map_reverse = {}
                for i, column_name in enumerate(
                        left_data['columns'][left_key]):
                    left_columns_map[f'a{i}'] = column_name
                    left_columns_map_reverse[column_name] = f'a{i}'

                right_columns_map = {}
                right_columns_map_reverse = {}
                for i, column_name in enumerate(
                        right_data['columns'][right_key]):
                    right_columns_map[f'b{i}'] = column_name
                    right_columns_map_reverse[column_name] = f'b{i}'

                left_df_data = []
                for row in left_data['values']:
                    row = row[left_key]
                    left_df_data.append({
                        left_columns_map_reverse[key]: value
                        for key, value in row.items()
                    })

                right_df_data = []
                for row in right_data['values']:
                    row = row[right_key]
                    right_df_data.append({
                        right_columns_map_reverse[key]: value
                        for key, value in row.items()
                    })

                df_a = pd.DataFrame(left_df_data)
                df_b = pd.DataFrame(right_df_data)

                a_name = f'a{round(time.time() * 1000)}'
                b_name = f'b{round(time.time() * 1000)}'
                con = duckdb.connect(database=':memory:')
                con.register(a_name, df_a)
                con.register(b_name, df_b)
                resp_df = con.execute(f"""
                    SELECT * FROM {a_name} as ta full join {b_name} as tb
                    ON ta.{left_columns_map_reverse[('__mindsdb_row_id', '__mindsdb_row_id')]}
                     = tb.{right_columns_map_reverse[('__mindsdb_row_id', '__mindsdb_row_id')]}
                """).fetchdf()
                con.unregister(a_name)
                con.unregister(b_name)
                con.close()
                resp_df = resp_df.where(pd.notnull(resp_df), None)
                resp_dict = resp_df.to_dict(orient='records')

                for row in resp_dict:
                    new_row = {left_key: {}, right_key: {}}
                    for key, value in row.items():
                        if key.startswith('a'):
                            new_row[left_key][left_columns_map[key]] = value
                        else:
                            new_row[right_key][right_columns_map[key]] = value
                    data['values'].append(new_row)

                # remove all records with empty predictor data from the join result,
                # otherwise there are empty records in the final result:
                # +------------+------------+-------+-----------+----------+
                # | time       | time       | state | pnew_case | new_case |
                # +------------+------------+-------+-----------+----------+
                # | 2020-10-21 | 2020-10-24 | CA    | 0.0       | 5945.0   |
                # | 2020-10-22 | 2020-10-23 | CA    | 0.0       | 6141.0   |
                # | 2020-10-23 | 2020-10-22 | CA    | 0.0       | 2940.0   |
                # | 2020-10-24 | 2020-10-21 | CA    | 0.0       | 3707.0   |
                # | NULL       | 2020-10-20 | NULL  | nan       | nan      |
                # | NULL       | 2020-10-19 | NULL  | nan       | nan      |
                # | NULL       | 2020-10-18 | NULL  | nan       | nan      |
                # | NULL       | 2020-10-17 | NULL  | nan       | nan      |
                # | NULL       | 2020-10-16 | NULL  | nan       | nan      |
                # +------------+------------+-------+-----------+----------+
                # 9 rows in set (2.07 sec)

                # if is_timeseries:
                #     data_values = []
                #     for row in data['values']:
                #         for key in row:
                #             if 'mindsdb' in key:
                #                 if not is_empty_prediction_row(row[key]):
                #                     data_values.append(row)
                #                     break
                #     data['values'] = data_values
            except Exception as e:
                raise SqlApiException(f'error in join step: {e}') from e

        elif type(step) == FilterStep:
            raise ErNotSupportedYet('FilterStep is not implemented')
        # elif type(step) == ApplyTimeseriesPredictorStep:
        #     raise Exception('ApplyTimeseriesPredictorStep is not implemented')
        elif type(step) == LimitOffsetStep:
            try:
                step_data = steps_data[step.dataframe.step_num]
                data = {
                    'values': step_data['values'].copy(),
                    'columns': step_data['columns'].copy(),
                    'tables': step_data['tables'].copy()
                }
                if isinstance(step.offset, Constant) and isinstance(
                        step.offset.value, int):
                    data['values'] = data['values'][step.offset.value:]
                if isinstance(step.limit, Constant) and isinstance(
                        step.limit.value, int):
                    data['values'] = data['values'][:step.limit.value]
            except Exception as e:
                raise SqlApiException(
                    f'error in limit offset step: {e}') from e
        elif type(step) == ProjectStep:
            try:
                step_data = steps_data[step.dataframe.step_num]
                columns_list = []
                for column_identifier in step.columns:
                    table_name = None
                    if type(column_identifier) == Star:
                        for table_name, table_columns_list in step_data[
                                'columns'].items():
                            for column in table_columns_list:
                                columns_list.append(
                                    Column(database=table_name[0],
                                           table_name=table_name[1],
                                           table_alias=table_name[2],
                                           name=column[0],
                                           alias=column[1]))
                    elif type(column_identifier) == Identifier:
                        column_name_parts = column_identifier.parts
                        column_alias = None if column_identifier.alias is None else '.'.join(
                            column_identifier.alias.parts)
                        if len(column_name_parts) > 2:
                            raise Exception(
                                f'Column name must contain no more than 2 parts. Got name: {column_identifier}'
                            )
                        elif len(column_name_parts) == 1:
                            column_name = column_name_parts[0]

                            appropriate_table = None
                            if len(step_data['tables']) == 1:
                                appropriate_table = step_data['tables'][0]
                            else:
                                for table_name, table_columns in step_data[
                                        'columns'].items():
                                    table_column_names_list = [
                                        x[1] or x[0] for x in table_columns
                                    ]
                                    column_exists = get_column_in_case(
                                        table_column_names_list, column_name)
                                    if column_exists:
                                        if appropriate_table is not None:
                                            raise Exception(
                                                f'Found multiple appropriate tables for column {column_name}'
                                            )
                                        else:
                                            appropriate_table = table_name
                            if appropriate_table is None:
                                # it is probably a constant
                                # FIXME https://github.com/mindsdb/mindsdb_sql/issues/133
                                # column_name = column_name.strip("'")
                                # name_or_alias = column_alias or column_name
                                # column_alias = name_or_alias
                                # for row in step_data['values']:
                                #     for table in row:
                                #         row[table][(column_name, name_or_alias)] = row[table][(column_name, column_name)]
                                # appropriate_table = step_data['tables'][0]
                                # FIXME: must be exception
                                columns_list.append(
                                    Column(database=appropriate_table[0],
                                           table_name=appropriate_table[1],
                                           table_alias=appropriate_table[2],
                                           name=column_alias))
                            else:
                                columns_list.append(
                                    Column(database=appropriate_table[0],
                                           table_name=appropriate_table[1],
                                           table_alias=appropriate_table[2],
                                           name=column_name,
                                           alias=column_alias))  # column_name
                        elif len(column_name_parts) == 2:
                            table_name_or_alias = column_name_parts[0]
                            column_name = column_name_parts[1]

                            appropriate_table = None
                            for table_name, table_columns in step_data[
                                    'columns'].items():
                                table_column_names_list = [
                                    x[1] or x[0] for x in table_columns
                                ]
                                checking_table_name_or_alias = table_name[
                                    2] or table_name[1]
                                if table_name_or_alias.lower(
                                ) == checking_table_name_or_alias.lower():
                                    column_exists = get_column_in_case(
                                        table_column_names_list, column_name)
                                    if column_exists:
                                        appropriate_table = table_name
                                        break
                                    else:
                                        raise Exception(
                                            f'Can not find column "{column_name}" in table "{table_name}"'
                                        )
                            if appropriate_table is None:
                                raise Exception(
                                    f'Cannot find an appropriate table for column {column_name}'
                                )

                            columns_to_copy = None
                            table_column_names_list = [
                                x[1] or x[0] for x in table_columns
                            ]
                            checking_name = get_column_in_case(
                                table_column_names_list, column_name)
                            for column in step_data['columns'][
                                    appropriate_table]:
                                if column[0] == checking_name and (
                                        column[1] is None
                                        or column[1] == checking_name):
                                    columns_to_copy = column
                                    break
                            else:
                                raise Exception(
                                    f'Cannot find an appropriate column in data: {(column_name, column_alias)}'
                                )

                            for row in step_data['values']:
                                row[appropriate_table][(
                                    column_name, column_alias
                                )] = row[appropriate_table][columns_to_copy]

                            columns_list.append(
                                Column(database=appropriate_table[0],
                                       table_name=appropriate_table[1],
                                       table_alias=appropriate_table[2],
                                       name=column_name,
                                       alias=column_alias))
                        else:
                            raise Exception('Undefined column name')
                    else:
                        raise Exception(
                            f'Unexpected column name type: {column_identifier}'
                        )

                self.columns_list = columns_list
                data = step_data
            except Exception as e:
                raise SqlApiException(f'error in project step: {e}') from e
        else:
            raise SqlApiException(f'Unknown planner step: {step}')
        return data
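The JoinStep branch above delegates the actual FULL JOIN on __mindsdb_row_id to an in-memory DuckDB connection after materializing both sides as pandas DataFrames. A minimal standalone sketch of that pattern (illustrative table and column names):

import duckdb
import pandas as pd

df_a = pd.DataFrame([{'row_id': 1, 'x': 10}, {'row_id': 2, 'x': 20}])
df_b = pd.DataFrame([{'row_id': 2, 'y': 'b'}, {'row_id': 3, 'y': 'c'}])

con = duckdb.connect(database=':memory:')
con.register('ta', df_a)  # expose the DataFrames as DuckDB views
con.register('tb', df_b)
resp_df = con.execute(
    'SELECT * FROM ta FULL JOIN tb ON ta.row_id = tb.row_id').fetchdf()
con.close()

# NaN -> None, mirroring the cleanup in the step above
resp_df = resp_df.where(pd.notnull(resp_df), None)
print(resp_df.to_dict(orient='records'))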
Example 28
    def plan_timeseries_predictor(self, query, table, predictor_namespace,
                                  predictor):
        predictor_name = predictor.to_string(alias=False).lower()
        # to original case
        predictor_name = self.predictor_names[predictor_name]

        predictor_time_column_name = self.predictor_metadata[predictor_name][
            'order_by_column']
        predictor_group_by_names = self.predictor_metadata[predictor_name][
            'group_by_columns']
        if predictor_group_by_names is None:
            predictor_group_by_names = []
        predictor_window = self.predictor_metadata[predictor_name]['window']

        if query.order_by:
            raise PlanningException(
                f'Can\'t provide ORDER BY to time series predictor, it will be taken from predictor settings. Found: {query.order_by}'
            )

        saved_limit = query.limit

        if query.group_by or query.having or query.offset:
            raise PlanningException(
                f'Unsupported query to timeseries predictor: {str(query)}')

        allowed_columns = [predictor_time_column_name.lower()]
        if len(predictor_group_by_names) > 0:
            allowed_columns += [i.lower() for i in predictor_group_by_names]
        validate_ts_where_condition(query.where,
                                    allowed_columns=allowed_columns)

        time_filter = find_time_filter(
            query.where, time_column_name=predictor_time_column_name)

        order_by = [
            OrderBy(Identifier(parts=[predictor_time_column_name]),
                    direction='DESC')
        ]

        preparation_where = copy.deepcopy(query.where)

        # add {order_by_field} is not null
        def add_order_not_null(condition):
            order_field_not_null = BinaryOperation(
                op='is not',
                args=[
                    Identifier(parts=[predictor_time_column_name]),
                    NullConstant()
                ])
            if condition is not None:
                condition = BinaryOperation(
                    op='and', args=[condition, order_field_not_null])
            else:
                condition = order_field_not_null
            return condition

        preparation_where2 = copy.deepcopy(preparation_where)
        preparation_where = add_order_not_null(preparation_where)

        # Obtain integration selects
        if isinstance(time_filter, BetweenOperation):
            between_from = time_filter.args[1]
            preparation_time_filter = BinaryOperation(
                '<',
                args=[Identifier(predictor_time_column_name), between_from])
            preparation_where2 = replace_time_filter(preparation_where2,
                                                     time_filter,
                                                     preparation_time_filter)
            integration_select_1 = Select(
                targets=[Star()],
                from_table=table,
                where=add_order_not_null(preparation_where2),
                order_by=order_by,
                limit=Constant(predictor_window))

            integration_select_2 = Select(targets=[Star()],
                                          from_table=table,
                                          where=preparation_where,
                                          order_by=order_by)

            integration_selects = [integration_select_1, integration_select_2]
        elif isinstance(
                time_filter, BinaryOperation
        ) and time_filter.op == '>' and time_filter.args[1] == Latest():
            integration_select = Select(
                targets=[Star()],
                from_table=table,
                where=preparation_where,
                order_by=order_by,
                limit=Constant(predictor_window),
            )
            integration_select.where = find_and_remove_time_filter(
                integration_select.where, time_filter)
            integration_selects = [integration_select]

        elif isinstance(time_filter,
                        BinaryOperation) and time_filter.op in ('>', '>='):
            time_filter_date = time_filter.args[1]
            preparation_time_filter_op = {'>': '<=', '>=': '<'}[time_filter.op]

            preparation_time_filter = BinaryOperation(
                preparation_time_filter_op,
                args=[
                    Identifier(predictor_time_column_name), time_filter_date
                ])
            preparation_where2 = replace_time_filter(preparation_where2,
                                                     time_filter,
                                                     preparation_time_filter)
            integration_select_1 = Select(
                targets=[Star()],
                from_table=table,
                where=add_order_not_null(preparation_where2),
                order_by=order_by,
                limit=Constant(predictor_window))

            integration_select_2 = Select(targets=[Star()],
                                          from_table=table,
                                          where=preparation_where,
                                          order_by=order_by)

            integration_selects = [integration_select_1, integration_select_2]
        else:
            integration_select = Select(
                targets=[Star()],
                from_table=table,
                where=preparation_where,
                order_by=order_by,
            )
            integration_selects = [integration_select]

        if len(predictor_group_by_names) == 0:
            # ts query without grouping
            # one or multistep
            if len(integration_selects) == 1:
                select_partition_step = self.get_integration_select_step(
                    integration_selects[0])
            else:
                select_partition_step = MultipleSteps(steps=[
                    self.get_integration_select_step(s)
                    for s in integration_selects
                ],
                                                      reduce='union')

            # fetch data step
            data_step = self.plan.add_step(select_partition_step)
        else:
            # inject $var to queries
            for integration_select in integration_selects:
                condition = integration_select.where
                for num, column in enumerate(predictor_group_by_names):
                    cond = BinaryOperation(
                        '=',
                        args=[Identifier(column),
                              Constant(f'$var[{column}]')])

                    # join to main condition
                    if condition is None:
                        condition = cond
                    else:
                        condition = BinaryOperation('and',
                                                    args=[condition, cond])

                integration_select.where = condition
            # one or multistep
            if len(integration_selects) == 1:
                select_partition_step = self.get_integration_select_step(
                    integration_selects[0])
            else:
                select_partition_step = MultipleSteps(steps=[
                    self.get_integration_select_step(s)
                    for s in integration_selects
                ],
                                                      reduce='union')

            # get groping values
            no_time_filter_query = copy.deepcopy(query)
            no_time_filter_query.where = find_and_remove_time_filter(
                no_time_filter_query.where, time_filter)
            select_partitions_step = self.plan_fetch_timeseries_partitions(
                no_time_filter_query, table, predictor_group_by_names)

            # sub-query by every grouping value
            map_reduce_step = self.plan.add_step(
                MapReduceStep(values=select_partitions_step.result,
                              reduce='union',
                              step=select_partition_step))
            data_step = map_reduce_step

        predictor_step = self.plan.add_step(
            ApplyTimeseriesPredictorStep(
                output_time_filter=time_filter,
                namespace=predictor_namespace,
                dataframe=data_step.result,
                predictor=predictor,
            ))

        return {
            'predictor': predictor_step,
            'data': data_step,
            'saved_limit': saved_limit,
        }
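For intuition, the '> LATEST' branch above reduces to a single integration select: the newest 'window' rows, newest first, with null order values excluded. A hedged sketch of that query (illustrative table and column names):

from mindsdb_sql.parser.ast import (BinaryOperation, Constant, Identifier,
                                    NullConstant, OrderBy, Select, Star)

predictor_window = 10  # illustrative
q = Select(
    targets=[Star()],
    from_table=Identifier(parts=['test_data', 'covid']),
    where=BinaryOperation(op='is not',
                          args=[Identifier('time'), NullConstant()]),
    order_by=[OrderBy(Identifier('time'), direction='DESC')],
    limit=Constant(predictor_window),
)
# roughly: SELECT * FROM test_data.covid
#          WHERE time IS NOT NULL ORDER BY time DESC LIMIT 10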