Example 1
    def __init__(self,
                 query=None,
                 integrations=None,
                 predictor_namespace=None,
                 predictor_metadata=None,
                 default_namespace=None):
        self.query = query
        self.plan = QueryPlan()

        self.integrations = [integration.lower()
                             for integration in integrations] if integrations else []
        self.predictor_namespace = (predictor_namespace.lower()
                                    if predictor_namespace else 'mindsdb')
        self.predictor_metadata = predictor_metadata or defaultdict(dict)
        self.default_namespace = default_namespace

        # map from lowercase predictor names to their original-case names
        self.predictor_names = {
            k.lower(): k
            for k in self.predictor_metadata.keys()
        }

        # allow selecting from the mindsdb namespace
        # self.integrations.append(self.predictor_namespace)

        self.statement = None
    def test_integration_select_limit_offset(self):
        query = Select(targets=[Identifier('column1')],
                       from_table=Identifier('int.tab'),
                       where=BinaryOperation('=', args=[Identifier('column1'), Identifier('column2')]),
                       limit=Constant(10),
                       offset=Constant(15),
                       )
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Identifier('tab.column1', alias=Identifier('column1'))],
                                                             from_table=Identifier('tab'),
                                                             where=BinaryOperation('=', args=[Identifier('tab.column1'),
                                                                                              Identifier(
                                                                                                  'tab.column2')]),
                                                             limit=Constant(10),
                                                             offset=Constant(15),
                                                             ),
                                                         ),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
Example 3
    def test_select_from_predictor_get_columns(self):
        sql = 'SELECT GDP_per_capita_USD FROM hdi_predictor_external WHERE 1 = 0'
        query = parse_sql(sql, dialect='mindsdb')

        expected_query = Select(
            targets=[Identifier('GDP_per_capita_USD')],
            from_table=Identifier('hdi_predictor_external'),
            where=BinaryOperation(op="=", args=[Constant(1),
                                                Constant(0)]))
        assert query.to_tree() == expected_query.to_tree()

        expected_plan = QueryPlan(
            predictor_namespace='mindsdb',
            default_namespace='mindsdb',
            steps=[
                GetPredictorColumns(
                    namespace='mindsdb',
                    predictor=Identifier('hdi_predictor_external')),
                ProjectStep(dataframe=Result(0),
                            columns=[Identifier('GDP_per_capita_USD')]),
            ],
        )

        plan = plan_query(query,
                          predictor_namespace='mindsdb',
                          default_namespace='mindsdb',
                          predictor_metadata={'hdi_predictor_external': {}})

        assert plan.steps == expected_plan.steps
Example 4
    def test_join_tables_disambiguate_identifiers_in_condition(self):
        query = Select(targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
                       from_table=Join(left=Identifier('int.tab1'),
                                       right=Identifier('int.tab2'),
                                       condition=BinaryOperation(op='=', args=[Identifier('int.tab1.column1'), # integration name included
                                                                               Identifier('tab2.column1')]),
                                       join_type=JoinType.INNER_JOIN
                                       )
                       )
        plan = plan_query(query, integrations=['int'])
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Star()],
                                                             from_table=Identifier('tab1')),
                                                         ),
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Star()],
                                                                      from_table=Identifier('tab2')),
                                                         ),
                                      JoinStep(left=Result(0), right=Result(1),
                                               query=Join(left=Identifier('tab1'),
                                                          right=Identifier('tab2'),
                                                          condition=BinaryOperation(op='=',
                                                                                    args=[Identifier('tab1.column1'), # integration name gets stripped out
                                                                                          Identifier('tab2.column1')]),
                                                          join_type=JoinType.INNER_JOIN
                                                          )),
                                      ProjectStep(dataframe=Result(2),
                                                  columns=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')]),
                                  ],
                                  )

        assert plan.steps == expected_plan.steps
    def test_integration_name_is_case_insensitive(self):
        query = Select(targets=[Identifier('column1')],
                       from_table=Identifier('int.tab'),
                       where=BinaryOperation('and', args=[
                           BinaryOperation('=', args=[Identifier('column1'), Identifier('column2')]),
                           BinaryOperation('>', args=[Identifier('column3'), Constant(0)]),
                       ]))
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Identifier('tab.column1', alias=Identifier('column1'))],
                                                                      from_table=Identifier('tab'),
                                                                      where=BinaryOperation('and', args=[
                                                                              BinaryOperation('=',
                                                                                              args=[Identifier('tab.column1'),
                                                                                                    Identifier('tab.column2')]),
                                                                              BinaryOperation('>',
                                                                                              args=[Identifier('tab.column3'),
                                                                                                    Constant(0)]),
                                                                          ])
                                                                      )),
                                  ])

        plan = plan_query(query, integrations=['INT'])

        assert plan.steps == expected_plan.steps
    def test_integration_select_subquery_in_where(self):
        query = Select(targets=[Star()],
                          from_table=Identifier('int.tab1'),
                          where=BinaryOperation(op='in',
                                                args=(
                                                    Identifier(parts=['column1']),
                                                    Select(targets=[Identifier('column2')],
                                                           from_table=Identifier('int.tab2'),
                                                           parentheses=True)
                                                )))

        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Star()],
                                                                      from_table=Identifier('tab1'),
                                                                      where=BinaryOperation(op='in',
                                                                                            args=[
                                                                                                Identifier('tab1.column1'),
                                                                                                Select(targets=[
                                                                                                    Identifier('tab2.column2', alias=Identifier('column2'))],
                                                                                                       from_table=Identifier('tab2'),
                                                                                                       parentheses=True)]
                                                                                            ))),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
    def test_integration_select_default_namespace(self):
        query = Select(targets=[Identifier('column1'), Constant(1), Function('database', args=[])],
                       from_table=Identifier('tab'),
                       where=BinaryOperation('and', args=[
                           BinaryOperation('=', args=[Identifier('column1'), Identifier('column2')]),
                           BinaryOperation('>', args=[Identifier('column3'), Constant(0)]),
                       ]))

        expected_plan = QueryPlan(integrations=['int'],
                                  default_namespace='int',
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Identifier('tab.column1', alias=Identifier('column1')),
                                                                               Constant(1),
                                                                               Function('database', args=[]),
                                                                               ],
                                                                      from_table=Identifier('tab'),
                                                                      where=BinaryOperation('and', args=[
                                                                              BinaryOperation('=',
                                                                                              args=[Identifier('tab.column1'),
                                                                                                    Identifier('tab.column2')]),
                                                                              BinaryOperation('>',
                                                                                              args=[Identifier('tab.column3'),
                                                                                                    Constant(0)]),
                                                                          ])
                                                                      ),
                                                         step_num=0,
                                                         references=None,
                                                         ),
                                  ])

        plan = plan_query(query, integrations=['int'], default_namespace='int')

        for i in range(len(plan.steps)):
            assert plan.steps[i] == expected_plan.steps[i]
Example 8
    def test_join_tables_plan_default_namespace(self):
        query = Select(targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
                       from_table=Join(left=Identifier('tab1'),
                                       right=Identifier('tab2'),
                                       condition=BinaryOperation(op='=', args=[Identifier('tab1.column1'), Identifier('tab2.column1')]),
                                       join_type=JoinType.INNER_JOIN
                                       )
                )
        expected_plan = QueryPlan(integrations=['int'],
                                  default_namespace='int',
                                  steps = [
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Star()],
                                                             from_table=Identifier('tab1')),
                                                         ),
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Star()],
                                                                      from_table=Identifier('tab2')),
                                                         ),
                                      JoinStep(left=Result(0), right=Result(1),
                                               query=Join(left=Identifier('tab1'),
                                                          right=Identifier('tab2'),
                                                          condition=BinaryOperation(op='=',
                                                                                    args=[Identifier('tab1.column1'),
                                                                                          Identifier('tab2.column1')]),
                                                          join_type=JoinType.INNER_JOIN
                                                          )),
                                      ProjectStep(dataframe=Result(2),
                                                  columns=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')]),
                                  ],
        )
        plan = plan_query(query, integrations=['int'], default_namespace='int')

        assert plan.steps == expected_plan.steps
    def test_integration_select_plan_group_by(self):
        query = Select(targets=[Identifier('column1'),
                                Identifier("column2"),
                                Function(op="sum",
                                 args=[Identifier(parts=["column3"])],
                                 alias=Identifier('total')),
                                ],
                       from_table=Identifier('int.tab'),
                       group_by=[Identifier("column1"), Identifier("column2")],
                       having=BinaryOperation('=', args=[Identifier("column1"), Constant(0)])
                       )
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[
                                                             Identifier('tab.column1', alias=Identifier('column1')),
                                                             Identifier('tab.column2', alias=Identifier('column2')),
                                                             Function(op="sum",
                                                                      args=[Identifier(parts=['tab', 'column3'])],
                                                                      alias=Identifier('total')),

                                                         ],
                                                             from_table=Identifier('tab'),
                                                             group_by=[Identifier('tab.column1'), Identifier('tab.column2')],
                                                             having=BinaryOperation('=', args=[Identifier('tab.column1'),
                                                                                               Constant(0)])
                                                         )),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
Example 10
    def test_select_from_predictor_default_namespace(self):
        query = Select(
            targets=[Star()],
            from_table=Identifier('pred'),
            where=BinaryOperation(
                op='and',
                args=[
                    BinaryOperation(op='=',
                                    args=[Identifier('x1'),
                                          Constant(1)]),
                    BinaryOperation(op='=',
                                    args=[Identifier('x2'),
                                          Constant('2')])
                ],
            ))
        expected_plan = QueryPlan(
            predictor_namespace='mindsdb',
            default_namespace='mindsdb',
            steps=[
                ApplyPredictorRowStep(namespace='mindsdb',
                                      predictor=Identifier('pred'),
                                      row_dict={
                                          'x1': 1,
                                          'x2': '2'
                                      }),
                ProjectStep(dataframe=Result(0), columns=[Star()]),
            ],
        )

        plan = plan_query(query,
                          predictor_namespace='mindsdb',
                          default_namespace='mindsdb',
                          predictor_metadata={'pred': {}})

        assert plan.steps == expected_plan.steps
Example 11
    def test_join_predictor_plan_order_by(self):
        query = Select(targets=[Identifier('tab.column1'), Identifier('pred.predicted')],
                       from_table=Join(left=Identifier('int.tab'),
                                       right=Identifier('mindsdb.pred'),
                                       join_type=JoinType.INNER_JOIN,
                                       implicit=True),
                       where=BinaryOperation('=', args=[Identifier('tab.product_id'), Constant('x')]),
                       limit=Constant(10),
                       offset=Constant(15),
                       order_by=[OrderBy(field=Identifier('tab.column1'))]
                       )

        expected_plan = QueryPlan(
            steps=[
                FetchDataframeStep(integration='int',
                                   query=Select(targets=[Star()],
                                                from_table=Identifier('tab'),
                                                where=BinaryOperation('=', args=[Identifier('tab.product_id'), Constant('x')]),
                                                limit=Constant(10),
                                                offset=Constant(15),
                                                order_by=[OrderBy(field=Identifier('tab.column1'))],
                                                ),
                                   ),
                ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0), predictor=Identifier('pred')),
                JoinStep(left=Result(0), right=Result(1),
                         query=Join(left=Identifier('result_0', alias=Identifier('tab')),
                                    right=Identifier('result_1', alias=Identifier('pred')),
                                    join_type=JoinType.INNER_JOIN)),
                ProjectStep(dataframe=Result(2), columns=[Identifier('tab.column1'), Identifier('pred.predicted')]),
            ],
        )
        plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb', predictor_metadata={'pred': {}})

        assert plan.steps == expected_plan.steps
Example 12
    def test_join_predictor_plan_default_namespace_predictor(self):
        query = Select(targets=[Identifier('tab1.column1'), Identifier('pred.predicted')],
                       from_table=Join(left=Identifier('int.tab1'),
                                       right=Identifier('pred'),
                                       join_type=JoinType.INNER_JOIN,
                                       implicit=True)
                       )
        expected_plan = QueryPlan(
            default_namespace='mindsdb',
            steps=[
                FetchDataframeStep(integration='int',
                                   query=Select(targets=[Star()],
                                                from_table=Identifier('tab1')),
                                   ),
                ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0), predictor=Identifier('pred')),
                JoinStep(left=Result(0), right=Result(1),
                         query=Join(left=Identifier('result_0', alias=Identifier('tab1')),
                                    right=Identifier('result_1', alias=Identifier('pred')),
                                    join_type=JoinType.INNER_JOIN)),
                ProjectStep(dataframe=Result(2), columns=[Identifier('tab1.column1'), Identifier('pred.predicted')]),
            ],
        )
        plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb', default_namespace='mindsdb', predictor_metadata={'pred': {}})

        for i in range(len(plan.steps)):
            assert plan.steps[i] == expected_plan.steps[i]
    def test_predictors_select_plan(self):
        query = Select(
            targets=[
                Identifier('column1'),
                Constant(1),
                NullConstant(),
                Function('database', args=[])
            ],
            from_table=Identifier('mindsdb.predictors'),
            where=BinaryOperation(
                'and',
                args=[
                    BinaryOperation(
                        '=',
                        args=[Identifier('column1'),
                              Identifier('column2')]),
                    BinaryOperation('>',
                                    args=[Identifier('column3'),
                                          Constant(0)]),
                ]))
        expected_plan = QueryPlan(
            integrations=['mindsdb'],
            steps=[
                FetchDataframeStep(
                    integration='mindsdb',
                    query=Select(
                        targets=[
                            Identifier('predictors.column1',
                                       alias=Identifier('column1')),
                            Constant(1),
                            NullConstant(),
                            Function('database', args=[]),
                        ],
                        from_table=Identifier('predictors'),
                        where=BinaryOperation(
                            'and',
                            args=[
                                BinaryOperation(
                                    '=',
                                    args=[
                                        Identifier('predictors.column1'),
                                        Identifier('predictors.column2')
                                    ]),
                                BinaryOperation(
                                    '>',
                                    args=[
                                        Identifier('predictors.column3'),
                                        Constant(0)
                                    ]),
                            ])),
                    step_num=0,
                    references=None,
                ),
            ])

        plan = plan_query(query, integrations=['mindsdb'])

        for i in range(len(plan.steps)):
            assert plan.steps[i] == expected_plan.steps[i]
Example 14
    def test_plan_union_queries(self):
        query1 = Select(targets=[Identifier('column1'), Constant(None, alias=Identifier('predicted'))],
                       from_table=Identifier('int.tab'),
                       where=BinaryOperation('and', args=[
                           BinaryOperation('=', args=[Identifier('column1'), Identifier('column2')]),
                           BinaryOperation('>', args=[Identifier('column3'), Constant(0)]),
                       ]))

        query2 = Select(
            targets=[Identifier('tab1.column1'), Identifier('pred.predicted', alias=Identifier('predicted'))],
            from_table=Join(left=Identifier('int.tab1'),
                            right=Identifier('mindsdb.pred'),
                            join_type=JoinType.INNER_JOIN,
                            implicit=True)
            )

        query = Union(left=query1, right=query2, unique=False)
        expected_plan = QueryPlan(
            steps=[
                # Query 1
                FetchDataframeStep(integration='int',
                                   query=Select(targets=[Identifier('tab.column1', alias=Identifier('column1')),
                                                         Constant(None, alias=Identifier('predicted'))],
                                                from_table=Identifier('tab'),
                                                where=BinaryOperation('and', args=[
                                                    BinaryOperation('=',
                                                                    args=[Identifier('tab.column1'),
                                                                          Identifier('tab.column2')]),
                                                    BinaryOperation('>',
                                                                    args=[Identifier('tab.column3'),
                                                                          Constant(0)]),
                                                ])
                                                )),
                # Query 2
                FetchDataframeStep(integration='int',
                                   query=Select(
                                       targets=[Star()],
                                       from_table=Identifier('tab1')),
                                   ),
                ApplyPredictorStep(namespace='mindsdb', dataframe=Result(1), predictor=Identifier('pred')),
                JoinStep(left=Result(1), right=Result(2),
                         query=Join(left=Identifier('result_1', alias=Identifier('tab1')),
                                    right=Identifier('result_2', alias=Identifier('pred')),
                                    join_type=JoinType.INNER_JOIN)),
                ProjectStep(dataframe=Result(3), columns=[Identifier('tab1.column1'), Identifier('pred.predicted', alias=Identifier('predicted'))]),

                # Union
                UnionStep(left=Result(0), right=Result(4), unique=False),

            ],
        )

        plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb', predictor_metadata={'pred': {}})

        for i in range(len(plan.steps)):
            assert plan.steps[i] == expected_plan.steps[i]
    def test_integration_select_plan_star(self):
        query = Select(targets=[Star()],
                       from_table=Identifier('int.tab'))
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int', query=Select(targets=[Star()], from_table=Identifier('tab'))),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
    def test_integration_select_column_alias(self):
        query = Select(targets=[Identifier('col1', alias=Identifier('column_alias'))],
                       from_table=Identifier('int.tab'))

        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Identifier(parts=['tab', 'col1'], alias=Identifier('column_alias'))],
                                                             from_table=Identifier(parts=['tab'])),
                                                         ),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
    def test_integration_select_table_alias_full_query(self):
        sql = 'select ta.sqft from int.test_data.home_rentals as ta'

        query = parse_sql(sql, dialect='sqlite')

        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Identifier(parts=['ta', 'sqft'], alias=Identifier('sqft'))],
                                                             from_table=Identifier(parts=['test_data', 'home_rentals'], alias=Identifier('ta'))),
                                                         ),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
Example 18
    def test_join_tables_plan_groupby(self):
        query = Select(targets=[
            Identifier('tab1.column1'),
            Identifier('tab2.column1'),
            Function('sum', args=[Identifier('tab2.column2')], alias=Identifier('total'))],
            from_table=Join(left=Identifier('int.tab1'),
                            right=Identifier('int.tab2'),
                            condition=BinaryOperation(op='=',
                                                      args=[Identifier('tab1.column1'), Identifier('tab2.column1')]),
                            join_type=JoinType.INNER_JOIN
                            ),
            group_by=[Identifier('tab1.column1'), Identifier('tab2.column1')],
            having=BinaryOperation(op='=', args=[Identifier('tab1.column1'), Constant(0)])
        )
        plan = plan_query(query, integrations=['int'])
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Star()],
                                                             from_table=Identifier('tab1')),
                                                         ),
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Star()],
                                                                      from_table=Identifier('tab2')),
                                                         ),
                                      JoinStep(left=Result(0), right=Result(1),
                                               query=Join(left=Identifier('tab1'),
                                                          right=Identifier('tab2'),
                                                          condition=BinaryOperation(op='=',
                                                                                    args=[Identifier('tab1.column1'),
                                                                                          Identifier('tab2.column1')]),
                                                          join_type=JoinType.INNER_JOIN
                                                          )),
                                      GroupByStep(dataframe=Result(2),
                                                  targets=[Identifier('tab1.column1'),
                                                           Identifier('tab2.column1'),
                                                           Function('sum', args=[Identifier('tab2.column2')])],
                                                  columns=[Identifier('tab1.column1'), Identifier('tab2.column1')]),
                                      FilterStep(dataframe=Result(3), query=BinaryOperation(op='=', args=[Identifier('tab1.column1'), Constant(0)])),
                                      ProjectStep(dataframe=Result(4),
                                                  columns=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                                                           Function(op='sum', args=[Identifier('tab2.column2')], alias=Identifier('total'))]),
                                  ],
                                  )
        assert plan.steps == expected_plan.steps
    def test_integration_select_plan_complex_path(self):
        query = Select(targets=[Identifier(parts=['int', 'tab', 'a column with spaces'])],
                       from_table=Identifier('int.tab'))
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[
                                                                 Identifier('tab.`a column with spaces`',
                                                                            alias=Identifier('a column with spaces')
                                                                            )
                                                             ],
                                                             from_table=Identifier('tab')),
                                                         ),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
    def test_integration_select_subquery_in_from(self):
        query = Select(targets=[Identifier('column1')],
                       from_table=Select(targets=[Identifier('column1')],
                                         from_table=Identifier('int.tab'),
                                         alias=Identifier('subquery')))
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Identifier('column1')],
                                                             from_table=Select(
                                                                 targets=[Identifier('tab.column1', alias=Identifier('column1'))],
                                                                 from_table=Identifier('tab'),
                                                                 alias=Identifier('subquery')),
                                                             )),
                                  ])

        plan = plan_query(query, integrations=['int'])

        assert plan.steps == expected_plan.steps
Example 21
    def test_join_tables_plan_order_by(self):
        query = Select(targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
                       from_table=Join(left=Identifier('int.tab1'),
                                       right=Identifier('int.tab2'),
                                       condition=BinaryOperation(op='=', args=[Identifier('tab1.column1'), Identifier('tab2.column1')]),
                                       join_type=JoinType.INNER_JOIN
                                       ),
                       limit=Constant(10),
                       offset=Constant(15),
                       order_by=[OrderBy(field=Identifier('tab1.column1'))],
                )
        plan = plan_query(query, integrations=['int'])
        expected_plan = QueryPlan(integrations=['int'],
                                  steps = [
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Star()],
                                                             from_table=Identifier('tab1')),
                                                         ),
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Star()],
                                                                      from_table=Identifier('tab2')),
                                                         ),
                                      JoinStep(left=Result(0), right=Result(1),
                                               query=Join(left=Identifier('tab1'),
                                                          right=Identifier('tab2'),
                                                          condition=BinaryOperation(op='=',
                                                                                    args=[Identifier('tab1.column1'),
                                                                                          Identifier('tab2.column1')]),
                                                          join_type=JoinType.INNER_JOIN
                                                          )),
                                      OrderByStep(dataframe=Result(2), order_by=[OrderBy(field=Identifier('tab1.column1'))]),
                                      LimitOffsetStep(dataframe=Result(3), limit=10, offset=15),
                                      ProjectStep(dataframe=Result(4),
                                                  columns=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')]),
                                  ],
                                  )

        assert plan.steps == expected_plan.steps
    def test_integration_select_3_level(self):
        sql = "select * from xxx.yyy.zzz where x > 1"
        query = parse_sql(sql, dialect='mindsdb')

        expected_plan = QueryPlan(integrations=['int'],
                                  default_namespace='xxx',
                                  steps=[
                                      FetchDataframeStep(
                                          integration='xxx',
                                          query=Select(
                                             targets=[Star()],
                                             from_table=Identifier('yyy.zzz'),
                                             where=BinaryOperation(op='>', args=[
                                                 Identifier('yyy.zzz.x'),
                                                 Constant(1)
                                             ])
                                          )
                                      )
                                  ])

        plan = plan_query(query, integrations=['xxx'])

        assert plan.steps == expected_plan.steps
Example 23
    def test_join_tables_where_plan(self):
        query = Select(targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
                       from_table=Join(left=Identifier('int.tab1'),
                                       right=Identifier('int.tab2'),
                                       condition=BinaryOperation(op='=', args=[Identifier('tab1.column1'),
                                                                               Identifier('tab2.column1')]),
                                       join_type=JoinType.INNER_JOIN
                                       ),
                       where=BinaryOperation('and',
                                             args=[
                                                 BinaryOperation('and',
                                                                 args=[
                                                                     BinaryOperation('=',
                                                                                     args=[Identifier('tab1.column1'),
                                                                                           Constant(1)]),
                                                                     BinaryOperation('=',
                                                                                     args=[Identifier('tab2.column1'),
                                                                                           Constant(0)]),

                                                                 ]
                                                                 ),
                                                 BinaryOperation('=',
                                                                 args=[Identifier('tab1.column3'),
                                                                       Identifier('tab2.column3')]),
                                             ]
                                             )
                       )

        plan = plan_query(query, integrations=['int'])
        expected_plan = QueryPlan(integrations=['int'],
                                  steps=[
                                      FetchDataframeStep(integration='int',
                                                         query=Select(
                                                             targets=[Star()],
                                                             from_table=Identifier('tab1'),
                                                         ),

                                                         ),
                                      FetchDataframeStep(integration='int',
                                                         query=Select(targets=[Star()],
                                                                      from_table=Identifier('tab2'),
                                                                      ),
                                                         ),
                                      JoinStep(left=Result(0), right=Result(1),
                                               query=Join(left=Identifier('tab1'),
                                                          right=Identifier('tab2'),
                                                          condition=BinaryOperation(op='=',
                                                                                    args=[Identifier('tab1.column1'),
                                                                                          Identifier('tab2.column1')]),
                                                          join_type=JoinType.INNER_JOIN
                                                          )),
                                      FilterStep(dataframe=Result(2),
                                                 query=BinaryOperation('and', args=[
                                                     BinaryOperation('and', args=[
                                                         BinaryOperation('=', args=[Identifier('tab1.column1'),
                                                                                    Constant(1)]),
                                                         BinaryOperation('=', args=[Identifier('tab2.column1'),
                                                                                    Constant(0)]),
                                                     ]),
                                                     BinaryOperation('=', args=[Identifier('tab1.column3'),
                                                                                Identifier('tab2.column3')]),
                                                 ])),
                                      ProjectStep(dataframe=Result(3),
                                                  columns=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')]),
                                  ],
                                  )

        assert plan.steps == expected_plan.steps
Example 24
class QueryPlanner():
    def __init__(self,
                 query=None,
                 integrations=None,
                 predictor_namespace=None,
                 predictor_metadata=None,
                 default_namespace=None):
        self.query = query
        self.plan = QueryPlan()

        self.integrations = [integration.lower()
                             for integration in integrations] if integrations else []
        self.predictor_namespace = (predictor_namespace.lower()
                                    if predictor_namespace else 'mindsdb')
        self.predictor_metadata = predictor_metadata or defaultdict(dict)
        self.default_namespace = default_namespace

        # map from lowercase predictor names to their original-case names
        self.predictor_names = {
            k.lower(): k
            for k in self.predictor_metadata.keys()
        }

        # allow selecting from the mindsdb namespace
        # self.integrations.append(self.predictor_namespace)

        self.statement = None

    def is_predictor(self, identifier):
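        """Return True if ``identifier`` names a known predictor.

        The last part must be a registered predictor name, and the identifier
        must either be qualified with the predictor namespace or be a bare
        name while the default namespace is the predictor namespace.
        """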
        parts = identifier.parts
        if parts[-1].lower() not in self.predictor_names:
            return False
        if parts[0].lower() == self.predictor_namespace:
            return True
        elif len(parts) == 1 and self.default_namespace == self.predictor_namespace:
            return True
        return False

    # not used
    # def is_integration_table(self, identifier):
    #     parts = identifier.parts
    #     if parts[0].lower() in self.integrations:
    #         return True
    #     elif len(parts) == 1 and self.default_namespace in self.integrations:
    #         return True
    #     return False

    def get_integration_path_from_identifier_or_error(self,
                                                      identifier,
                                                      recurse=True):
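        """Resolve ``identifier`` to ``(integration_name, table)``.

        If the identifier does not name a known integration, retry once with
        the default namespace prepended before propagating the error.
        """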
        try:
            integration_name, table = get_integration_path_from_identifier(
                identifier)
            if integration_name.lower() not in self.integrations:
                raise PlanningException(
                    f'Unknown integration {integration_name} for table {str(identifier)}. Available integrations: {", ".join(self.integrations)}'
                )
        except PlanningException:
            if not recurse or not self.default_namespace:
                raise
            else:
                new_identifier = copy.deepcopy(identifier)
                new_identifier.parts = [
                    self.default_namespace, *identifier.parts
                ]
                return self.get_integration_path_from_identifier_or_error(
                    new_identifier, recurse=False)
        return integration_name, table

    def get_integration_select_step(self, select):
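        """Build a FetchDataframeStep for a select that targets a single
        integration, disambiguating identifiers against the source table."""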
        integration_name, table = self.get_integration_path_from_identifier_or_error(
            select.from_table)

        fetch_df_select = copy.deepcopy(select)
        recursively_disambiguate_identifiers(fetch_df_select, integration_name,
                                             table)

        return FetchDataframeStep(integration=integration_name,
                                  query=fetch_df_select)

    def plan_integration_select(self, select):
        """Plan for a select query that can be fully executed in an integration"""

        return self.plan.add_step(self.get_integration_select_step(select))

    def plan_nested_select(self, select):
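        """Dispatch a nested select: push it down to the integration if it
        references no predictors, otherwise use predictor-aware planning."""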

        # get all predictors
        query_predictors = []

        def find_predictors(node, is_table, **kwargs):
            if is_table and isinstance(node, ast.Identifier):
                if self.is_predictor(node):
                    query_predictors.append(node)

        utils.query_traversal(select, find_predictors)

        if len(query_predictors) == 0:
            # no predictors inside: push the whole select down to the integration
            return self.plan_integration_nested_select(select)
        else:
            return self.plan_predictor_nested_select(select)

    def plan_integration_nested_select(self, select):
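        """Push a nested select down to the integration of its deepest
        FROM table, disambiguating identifiers in that deepest select."""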
        fetch_df_select = copy.deepcopy(select)
        deepest_select = get_deepest_select(fetch_df_select)
        integration_name, table = self.get_integration_path_from_identifier_or_error(
            deepest_select.from_table)
        recursively_disambiguate_identifiers(deepest_select, integration_name,
                                             table)
        return self.plan.add_step(
            FetchDataframeStep(integration=integration_name,
                               query=fetch_df_select))

    def plan_predictor_nested_select(self, select):
        # plan the inner (FROM) query first, then apply grouping or
        # projection and the limit to its result

        if select.limit == 0:
            # TODO don't run predictor if limit is 0
            ...
        select2 = copy.deepcopy(select.from_table)
        select2.parentheses = False
        select2.alias = None
        self.plan_select(select2)

        last_step = self.plan.steps[-1]
        group_step = self.plan_group(select, last_step)
        if group_step is not None:
            self.plan.add_step(group_step)
            # don't do project step

        else:
            # do we need projection?
            if len(select.targets) != 1 or not isinstance(
                    select.targets[0], Star):
                # remove prefix alias
                alias = select.from_table.alias
                if alias is not None:
                    alias = alias.parts[0]
                    for t in select.targets:
                        if isinstance(t, Identifier) and t.parts[0] == alias:
                            t.parts.pop(0)

                self.plan_project(select,
                                  last_step.result,
                                  ignore_doubles=True)

        # do we need limit?
        last_step = self.plan.steps[-1]
        if select.limit is not None:
            last_step = self.plan.add_step(
                LimitOffsetStep(dataframe=last_step.result,
                                limit=select.limit.value))
        return last_step

    def plan_select_from_predictor(self, select):
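        """Plan a SELECT taken directly from a predictor.

        The MySQL-style ``WHERE 1 = 0`` form only fetches the predictor's
        columns; otherwise the WHERE clause becomes the row of input values
        for ApplyPredictorRowStep, followed by a projection of the targets.
        """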
        predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
            select.from_table, self.default_namespace)

        if select.where == BinaryOperation('=',
                                           args=[Constant(1),
                                                 Constant(0)]):
            # Hardcoded mysql way of getting predictor columns
            predictor_step = self.plan.add_step(
                GetPredictorColumns(namespace=predictor_namespace,
                                    predictor=predictor))
        else:
            new_query_targets = []
            for target in select.targets:
                if isinstance(target, Identifier):
                    new_query_targets.append(
                        disambiguate_predictor_column_identifier(
                            target, predictor))
                elif type(target) in (Star, Constant):
                    new_query_targets.append(target)
                else:
                    raise PlanningException(
                        f'Unknown select target {type(target)}')

            if select.group_by or select.having:
                raise PlanningException(
                    'Unsupported operation when querying predictor. Only WHERE is allowed and required.'
                )

            row_dict = {}
            where_clause = select.where
            if not where_clause:
                raise PlanningException(
                    'WHERE clause required when selecting from predictor')

            recursively_extract_column_values(where_clause, row_dict,
                                              predictor)

            predictor_step = self.plan.add_step(
                ApplyPredictorRowStep(namespace=predictor_namespace,
                                      predictor=predictor,
                                      row_dict=row_dict))
        project_step = self.plan_project(select, predictor_step.result)
        return predictor_step, project_step

    def plan_predictor(self, query, table, predictor_namespace, predictor):
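        """Fetch the source data from the integration (keeping WHERE, GROUP BY,
        HAVING, ORDER BY, LIMIT and OFFSET) and apply the predictor to it."""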
        integration_select_step = self.plan_integration_select(
            Select(
                targets=[Star()],
                from_table=table,
                where=query.where,
                group_by=query.group_by,
                having=query.having,
                order_by=query.order_by,
                limit=query.limit,
                offset=query.offset,
            ))
        predictor_step = self.plan.add_step(
            ApplyPredictorStep(namespace=predictor_namespace,
                               dataframe=integration_select_step.result,
                               predictor=predictor))

        return {'predictor': predictor_step, 'data': integration_select_step}

    def plan_fetch_timeseries_partitions(self, query, table,
                                         predictor_group_by_names):
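        """Fetch the distinct values of the predictor's GROUP BY columns,
        i.e. one row per time series partition."""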
        targets = [Identifier(column) for column in predictor_group_by_names]

        query = Select(
            distinct=True,
            targets=targets,
            from_table=table,
            where=query.where,
        )
        select_step = self.plan_integration_select(query)
        return select_step

    def plan_timeseries_predictor(self, query, table, predictor_namespace,
                                  predictor):
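        """Plan applying a timeseries predictor.

        Builds integration selects that fetch the rows required by the
        predictor's window around the requested time filter (per group, when
        GROUP BY columns are configured) and then applies the predictor to the
        fetched data.
        """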
        predictor_name = predictor.to_string(alias=False).lower()
        # to original case
        predictor_name = self.predictor_names[predictor_name]

        predictor_time_column_name = self.predictor_metadata[predictor_name][
            'order_by_column']
        predictor_group_by_names = self.predictor_metadata[predictor_name][
            'group_by_columns']
        if predictor_group_by_names is None:
            predictor_group_by_names = []
        predictor_window = self.predictor_metadata[predictor_name]['window']

        if query.order_by:
            raise PlanningException(
                f'Can\'t provide ORDER BY to a time series predictor; it is taken from the predictor settings. Found: {query.order_by}'
            )

        saved_limit = query.limit

        if query.group_by or query.having or query.offset:
            raise PlanningException(
                f'Unsupported query to timeseries predictor: {str(query)}')

        allowed_columns = [predictor_time_column_name.lower()]
        if len(predictor_group_by_names) > 0:
            allowed_columns += [i.lower() for i in predictor_group_by_names]
        validate_ts_where_condition(query.where,
                                    allowed_columns=allowed_columns)

        time_filter = find_time_filter(
            query.where, time_column_name=predictor_time_column_name)

        order_by = [
            OrderBy(Identifier(parts=[predictor_time_column_name]),
                    direction='DESC')
        ]

        preparation_where = copy.deepcopy(query.where)

        # add {order_by_field} is not null
        def add_order_not_null(condition):
            order_field_not_null = BinaryOperation(
                op='is not',
                args=[
                    Identifier(parts=[predictor_time_column_name]),
                    NullConstant()
                ])
            if condition is not None:
                condition = BinaryOperation(
                    op='and', args=[condition, order_field_not_null])
            else:
                condition = order_field_not_null
            return condition

        preparation_where2 = copy.deepcopy(preparation_where)
        preparation_where = add_order_not_null(preparation_where)

        # Obtain integration selects
        if isinstance(time_filter, BetweenOperation):
            between_from = time_filter.args[1]
            preparation_time_filter = BinaryOperation(
                '<',
                args=[Identifier(predictor_time_column_name), between_from])
            preparation_where2 = replace_time_filter(preparation_where2,
                                                     time_filter,
                                                     preparation_time_filter)
            integration_select_1 = Select(
                targets=[Star()],
                from_table=table,
                where=add_order_not_null(preparation_where2),
                order_by=order_by,
                limit=Constant(predictor_window))

            integration_select_2 = Select(targets=[Star()],
                                          from_table=table,
                                          where=preparation_where,
                                          order_by=order_by)

            integration_selects = [integration_select_1, integration_select_2]
        elif isinstance(
                time_filter, BinaryOperation
        ) and time_filter.op == '>' and time_filter.args[1] == Latest():
            integration_select = Select(
                targets=[Star()],
                from_table=table,
                where=preparation_where,
                order_by=order_by,
                limit=Constant(predictor_window),
            )
            integration_select.where = find_and_remove_time_filter(
                integration_select.where, time_filter)
            integration_selects = [integration_select]

        elif isinstance(time_filter,
                        BinaryOperation) and time_filter.op in ('>', '>='):
            time_filter_date = time_filter.args[1]
            preparation_time_filter_op = {'>': '<=', '>=': '<'}[time_filter.op]

            preparation_time_filter = BinaryOperation(
                preparation_time_filter_op,
                args=[
                    Identifier(predictor_time_column_name), time_filter_date
                ])
            preparation_where2 = replace_time_filter(preparation_where2,
                                                     time_filter,
                                                     preparation_time_filter)
            integration_select_1 = Select(
                targets=[Star()],
                from_table=table,
                where=add_order_not_null(preparation_where2),
                order_by=order_by,
                limit=Constant(predictor_window))

            integration_select_2 = Select(targets=[Star()],
                                          from_table=table,
                                          where=preparation_where,
                                          order_by=order_by)

            integration_selects = [integration_select_1, integration_select_2]
        else:
            integration_select = Select(
                targets=[Star()],
                from_table=table,
                where=preparation_where,
                order_by=order_by,
            )
            integration_selects = [integration_select]

        if len(predictor_group_by_names) == 0:
            # ts query without grouping
            # one or multistep
            if len(integration_selects) == 1:
                select_partition_step = self.get_integration_select_step(
                    integration_selects[0])
            else:
                select_partition_step = MultipleSteps(steps=[
                    self.get_integration_select_step(s)
                    for s in integration_selects
                ],
                                                      reduce='union')

            # fetch data step
            data_step = self.plan.add_step(select_partition_step)
        else:
            # inject $var to queries
            for integration_select in integration_selects:
                condition = integration_select.where
                for column in predictor_group_by_names:
                    cond = BinaryOperation(
                        '=',
                        args=[Identifier(column),
                              Constant(f'$var[{column}]')])

                    # join to main condition
                    if condition is None:
                        condition = cond
                    else:
                        condition = BinaryOperation('and',
                                                    args=[condition, cond])

                integration_select.where = condition
            # one or multistep
            if len(integration_selects) == 1:
                select_partition_step = self.get_integration_select_step(
                    integration_selects[0])
            else:
                select_partition_step = MultipleSteps(steps=[
                    self.get_integration_select_step(s)
                    for s in integration_selects
                ],
                                                      reduce='union')

            # get grouping values
            no_time_filter_query = copy.deepcopy(query)
            no_time_filter_query.where = find_and_remove_time_filter(
                no_time_filter_query.where, time_filter)
            select_partitions_step = self.plan_fetch_timeseries_partitions(
                no_time_filter_query, table, predictor_group_by_names)

            # sub-query by every grouping value
            map_reduce_step = self.plan.add_step(
                MapReduceStep(values=select_partitions_step.result,
                              reduce='union',
                              step=select_partition_step))
            data_step = map_reduce_step

        predictor_step = self.plan.add_step(
            ApplyTimeseriesPredictorStep(
                output_time_filter=time_filter,
                namespace=predictor_namespace,
                dataframe=data_step.result,
                predictor=predictor,
            ))

        return {
            'predictor': predictor_step,
            'data': data_step,
            'saved_limit': saved_limit,
        }

    def plan_join_two_tables(self, join):
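        """Fetch both joined tables from their integrations, qualify the columns
        in the join condition, and join the two resulting dataframes.
        """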
        select_left_step = self.plan_integration_select(
            Select(targets=[Star()], from_table=join.left))
        select_right_step = self.plan_integration_select(
            Select(targets=[Star()], from_table=join.right))

        left_integration_name, left_table = self.get_integration_path_from_identifier_or_error(
            join.left)
        right_integration_name, right_table = self.get_integration_path_from_identifier_or_error(
            join.right)

        left_table_path = left_table.to_string(alias=False)
        right_table_path = right_table.to_string(alias=False)

        new_condition_args = []
        for arg in join.condition.args:
            if isinstance(arg, Identifier):
                if left_table_path in arg.parts:
                    new_condition_args.append(
                        disambiguate_integration_column_identifier(
                            arg, left_integration_name, left_table))
                elif right_table_path in arg.parts:
                    new_condition_args.append(
                        disambiguate_integration_column_identifier(
                            arg, right_integration_name, right_table))
                else:
                    raise PlanningException(
                        f'Column in the join condition does not belong to either joined table: {str(arg)}'
                    )
            else:
                new_condition_args.append(arg)
        new_join = copy.deepcopy(join)
        new_join.condition.args = new_condition_args
        new_join.left = Identifier(left_table_path, alias=left_table.alias)
        new_join.right = Identifier(right_table_path, alias=right_table.alias)

        # FIXME: INFORMATION_SCHEMA with condition
        # clear join condition for INFORMATION_SCHEMA
        if right_integration_name == 'INFORMATION_SCHEMA':
            new_join.condition = None

        return self.plan.add_step(
            JoinStep(left=select_left_step.result,
                     right=select_right_step.result,
                     query=new_join))

    def plan_group(self, query, last_step):
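        """Build a GroupByStep if the query aggregates (GROUP BY, HAVING or an
        aggregate function in the targets); the step is returned without being
        added to the plan.
        """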
        # check group
        funcs = []
        for t in query.targets:
            if isinstance(t, Function):
                funcs.append(t.op.lower())
        agg_funcs = ['sum', 'min', 'max', 'avg', 'count', 'std']

        if (query.having is not None or query.group_by is not None
                or set(agg_funcs) & set(funcs)):
            # is aggregate
            group_by_targets = []
            for t in query.targets:
                target_copy = copy.deepcopy(t)
                group_by_targets.append(target_copy)
            # last_step = self.plan.steps[-1]
            return GroupByStep(dataframe=last_step.result,
                               columns=query.group_by,
                               targets=group_by_targets)

    def plan_project(self, query, dataframe, ignore_doubles=False):
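        """Add a ProjectStep that projects the query targets out of the given dataframe."""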
        out_identifiers = []

        for target in query.targets:
            if isinstance(target, (Identifier, Star, Function, Constant)):
                out_identifiers.append(target)
            else:
                new_identifier = Identifier(str(target.to_string(alias=False)),
                                            alias=target.alias)
                out_identifiers.append(new_identifier)
        return self.plan.add_step(
            ProjectStep(dataframe=dataframe,
                        columns=out_identifiers,
                        ignore_doubles=ignore_doubles))

    def get_aliased_fields(self, targets):
        # get aliases from select target
        aliased_fields = {}
        for target in targets:
            if target.alias is not None:
                aliased_fields[target.alias.to_string()] = target
        return aliased_fields

    def plan_join(self, query, integration=None):
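        """Plan a JOIN between two tables or between a table and a predictor.

        A nested SELECT on the left side (the DBT case) is first flattened
        into the outer query.
        """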
        join = query.from_table
        join_left = join.left
        join_right = join.right

        if isinstance(join_left, Select):
            # dbt query.
            # TODO support complex query. Only one table is supported at the moment.
            if not isinstance(join_left.from_table, Identifier):
                raise PlanningException(
                    f'Statement not supported: {query.to_string()}')

            # move properties to upper query
            query = join_left

            if query.from_table.alias is not None:
                table_alias = [query.from_table.alias.parts[0]]
            else:
                table_alias = query.from_table.parts

            def add_aliases(node, is_table, **kwargs):
                if not is_table and isinstance(node, Identifier):
                    if len(node.parts) == 1:
                        # add table alias to field
                        node.parts = table_alias + node.parts

            query_traversal(query.where, add_aliases)

            if isinstance(query.from_table, Identifier):
                # DBT workaround: allow using tables without an integration prefix.
                #   if table.parts[0] is not a known integration, take the integration name from the CREATE TABLE command
                if (integration is not None and query.from_table.parts[0]
                        not in self.integrations):
                    # add integration name to table
                    query.from_table.parts.insert(0, integration)

            join_left = join_left.from_table

        aliased_fields = self.get_aliased_fields(query.targets)

        recursively_check_join_identifiers_for_ambiguity(query.where)
        recursively_check_join_identifiers_for_ambiguity(
            query.group_by, aliased_fields=aliased_fields)
        recursively_check_join_identifiers_for_ambiguity(query.having)
        recursively_check_join_identifiers_for_ambiguity(
            query.order_by, aliased_fields=aliased_fields)

        if isinstance(join_left, Identifier) and isinstance(
                join_right, Identifier):
            if self.is_predictor(join_left) and self.is_predictor(join_right):
                raise PlanningException(
                    f'Can\'t join two predictors {str(join_left.parts[0])} and {str(join_right.parts[0])}'
                )

            predictor_namespace = None
            predictor = None
            table = None
            predictor_is_left = False
            if self.is_predictor(join_left):
                predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                    join_left, self.default_namespace)
                predictor_is_left = True
            else:
                table = join_left

            if self.is_predictor(join_right):
                predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                    join_right, self.default_namespace)
            else:
                table = join_right

            last_step = None
            if predictor:
                # One argument is a table, another is a predictor
                # Apply mindsdb model to result of last dataframe fetch
                # Then join results of applying mindsdb with table

                predictor_name = self.predictor_names[predictor.to_string(
                    alias=False).lower()]
                if self.predictor_metadata[predictor_name].get('timeseries'):
                    predictor_steps = self.plan_timeseries_predictor(
                        query, table, predictor_namespace, predictor)
                else:
                    predictor_steps = self.plan_predictor(
                        query, table, predictor_namespace, predictor)

                # add join
                # Update reference
                _, table = self.get_integration_path_from_identifier_or_error(
                    table)
                table_alias = table.alias or Identifier(
                    table.to_string(alias=False).replace('.', '_'))

                left = Identifier(
                    predictor_steps['predictor'].result.ref_name,
                    alias=predictor.alias
                    or Identifier(predictor.to_string(alias=False)))
                right = Identifier(predictor_steps['data'].result.ref_name,
                                   alias=table_alias)

                if not predictor_is_left:
                    # swap join
                    left, right = right, left
                new_join = Join(left=left,
                                right=right,
                                join_type=join.join_type)

                left = predictor_steps['predictor'].result
                right = predictor_steps['data'].result
                if not predictor_is_left:
                    # swap join
                    left, right = right, left

                last_step = self.plan.add_step(
                    JoinStep(left=left, right=right, query=new_join))

                # limit from timeseries
                if predictor_steps.get('saved_limit'):
                    last_step = self.plan.add_step(
                        LimitOffsetStep(dataframe=last_step.result,
                                        limit=predictor_steps['saved_limit']))

            else:
                # Both arguments are tables, join results of 2 dataframe fetches

                join_step = self.plan_join_two_tables(join)
                last_step = join_step
                if query.where:
                    # FIXME: INFORMATION_SCHEMA with Where
                    right_integration_name, _ = self.get_integration_path_from_identifier_or_error(
                        join.right)
                    if right_integration_name == 'INFORMATION_SCHEMA':
                        ...
                    else:
                        last_step = self.plan.add_step(
                            FilterStep(dataframe=last_step.result,
                                       query=query.where))

                if query.group_by:
                    group_by_targets = []
                    for t in query.targets:
                        target_copy = copy.deepcopy(t)
                        target_copy.alias = None
                        group_by_targets.append(target_copy)
                    last_step = self.plan.add_step(
                        GroupByStep(dataframe=last_step.result,
                                    columns=query.group_by,
                                    targets=group_by_targets))

                if query.having:
                    last_step = self.plan.add_step(
                        FilterStep(dataframe=last_step.result,
                                   query=query.having))

                if query.order_by:
                    last_step = self.plan.add_step(
                        OrderByStep(dataframe=last_step.result,
                                    order_by=query.order_by))

                if query.limit is not None or query.offset is not None:
                    limit = query.limit.value if query.limit is not None else None
                    offset = query.offset.value if query.offset is not None else None
                    last_step = self.plan.add_step(
                        LimitOffsetStep(dataframe=last_step.result,
                                        limit=limit,
                                        offset=offset))

        else:
            raise PlanningException(
                'Join of unsupported objects: currently only tables and predictors can be joined.'
            )
        return self.plan_project(query, last_step.result)

    def plan_create_table(self, query):
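        """Plan CREATE TABLE from a source SELECT: plan the sub-select, then save its result into the target table."""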
        if query.from_select is None:
            raise PlanningException(
                f'Not implemented "create table": {query.to_string()}')

        integration_name = query.name.parts[0]

        last_step = self.plan_select(query.from_select,
                                     integration=integration_name)

        # create table step
        self.plan.add_step(
            SaveToTable(
                table=query.name,
                dataframe=last_step,
                is_replace=query.is_replace,
            ))

    def plan_insert(self, query):
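        """Plan INSERT from a SELECT: plan the sub-select, then insert its result into the target table."""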
        if query.from_select is None:
            raise PlanningException('Only INSERT from SELECT is supported')

        integration_name = query.table.parts[0]

        # plan sub-select first
        last_step = self.plan_select(query.from_select,
                                     integration=integration_name)

        table = query.table
        self.plan.add_step(InsertToTable(
            table=table,
            dataframe=last_step,
        ))

    def plan_select(self, query, integration=None):
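        """Dispatch planning of a SELECT depending on its source: predictor, integration table, nested select or join."""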
        from_table = query.from_table

        if isinstance(from_table, Identifier):
            if self.is_predictor(from_table):
                return self.plan_select_from_predictor(query)
            else:
                return self.plan_integration_select(query)
        elif isinstance(from_table, Select):
            return self.plan_nested_select(query)
        elif isinstance(from_table, Join):
            return self.plan_join(query, integration=integration)
        else:
            raise PlanningException(
                f'Unsupported from_table {type(from_table)}')

    def plan_union(self, query):
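        """Plan both sides of a UNION and combine them with a UnionStep."""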
        query1 = self.plan_select(query.left)
        query2 = self.plan_select(query.right)

        return self.plan.add_step(
            UnionStep(left=query1.result,
                      right=query2.result,
                      unique=query.unique))

    # method for compatibility
    def from_query(self, query=None):
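        """Entry point: build the full plan for a query (SELECT, UNION, CREATE TABLE or INSERT) and return it."""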
        if query is None:
            query = self.query

        if isinstance(query, Select):
            self.plan_select(query)
        elif isinstance(query, Union):
            self.plan_union(query)
        elif isinstance(query, CreateTable):
            self.plan_create_table(query)
        elif isinstance(query, Insert):
            self.plan_insert(query)
        else:
            raise PlanningException(f'Unsupported query type {type(query)}')

        return self.plan

    def prepare_steps(self, query):
        statement_planner = PreparedStatementPlanner(self)

        # return generator
        return statement_planner.prepare_steps(query)

    def execute_steps(self, params=None):
        statement_planner = PreparedStatementPlanner(self)

        # return generator
        return statement_planner.execute_steps(params)

    # def fetch(self, row_count):
    #     statement_planner = PreparedStatementPlanner(self)
    #     return statement_planner.fetch(row_count)
    #
    # def close(self):
    #     statement_planner = PreparedStatementPlanner(self)
    #     return statement_planner.close()

    def get_statement_info(self):
        statement_planner = PreparedStatementPlanner(self)

        return statement_planner.get_statement_info()
Esempio n. 25
0
    def test_nested_select(self):
        # for tableau

        sql = '''
            SELECT time
            FROM ( 
               select * from int.covid 
               join mindsdb.pred 
               limit 10
            ) `Custom SQL Query`
            limit 1
         '''

        query = parse_sql(sql, dialect='mindsdb')

        expected_plan = QueryPlan(
            default_namespace='mindsdb',
            steps=[
                FetchDataframeStep(integration='int',
                                   query=parse_sql('select * from covid limit 10')),
                ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0), predictor=Identifier('pred')),
                JoinStep(left=Result(0), right=Result(1),
                         query=Join(left=Identifier('result_0', alias=Identifier('covid')),
                                    right=Identifier('result_1', alias=Identifier('pred')),
                                    join_type=JoinType.JOIN)),
                ProjectStep(dataframe=Result(2), columns=[Star()]),
                ProjectStep(dataframe=Result(3), columns=[Identifier('time')], ignore_doubles=True),
                LimitOffsetStep(dataframe=Result(4), limit=1)
            ],
        )

        plan = plan_query(
            query,
            integrations=['int'],
            predictor_namespace='mindsdb',
            default_namespace='mindsdb',
            predictor_metadata={'pred': {}}
        )
        for i in range(len(plan.steps)):
            assert plan.steps[i] == expected_plan.steps[i]

        sql = '''
                 SELECT `time`
                 FROM (
                   select * from int.covid
                   join mindsdb.pred
                 ) `Custom SQL Query`
                GROUP BY 1
            '''

        query = parse_sql(sql, dialect='mindsdb')

        expected_plan = QueryPlan(
            default_namespace='mindsdb',
            steps=[
                FetchDataframeStep(integration='int',
                                   query=parse_sql('select * from covid')),
                ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0), predictor=Identifier('pred')),
                JoinStep(left=Result(0), right=Result(1),
                         query=Join(left=Identifier('result_0', alias=Identifier('covid')),
                                    right=Identifier('result_1', alias=Identifier('pred')),
                                    join_type=JoinType.JOIN)),
                ProjectStep(dataframe=Result(2), columns=[Star()]),
                GroupByStep(dataframe=Result(3), columns=[Constant(1)], targets=[Identifier('time')])
            ],
        )

        plan = plan_query(
            query,
            integrations=['int'],
            predictor_namespace='mindsdb',
            default_namespace='mindsdb',
            predictor_metadata={'pred': {}}
        )
        for i in range(len(plan.steps)):
            assert plan.steps[i] == expected_plan.steps[i]