Example #1
0
def _assert_safe_query(query, engine_id, session=None):
    """Validate that every table referenced by ``query`` is permitted by the
    ACL of the query engine's metastore.

    Raises:
        InvalidQueryExecution: if a referenced table is not allowed.

    Any other failure (unparsable query, missing engine/metastore rows,
    a table name that does not split into exactly ``schema.table``) is
    logged and swallowed so the ACL check stays best-effort.
    """
    try:
        # Imported lazily; presumably to avoid a circular import at module
        # load time — confirm against the package layout.
        from lib.metastore.utils import MetastoreTableACLChecker

        table_per_statement, _ = process_query(query)
        all_tables = [
            table for tables in table_per_statement for table in tables
        ]

        query_engine = admin_logic.get_query_engine_by_id(engine_id,
                                                          session=session)
        if query_engine.metastore_id is None:
            LOG.debug("No metastore for query engine, skipping")
            return

        metastore = admin_logic.get_query_metastore_by_id(
            query_engine.metastore_id, session=session)
        acl_checker = MetastoreTableACLChecker(metastore.acl_control)

        for table in all_tables:
            schema_name, table_name = table.split(".")
            if not acl_checker.is_table_valid(schema_name, table_name):
                raise InvalidQueryExecution(
                    f"Table {table} is not allowed by metastore")
    except InvalidQueryExecution:
        # Bare raise re-raises the active exception with its original
        # traceback intact (no extra frame, unlike `raise e`).
        raise
    except Exception as e:
        # Best-effort check: anything else is logged, never fatal.
        LOG.info(e)
Example #2
0
def query_cell_to_es(query_cell, session=None):
    """Build the Elasticsearch document dict for a DataDoc query cell."""
    meta = query_cell.meta
    query = query_cell.context
    doc = query_cell.doc

    engine_id = meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    # Flatten the per-statement table lists into one list of table names.
    tables_per_statement, _ = process_query(
        query, language=(engine and engine.language))
    full_table_names = list(chain.from_iterable(tables_per_statement))

    return {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": meta.get("title", "Untitled"),
        # `doc` may be None for a detached cell; propagate None fields then.
        "data_doc_id": doc and doc.id,
        "environment_id": doc and doc.environment_id,
        "author_uid": doc and doc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": full_table_names,
        "query_text": query,
    }
Example #3
0
def query_execution_to_es(query_execution, data_cell=None, session=None):
    """Build the Elasticsearch document dict for a query execution.

    data_cell is added as a parameter so that bulk insert of query executions
    won't require re-retrieval of data_cell.
    """
    query_execution_id = query_execution.id

    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    table_names, _ = process_query(query_execution.query,
                                   language=(engine and engine.language))
    table_names = list(chain.from_iterable(table_names))

    # Duration is only defined once the execution has completed.
    duration = (DATETIME_TO_UTC(query_execution.completed_at) -
                DATETIME_TO_UTC(query_execution.created_at)
                if query_execution.completed_at is not None else None)

    # Guard against a missing engine, consistent with the
    # `engine and engine.language` check above (the original would crash
    # on `engine.environments` if the engine row is gone).
    environment_ids = ([env.id for env in engine.environments]
                       if engine else [])

    title = data_cell.meta.get("title", "Untitled") if data_cell else None

    expand_query_execution = {
        "id": query_execution_id,
        "query_type": "query_execution",
        "title": title,
        "environment_id": environment_ids,
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_execution.query),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": table_names,
        "query_text": query_execution.query,
    }
    return expand_query_execution
def create_table_lineage_from_metadata(job_metadata_id,
                                       query_language=None,
                                       session=None):
    """Parse the query text of a DataJobMetadata row and persist table
    lineage edges for it.

    Returns the list of created lineage record ids, or None when the
    metadata row does not exist.
    """
    job_metadata = session.query(DataJobMetadata).get(job_metadata_id)
    if job_metadata is None:
        return

    _, lineage_per_statement = process_query(job_metadata.query_text,
                                             query_language)

    lineage_ids = []
    for statement_lineage in lineage_per_statement:
        for lineage in statement_lineage:
            # Only statements with a resolved source produce an edge.
            if "source" not in lineage:
                continue

            source_parts = lineage["source"].split(".")
            parent_table = get_table_by_name(
                source_parts[0],
                source_parts[1],
                job_metadata.metastore_id,
                session=session,
            )

            target_parts = lineage["target"].split(".")
            child_table = get_table_by_name(
                target_parts[0],
                target_parts[1],
                job_metadata.metastore_id,
                session=session,
            )

            # Both endpoints must exist in the metastore to record lineage.
            if parent_table and child_table:
                new_lineage = add_table_lineage(
                    child_table.id,
                    parent_table.id,
                    job_metadata_id,
                    session=session,
                )
                lineage_ids.append(new_lineage.id)

    return lineage_ids
Example #5
0
    def test_select_statement(self):
        query = """
SELECT
    w9.Country,
    w9.Rank AS [2019],
    w8.Rank AS [2018],
    w7.HappinessRank AS [2017],
    w6.HappinessRank AS [2016],
    w5.HappinessRank AS [2015]
FROM
    main.world_happiness_2019 w9
    INNER JOIN main.world_happiness_2018 w8 ON w9.Country = w8.Country
    INNER JOIN main.world_happiness_2017 w7 ON w9.Country = w7.Country
    INNER JOIN main.world_happiness_2016 w6 ON w9.Country = w6.Country
    INNER JOIN main.world_happiness_2015 w5 ON w9.Country = w5.Country
    AND (w5.Region = "{{Region}}");
-- Region is a template variable with the value of 'Western Europe'
-- click on the <> button on the bottom right of the DataDoc to configure more!
        """
        processed_query = process_query(query)
        self.assertIsInstance(processed_query, tuple)
        self.assertEqual(len(processed_query), 2)
        self.assertEqual(len(processed_query[0]), 1)
        self.assertSetEqual(
            set(processed_query[0][0]),
            set([
                "main.world_happiness_2019",
                "main.world_happiness_2018",
                "main.world_happiness_2017",
                "main.world_happiness_2016",
                "main.world_happiness_2015",
            ]),
        )

        self.assertEqual(
            processed_query[1],
            ([[]]),
        )
Example #6
0
    def test_create_insert_query(self):
        """process_query must return per-statement tables and lineage for a
        multi-statement query containing USE, CREATE and INSERT."""
        query = """
            USE analytics;
            CREATE EXTERNAL TABLE IF NOT EXISTS example_1
            (id INT, count BIGINT) PARTITIONED BY (dt STRING)
            LOCATION 's3n://fakebucket/fake/path/';
            INSERT OVERWRITE TABLE example_1
            PARTITION (dt = '%(end_date)s')
            SELECT DISTINCT id, count from default.example_2
            where dt <= '2019-01-01' and dt >= date_sub('2018-01-01', 28)
            and to_date(created_at) >= start_dt
        """

        # Statement 1 (USE) touches nothing; CREATE resolves against the
        # current schema; INSERT...SELECT touches both target and source.
        expected_tables = [
            [],
            ["analytics.example_1"],
            ["analytics.example_1", "default.example_2"],
        ]
        expected_lineage = [
            [],
            [],
            [{
                "source": "default.example_2",
                "target": "analytics.example_1"
            }],
        ]

        processed = process_query(query)
        self.assertIsInstance(processed, tuple)
        tables_per_statement, lineage_per_statement = processed

        # Table order within a statement is not significant.
        for actual, expected in zip(tables_per_statement, expected_tables):
            self.assertCountEqual(actual, expected)
        self.assertListEqual(lineage_per_statement, expected_lineage)
def log_query_per_table_task(self, query_execution_id):
    """Post-completion bookkeeping for a finished query execution.

    Syncs the referenced tables to the metastore, records lineage when the
    query contains CREATE/INSERT statements, and logs per-table usage for
    cells belonging to a public DataDoc.
    """
    with DBSession() as session:
        execution = qe_logic.get_query_execution_by_id(
            query_execution_id, session=session)
        assert execution.status == QueryExecutionStatus.DONE

        metastore_id = execution.engine.metastore_id
        if metastore_id is None:
            # The query engine has no metastore configured; nothing to sync.
            return

        statement_types = get_table_statement_type(execution.query)
        table_per_statement, _ = process_query(execution.query,
                                               execution.engine.language)

        sync_table_to_metastore(table_per_statement,
                                statement_types,
                                metastore_id,
                                session=session)

        cell = next(iter(execution.cells), None)

        # Only write-type statements can create lineage edges.
        if any(s in statement_types for s in ("CREATE", "INSERT")):
            create_lineage_from_query(execution,
                                      metastore_id,
                                      cell,
                                      session=session)

        # Per-table usage is only logged for cells in public docs.
        if cell is None or not cell.doc.public:
            return

        log_table_per_statement(
            table_per_statement,
            statement_types,
            query_execution_id,
            metastore_id,
            cell.id,
            session=session,
        )
Example #8
0
 def test_empty_statement(self):
     # Whitespace-only input yields no statements at all; bare semicolons
     # still produce one (empty) entry per delimited statement.
     for blank in ("", "\t\n\t\n"):
         self.assertEqual(process_query(blank), ([], []))
     self.assertEqual(process_query("\n\n;\n\n;\n\n"), ([[], []], [[], []]))
Example #9
0
def _get_table_names_from_query(query, language=None) -> List[str]:
    """Return the flat list of table names referenced by ``query``."""
    tables_per_statement, _ = lineage_lib.process_query(query,
                                                        language=language)
    return [name
            for statement_tables in tables_per_statement
            for name in statement_tables]