def _assert_safe_query(query, engine_id, session=None):
    """Validate that every table referenced by ``query`` passes the metastore
    ACL configured for the query engine ``engine_id``.

    Raises:
        InvalidQueryExecution: if a referenced table is blocked by the ACL.

    Any other failure (query cannot be parsed, engine/metastore lookup fails)
    is logged and ignored so ACL checking stays best-effort and never blocks
    execution by accident.
    """
    try:
        from lib.metastore.utils import MetastoreTableACLChecker

        table_per_statement, _ = process_query(query)
        all_tables = [
            table for tables in table_per_statement for table in tables
        ]
        query_engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

        # Engines without a metastore have no ACL to enforce.
        if query_engine.metastore_id is None:
            LOG.debug("No metastore for query engine, skipping")
            return

        metastore = admin_logic.get_query_metastore_by_id(
            query_engine.metastore_id, session=session
        )
        acl_checker = MetastoreTableACLChecker(metastore.acl_control)

        for table in all_tables:
            # Split only on the first dot: a table name containing extra dots
            # previously raised ValueError here, which the broad handler below
            # swallowed — silently skipping the ACL check for that table.
            schema_name, table_name = table.split(".", 1)
            if not acl_checker.is_table_valid(schema_name, table_name):
                raise InvalidQueryExecution(
                    f"Table {table} is not allowed by metastore")
    except InvalidQueryExecution:
        # Propagate the ACL violation with its original traceback intact.
        raise
    except Exception as e:
        # Best-effort: parsing/lookup problems must not block the query.
        LOG.info(e)
def query_cell_to_es(query_cell, session=None):
    """Build the Elasticsearch document dict for a data-doc query cell."""
    meta = query_cell.meta
    engine_id = meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    query = query_cell.context
    # Table names come back grouped per statement; flatten to one list.
    tables_per_statement, _ = process_query(
        query, language=(engine and engine.language)
    )
    full_table_names = list(chain.from_iterable(tables_per_statement))

    doc = query_cell.doc
    # Doc-derived fields are None when the cell is detached from a doc.
    return {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": meta.get("title", "Untitled"),
        "data_doc_id": doc and doc.id,
        "environment_id": doc and doc.environment_id,
        "author_uid": doc and doc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": full_table_names,
        "query_text": query,
    }
def query_execution_to_es(query_execution, data_cell=None, session=None):
    """Build the Elasticsearch document dict for a query execution.

    data_cell is added as a parameter so that bulk insert of query
    executions won't require re-retrieval of data_cell.
    """
    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    query_text = query_execution.query
    tables_per_statement, _ = process_query(
        query_text, language=(engine and engine.language)
    )
    flattened_tables = list(chain.from_iterable(tables_per_statement))

    # Duration is only defined once the execution has completed.
    duration = None
    if query_execution.completed_at is not None:
        duration = DATETIME_TO_UTC(query_execution.completed_at) - DATETIME_TO_UTC(
            query_execution.created_at
        )

    environment_ids = [env.id for env in engine.environments]
    title = data_cell.meta.get("title", "Untitled") if data_cell else None

    return {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": title,
        "environment_id": environment_ids,
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_text),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": flattened_tables,
        "query_text": query_text,
    }
def create_table_lineage_from_metadata(job_metadata_id, query_language=None, session=None):
    """Create table-lineage records from a data job's query text.

    Parses the query stored on the DataJobMetadata row, and for every
    source → target pair whose tables both resolve in the job's metastore,
    records a lineage edge.

    Returns:
        A list of created lineage record ids, or None if the job metadata
        row does not exist (preserves the original early-return contract).
    """
    job_metadata = session.query(DataJobMetadata).get(job_metadata_id)
    if job_metadata is None:
        return

    _, lineage_per_statement = process_query(job_metadata.query_text, query_language)

    lineage_ids = []
    # NOTE: the original guarded with `if len(statement_lineage)` — redundant,
    # since iterating an empty list is a no-op. Flattened with guard clauses.
    for statement_lineage in lineage_per_statement:
        for lineage in statement_lineage:
            if "source" not in lineage:
                continue

            # Index access (not tuple unpacking) preserves the original
            # behavior for names with more than one dot: only the first two
            # components are used.
            source_parts = lineage["source"].split(".")
            parent_table = get_table_by_name(
                source_parts[0],
                source_parts[1],
                job_metadata.metastore_id,
                session=session,
            )
            target_parts = lineage["target"].split(".")
            child_table = get_table_by_name(
                target_parts[0],
                target_parts[1],
                job_metadata.metastore_id,
                session=session,
            )

            # Skip pairs where either side is unknown to the metastore.
            if parent_table and child_table:
                lineage_ids.append(
                    add_table_lineage(
                        child_table.id,
                        parent_table.id,
                        job_metadata_id,
                        session=session,
                    ).id
                )
    return lineage_ids
def test_select_statement(self):
    """A multi-join SELECT yields one statement's tables and no lineage."""
    query = """
        SELECT w9.Country, w9.Rank AS [2019], w8.Rank AS [2018], w7.HappinessRank AS [2017], w6.HappinessRank AS [2016], w5.HappinessRank AS [2015] FROM main.world_happiness_2019 w9 INNER JOIN main.world_happiness_2018 w8 ON w9.Country = w8.Country INNER JOIN main.world_happiness_2017 w7 ON w9.Country = w7.Country INNER JOIN main.world_happiness_2016 w6 ON w9.Country = w6.Country INNER JOIN main.world_happiness_2015 w5 ON w9.Country = w5.Country AND (w5.Region = "{{Region}}"); -- Region is a template variable with the value of 'Western Europe' -- click on the <> button on the bottom right of the DataDoc to configure more!
    """
    processed_query = process_query(query)

    # process_query returns (tables_per_statement, lineage_per_statement).
    self.assertIsInstance(processed_query, tuple)
    self.assertEqual(len(processed_query), 2)

    # One statement, referencing all five joined tables (order-insensitive).
    self.assertEqual(len(processed_query[0]), 1)
    self.assertSetEqual(
        set(processed_query[0][0]),
        {
            "main.world_happiness_2019",
            "main.world_happiness_2018",
            "main.world_happiness_2017",
            "main.world_happiness_2016",
            "main.world_happiness_2015",
        },
    )
    # A pure SELECT produces no lineage for its single statement.
    self.assertEqual(processed_query[1], [[]])
def test_create_insert_query(self):
    """testing if process_query returns correct lineage information
    for multi-line query that has insert and create
    """
    query = """
        USE analytics; CREATE EXTERNAL TABLE IF NOT EXISTS example_1 (id INT, count BIGINT) PARTITIONED BY (dt STRING) LOCATION 's3n://fakebucket/fake/path/'; INSERT OVERWRITE TABLE example_1 PARTITION (dt = '%(end_date)s') SELECT DISTINCT id, count from default.example_2 where dt <= '2019-01-01' and dt >= date_sub('2018-01-01', 28) and to_date(created_at) >= start_dt
    """
    # Statement 1 (USE) touches no tables; statement 2 creates the table
    # (schema resolved via USE analytics); statement 3 reads example_2 and
    # writes example_1, producing the single lineage edge.
    expected_table_per_statement = [
        [],
        ["analytics.example_1"],
        ["analytics.example_1", "default.example_2"],
    ]
    expected_lineage_per_statement = [
        [],
        [],
        [{
            "source": "default.example_2",
            "target": "analytics.example_1"
        }],
    ]

    result = process_query(query)
    self.assertIsInstance(result, tuple)
    tables_by_statement, lineage_by_statement = result

    # Table sets are compared order-insensitively per statement.
    for actual, expected in zip(tables_by_statement, expected_table_per_statement):
        self.assertCountEqual(actual, expected)
    self.assertListEqual(lineage_by_statement, expected_lineage_per_statement)
def log_query_per_table_task(self, query_execution_id):
    """Post-process a finished query execution: sync referenced tables to the
    metastore, record lineage for CREATE/INSERT statements, and log per-table
    usage for cells in public docs.
    """
    with DBSession() as session:
        execution = qe_logic.get_query_execution_by_id(
            query_execution_id, session=session
        )
        # NOTE(review): assert is stripped under `python -O`; kept as-is
        # since callers may depend on AssertionError here.
        assert execution.status == QueryExecutionStatus.DONE

        metastore_id = execution.engine.metastore_id
        if metastore_id is None:
            # This query engine has no metastore configured
            return

        statement_types = get_table_statement_type(execution.query)
        table_per_statement, _ = process_query(
            execution.query, execution.engine.language
        )
        sync_table_to_metastore(
            table_per_statement, statement_types, metastore_id, session=session
        )

        cell = next(iter(execution.cells), None)
        # Lineage only makes sense for statements that write tables.
        if any(stmt in statement_types for stmt in ["CREATE", "INSERT"]):
            create_lineage_from_query(execution, metastore_id, cell, session=session)

        # Per-table usage logging is restricted to cells in public docs.
        if cell is None or not cell.doc.public:
            return
        log_table_per_statement(
            table_per_statement,
            statement_types,
            query_execution_id,
            metastore_id,
            cell.id,
            session=session,
        )
def test_empty_statement(self):
    """Whitespace-only input yields no statements; bare semicolons still
    delimit (empty) statements."""
    for blank_query in ("", "\t\n\t\n"):
        self.assertEqual(process_query(blank_query), ([], []))
    self.assertEqual(process_query("\n\n;\n\n;\n\n"), ([[], []], [[], []]))
def _get_table_names_from_query(query, language=None) -> List[str]:
    """Return all table names referenced by *query*, flattened across
    statements."""
    tables_per_statement, _ = lineage_lib.process_query(query, language=language)
    return [name for tables in tables_per_statement for name in tables]