def query_cell_to_es(query_cell, session=None):
    """Serialize a query cell into a flat dict for Elasticsearch indexing.

    The cell's engine is looked up so table names can be parsed with the
    engine's query language; the owning datadoc (if any) supplies the
    doc/environment/author fields.
    """
    meta = query_cell.meta
    query = query_cell.context
    doc = query_cell.doc

    engine_id = meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    # process_query returns table names grouped per statement; flatten them.
    tables_per_statement, _ = process_query(
        query, language=(engine and engine.language)
    )
    flat_table_names = list(chain.from_iterable(tables_per_statement))

    return {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": meta.get("title", "Untitled"),
        "data_doc_id": doc and doc.id,
        "environment_id": doc and doc.environment_id,
        "author_uid": doc and doc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": flat_table_names,
        "query_text": query,
    }
def query_execution_to_es(query_execution, data_cell=None, session=None):
    """Serialize a query execution into a flat dict for Elasticsearch indexing.

    data_cell is accepted as a parameter so that bulk inserts of query
    executions do not need to re-fetch each execution's data cell.
    """
    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    # process_query returns table names grouped per statement; flatten them.
    tables_per_statement, _ = process_query(
        query_execution.query, language=(engine and engine.language)
    )

    # Duration is only defined once the execution has completed.
    if query_execution.completed_at is not None:
        duration = DATETIME_TO_UTC(query_execution.completed_at) - DATETIME_TO_UTC(
            query_execution.created_at
        )
    else:
        duration = None

    # NOTE(review): engine is assumed non-None here (unlike the guarded
    # `engine and engine.language` above) — verify executions always have
    # a resolvable engine.
    environment_ids = [env.id for env in engine.environments]

    return {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": data_cell.meta.get("title", "Untitled") if data_cell else None,
        "environment_id": environment_ids,
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_execution.query),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": list(chain.from_iterable(tables_per_statement)),
        "query_text": query_execution.query,
    }
def test_special_cases(self):
    """SET and EXPLAIN statements classify as None; a CTE feeding an
    INSERT classifies as INSERT."""
    raw_query = """
    set hive.memory = 110G;
    with test as ( select * from wasd ) insert into b select * from test;
    explain select * from test;
    """
    expected = [None, "INSERT", None]
    self.assertSequenceEqual(get_table_statement_type(raw_query), expected)
def query_execution_to_es(query_execution, data_cell=None, fields=None, session=None):
    """Build the Elasticsearch document for a query execution.

    data_cell is accepted as a parameter so that bulk inserts of query
    executions do not need to re-fetch each execution's data cell.
    Expensive fields are wrapped in callables so _get_dict_by_field only
    resolves the ones actually requested via ``fields``.
    """
    engine_id = query_execution.engine_id
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    datadoc = data_cell.doc if data_cell else None

    def _duration():
        # Only defined once the execution has completed.
        if query_execution.completed_at is None:
            return None
        return DATETIME_TO_UTC(query_execution.completed_at) - DATETIME_TO_UTC(
            query_execution.created_at
        )

    field_to_getter = {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": data_cell.meta.get("title", "Untitled") if data_cell else None,
        "environment_id": [env.id for env in engine.environments],
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query_execution.query),
        "created_at": lambda: DATETIME_TO_UTC(query_execution.created_at),
        "duration": _duration,
        "full_table_name": lambda: _get_table_names_from_query(
            query_execution.query, language=(engine and engine.language)
        ),
        "query_text": query_execution.query,
        # Executions not attached to a datadoc are treated as public.
        "public": datadoc is None or datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }
    return _get_dict_by_field(field_to_getter, fields=fields)
def test_simple_statements(self):
    """Each `;`-separated statement maps to its table-operation type;
    non-table statements (create database, use, drop database) map to None."""
    raw_query = """
    select * from test;
    create database test;
    use test;
    create table test.test ( id int(11) not null, PRIMARY KEY (id) );
    alter table test add column username varchart(255);
    update table test.test set username = '******';
    drop table test;
    drop database test;
    """
    expected = ["SELECT", None, None, "CREATE", "ALTER", "UPDATE", "DROP", None]
    self.assertSequenceEqual(get_table_statement_type(raw_query), expected)
def query_cell_to_es(query_cell, fields=None, session=None):
    """Build the Elasticsearch document for a query cell.

    Expensive fields are wrapped in callables so _get_dict_by_field only
    resolves the ones actually requested via ``fields``.
    """
    meta = query_cell.meta
    query = query_cell.context
    datadoc = query_cell.doc

    engine_id = meta.get("engine")
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    field_to_getter = {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query),
        "created_at": lambda: DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": lambda: _get_table_names_from_query(
            query, language=(engine and engine.language)
        ),
        "query_text": query,
        # A cell with no datadoc is NOT public (contrast with executions).
        "public": datadoc is not None and datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }
    return _get_dict_by_field(field_to_getter, fields=fields)
def log_query_per_table_task(self, query_execution_id):
    """Post-execution bookkeeping for a finished query execution:
    sync referenced tables to the metastore, record lineage for
    CREATE/INSERT statements, and log per-table usage for public docs.
    """
    with DBSession() as session:
        execution = qe_logic.get_query_execution_by_id(
            query_execution_id, session=session
        )
        # NOTE(review): assert is stripped under -O; kept as-is to preserve
        # the AssertionError contract — confirm callers rely on it.
        assert execution.status == QueryExecutionStatus.DONE

        metastore_id = execution.engine.metastore_id
        if metastore_id is None:
            # This query engine has no metastore configured.
            return

        statement_types = get_table_statement_type(execution.query)
        table_per_statement, _ = process_query(
            execution.query, execution.engine.language
        )

        sync_table_to_metastore(
            table_per_statement, statement_types, metastore_id, session=session
        )

        datadoc_cell = next(iter(execution.cells), None)
        if "CREATE" in statement_types or "INSERT" in statement_types:
            create_lineage_from_query(
                execution, metastore_id, datadoc_cell, session=session
            )

        # Usage logging only applies to cells that belong to a public doc.
        if datadoc_cell is None or not datadoc_cell.doc.public:
            return

        log_table_per_statement(
            table_per_statement,
            statement_types,
            query_execution_id,
            metastore_id,
            datadoc_cell.id,
            session=session,
        )