def test_redshift():
    # Redshift's ALTER TABLE ... APPEND should report both tables involved.
    cases = [
        ("ALTER TABLE target_table APPEND FROM source_table", ["target_table", "source_table"]),
        ("ALTER TABLE x APPEND FROM y", ["x", "y"]),
    ]
    for query, expected_tables in cases:
        assert get_query_tables(query) == expected_tables
def test_insert_overwrite_table():
    # INSERT [OVERWRITE] TABLE surfaces the target table first.
    assert get_query_tables("INSERT TABLE foo_report") == ["foo_report"]
    assert get_query_tables("INSERT OVERWRITE TABLE foo_report") == ["foo_report"]

    query = "INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar"
    assert get_query_tables(query) == ["foo_report", "bar"]
    assert get_query_columns(query) == ["foo"]
def test_select_aliases():
    # Table aliases (with or without AS, plain or sub-query) resolve to the real table.
    for query in (
        'SELECT e.foo FROM bar AS e',
        'SELECT e.foo FROM bar e',
        'SELECT e.foo FROM (SELECT * FROM bar) AS e',
        'SELECT e.foo FROM (SELECT * FROM bar) e',
    ):
        assert get_query_tables(query) == ['bar']
def test_insert_overwrite_table():
    # NOTE(review): duplicate definition of test_insert_overwrite_table in this
    # file — when both live in one module the later shadows the earlier; consider merging.
    assert get_query_tables('INSERT TABLE foo_report') == ['foo_report']
    assert get_query_tables('INSERT OVERWRITE TABLE foo_report') == ['foo_report']

    query = 'INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar'
    assert get_query_tables(query) == ['foo_report', 'bar']
    assert get_query_columns(query) == ['foo']
def test_select_aliases():
    # NOTE(review): duplicate definition of test_select_aliases in this file.
    expected = ["bar"]
    assert get_query_tables("SELECT e.foo FROM bar AS e") == expected
    assert get_query_tables("SELECT e.foo FROM bar e") == expected
    assert get_query_tables("SELECT e.foo FROM (SELECT * FROM bar) AS e") == expected
    assert get_query_tables("SELECT e.foo FROM (SELECT * FROM bar) e") == expected
def test_unions():
    # @see https://github.com/macbre/sql-metadata/issues/79
    query = "select col1, col2, col3 from tab1 union all select col4, col5, col6 from tab2"
    assert get_query_tables(query) == ["tab1", "tab2"]

    # @see https://github.com/macbre/sql-metadata/issues/94
    query = "SELECT a,b,c FROM d UNION ALL SELECT e,f FROM g"
    assert get_query_tables(query) == ["d", "g"]
def test_joins():
    # NOTE(review): a later, extended duplicate of test_joins exists in this file.
    inner_join = (
        "SELECT page_title FROM `redirect` INNER JOIN `page` "
        "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))"
    )
    aliased_join = (
        "SELECT page_title FROM `redirect` INNER JOIN `page` `foo` "
        "ON (rd_title = 'foo' AND rd_namespace = '100' AND (foo.page_id = rd_from))"
    )

    assert get_query_tables(inner_join) == ['redirect', 'page']
    assert get_query_tables(aliased_join) == ['redirect', 'page']
    assert get_query_columns(inner_join) == \
        ['page_title', 'rd_title', 'rd_namespace', 'page_id', 'rd_from']
def test_insert_into_select():
    # https://dev.mysql.com/doc/refman/5.7/en/insert-select.html
    # Every case targets the same pair of tables; only the column set varies.
    cases = [
        ("INSERT INTO foo SELECT * FROM bar", ["*"]),
        ("INSERT INTO foo SELECT id, price FROM bar", ["id", "price"]),
        ("INSERT INTO foo SELECT id, price FROM bar WHERE qty > 200", ["id", "price", "qty"]),
    ]
    for query, expected_columns in cases:
        assert get_query_tables(query) == ["foo", "bar"]
        assert get_query_columns(query) == expected_columns
def test_datasets():
    # see https://github.com/macbre/sql-metadata/issues/38
    select_clause = "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM "
    cases = [
        ("TABLE1 AS A, TABLE2 AS B", ["TABLE1", "TABLE2"]),
        ("DATASET1.TABLE1, DATASET2.TABLE2", ["DATASET1.TABLE1", "DATASET2.TABLE2"]),
        ("DATASET1.TABLE1 AS A, DATASET2.TABLE2 AS B", ["DATASET1.TABLE1", "DATASET2.TABLE2"]),
    ]
    for from_clause, expected_tables in cases:
        assert get_query_tables(select_clause + from_clause) == expected_tables
def test_insert_into_select():
    # https://dev.mysql.com/doc/refman/5.7/en/insert-select.html
    # NOTE(review): duplicate definition of test_insert_into_select in this file.
    expected_tables = ['foo', 'bar']

    query = "INSERT INTO foo SELECT * FROM bar"
    assert get_query_tables(query) == expected_tables
    assert get_query_columns(query) == ['*']

    query = "INSERT INTO foo SELECT id, price FROM bar"
    assert get_query_tables(query) == expected_tables
    assert get_query_columns(query) == ['id', 'price']

    query = "INSERT INTO foo SELECT id, price FROM bar WHERE qty > 200"
    assert get_query_tables(query) == expected_tables
    assert get_query_columns(query) == ['id', 'price', 'qty']
def test_table_name_with_group_by():
    # GROUP BY / HAVING clauses must not leak extra "tables" into the result.
    expected_tables = ['SH.sales']

    assert get_query_tables(
        "SELECT s.cust_id,count(s.cust_id) FROM SH.sales s") == expected_tables
    assert get_query_tables(
        "SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id") == expected_tables

    query = """
SELECT s.cust_id,count(s.cust_id) FROM SH.sales s
GROUP BY s.cust_id
HAVING s.cust_id != '1660' AND s.cust_id != '2'
""".strip()
    assert get_query_tables(query) == expected_tables
def parse_sql(params):
    """
    Build a mapping of SQL file name -> tables the query depends on.

    For every SQL file returned by get_files_in_path(), the referenced
    tables are extracted and recorded (with any "schema." prefix stripped)
    as that file's dependencies.

    :param params: path specification understood by get_files_in_path()
    :return: dict mapping source name (file name without ".sql") to a dict
        whose keys are the tables it depends on (values are always None)
    """
    files = get_files_in_path(params)
    result = {}
    for filepath, filename in files:
        with open(filepath, 'r') as f:
            sql_data = f.read()
        get_tables = sql_metadata.get_query_tables(sql_data)

        # strip only a trailing ".sql" — str.replace('.sql', '') would also
        # mangle a ".sql" occurring in the middle of the file name
        if filename.endswith('.sql'):
            source_table = filename[:-len('.sql')]
        else:
            source_table = filename

        local_depends = {}
        for table in get_tables:
            # drop the "schema." prefix, keep just the table name
            if '.' in table:
                table_name = table.split('.')[1]
            else:
                table_name = table
            local_depends[table_name] = None
        result[source_table] = local_depends
    return result
def test_complex_hive_query():
    # https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries
    # INSERT OVERWRITE fed from a sub-query JOINed against a dimension table.
    dag = """
INSERT OVERWRITE TABLE foo_report
SELECT
  d.domain, r.wiki_id, r.beacon, r.pageviews
FROM
  (SELECT wiki_id, beacon, sum(pageviews) AS pageviews
   FROM rollup_wiki_beacon_pageviews
   WHERE period_id = '1' AND
     ( year > '{{ beginYear28 }}' OR
       (year = '{{ beginYear28 }}' AND month > '{{ beginMonth28 }}') OR
       (year = '{{ beginYear28 }}' AND month = '{{ beginMonth28 }}' AND day > '{{ beginDay28 }}')
     ) AND (
       year < '{{ beginYear }}' OR
       (year = '{{ beginYear }}' AND month < '{{ beginMonth }}') OR
       (year = '{{ beginYear }}' AND month = '{{ beginMonth }}' AND day <= '{{ beginDay }}')
     )
   GROUP BY wiki_id, beacon) r
JOIN statsdb.dimension_wikis d ON r.wiki_id = d.wiki_id;
"""
    expected_tables = [
        'foo_report', 'rollup_wiki_beacon_pageviews', 'statsdb.dimension_wikis'
    ]
    assert get_query_tables(dag) == expected_tables
def test_with_brackets():
    # Parenthesized FROM clause with a JOIN inside.
    query = """
SELECT "xxxxx"
FROM (database1.table1 alias
      LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx"))
"""
    assert get_query_tables(query) == ["database1.table1", "database2.table2"]

    # Sub-query in FROM — only the inner table should be reported.
    query = """
SELECT t.foo
FROM (SELECT foo FROM inner_table WHERE bar = '1') t
"""
    assert get_query_tables(query) == ["inner_table"]
def check_high_offset_selects(_, queries):
    """
    Report SELECT queries that paginate with an OFFSET above the threshold.

    :type _ indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    # only SELECT queries can paginate — ignore inserts etc. (#140)
    for query in filter(is_select_query, queries):
        limit_and_offset = get_query_limit_and_offset(query)
        if limit_and_offset is None:
            continue

        limit, offset = limit_and_offset
        if offset < OFFSET_THRESHOLD:
            continue

        context = OrderedDict([
            ('query', query),
            ('limit', limit),
            ('offset', offset),
        ])
        yield LinterEntry(
            linter_type='high_offset_selects',
            table_name=get_query_tables(query)[0],
            message='"{}" query uses too high offset impacting the performance'.
            format(shorten_query(query)),
            context=context)
def get_portability_metrics_query(logline, job_name):
    """
    Yield (script name, query type, table) tuples parsed from a metric log line.

    :type logline str
    :type job_name str
    :rtype: list[str, str, str]
    """
    # extract SQL query from e.g. "... INFO SQL: SELECT ..."
    matched = re.search(r'INFO (.*)$', logline)
    if matched is None:
        return
    sql = matched.group(1).strip()

    # remove the leading "SQL: " prefix only — str.replace('SQL:', '') would
    # also strip the marker if it appeared inside the query text itself
    if sql.startswith('SQL:'):
        sql = sql[len('SQL:'):]
    sql = sql.lstrip()

    # script name, e.g.
    # portability-metric-metric-article-provider-py-1548232800
    matched = re.match(r'portability-metric-([a-z-]+)-\d+', job_name)
    if matched is None:
        # job name does not follow the expected pattern — the original code
        # raised an unguarded AttributeError here; bail out quietly instead,
        # mirroring the logline handling above
        return
    script_name = matched.group(1).replace('-py', '.py')

    # SELECT, UPDATE, ...
    query_type = sql.split(' ')[0].upper()

    # get query metadata
    for table in get_query_tables(sql):
        # ignore one letter table aliases
        if len(table) == 1:
            continue
        yield script_name, query_type, table
def test_case_syntax():
    # https://dev.mysql.com/doc/refman/8.0/en/case.html
    query = 'select case when p > 0 then 1 else 0 end as cs from c where g > f'
    assert get_query_columns(query) == ['p', 'g', 'f']
    assert get_query_tables(query) == ['c']
def test_case_syntax():
    # https://dev.mysql.com/doc/refman/8.0/en/case.html
    # NOTE(review): duplicate definition of test_case_syntax in this file.
    query = "select case when p > 0 then 1 else 0 end as cs from c where g > f"
    assert get_query_columns(query) == ["p", "g", "f"]
    assert get_query_tables(query) == ["c"]
def test_joins():
    # NOTE(review): this extends an earlier, duplicate test_joins in the file.
    inner_join = (
        "SELECT page_title FROM `redirect` INNER JOIN `page` "
        "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))"
    )
    aliased_join = (
        "SELECT page_title FROM `redirect` INNER JOIN `page` `foo` "
        "ON (rd_title = 'foo' AND rd_namespace = '100' AND (foo.page_id = rd_from))"
    )

    assert get_query_tables(inner_join) == ['redirect', 'page']
    assert get_query_tables(aliased_join) == ['redirect', 'page']
    assert get_query_columns(inner_join) == \
        ['page_title', 'rd_title', 'rd_namespace', 'page_id', 'rd_from']

    # see #34 — every JOIN flavour should yield both tables
    join_types = (
        'JOIN',
        'FULL JOIN',
        'FULL OUTER JOIN',
        'RIGHT OUTER JOIN',
        'LEFT OUTER JOIN',
    )
    for join_type in join_types:
        query = "SELECT foo FROM `foos` {} `bars` ON (foos.id = bars.id)".format(join_type)
        assert get_query_tables(query) == ['foos', 'bars']
def test_case_insensitive():
    # case-insensitive handling
    # https://github.com/macbre/sql-metadata/issues/71
    # The SELECT keyword's case must not change the parsed tables or columns.
    for select_keyword in ("SELECT", "select"):
        query = "create table abc.foo as {} pqr.foo1 , ab.foo2 FROM foo pqr, bar ab".format(
            select_keyword)
        assert get_query_tables(query) == ["abc.foo", "foo", "bar"]
        assert get_query_columns(query) == ["pqr.foo1", "ab.foo2"]
def test_handle_force_index():
    # FORCE INDEX (...) hints must not be mistaken for tables or columns.
    query = (
        "SELECT page_title,page_namespace FROM `page` FORCE INDEX (page_random) "
        "JOIN `categorylinks` ON ((page_id=cl_from)) WHERE page_is_redirect = '0' "
        "AND (page_random >= 0.197372293871) AND cl_to = 'Muppet_Characters' "
        "ORDER BY page_random LIMIT 1"
    )

    assert get_query_tables(query) == ['page', 'categorylinks']

    expected_columns = [
        'page_title', 'page_namespace', 'page_id', 'cl_from',
        'page_is_redirect', 'page_random', 'cl_to',
    ]
    assert get_query_columns(query) == expected_columns
def test_tables_aliases_are_resolved():
    """
    See https://github.com/macbre/sql-metadata/issues/52
    """
    sql = "SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address"

    assert ['users1', 'users2'] == get_query_tables(sql)
    assert {'a': 'users1', 'b': 'users2'} == get_query_table_aliases(sql)

    expected_columns = ['users1.*', 'users1.ip_address', 'users2.ip_address']
    assert expected_columns == get_query_columns(sql), 'Should resolve table aliases'
def check_not_used_columns(database, queries):
    """
    Report table columns that none of the provided SELECT queries referenced.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    :raises Exception
    """
    logger = logging.getLogger(__name__)

    # analyze only SELECT queries from the log
    queries = list(filter(is_select_query, queries))
    used_tables = get_used_tables_from_queries(queries)
    used_columns = defaultdict(list)
    logger.info("Will check these tables: %s", used_tables)

    # analyze given queries and collect used columns for each table
    for query in queries:
        tables = get_query_tables(query)
        if tables:
            columns = get_query_columns(query)
            # print(query, table, columns)
            # add used columns per table
            # FIXME: assume we're querying just a single table for now
            used_columns[tables[0]] += columns
        else:
            logger.error(
                'Unable to extract tables and columns used from the query: %s',
                query)

    # analyze table schemas and report not used columns for each table
    for table in used_tables:
        logger.info("Checking %s table", table)
        table_columns = database.get_table_columns(table)
        # now get the difference and report them
        # NOTE(review): falls back to [] when the schema lookup returns a
        # falsy value (e.g. table missing) — presumably intentional; verify.
        not_used_columns = [
            column for column in table_columns
            if column.name not in set(used_columns[table])
        ] if table_columns else []
        for column in not_used_columns:
            # one linter entry per unused column
            yield LinterEntry(
                linter_type='not_used_columns',
                table_name=table,
                message='"{}" column was not used by provided queries'.format(
                    column),
                context={
                    'column_name': column.name,
                    'column_type': column.type
                })
def test_with_with():
    # Skipped until the WITH clause is handled by the new parser.
    pytest.skip("Improve WITH syntax handling with a new parser (#98)")

    query = """
WITH database1.tableFromWith AS SELECT * FROM table3
SELECT "xxxxx"
FROM database1.tableFromWith alias
LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx")
"""
    assert get_query_tables(query) == ["table3", "database2.table2"]
def test_queries_with_distinct():
    # DISTINCT / UNIQUE keywords must not be reported as columns.
    for keyword in ("DISTINCT", "UNIQUE"):
        query = "SELECT {} DATA.ASSAY_ID FROM foo".format(keyword)
        assert get_query_columns(query) == ["DATA.ASSAY_ID"]

    assert get_query_tables("SELECT DISTINCT DATA.ASSAY_ID FROM foo") == ["foo"]
def test_tables_aliases_are_resolved():
    """
    See https://github.com/macbre/sql-metadata/issues/52
    """
    # NOTE(review): duplicate definition of test_tables_aliases_are_resolved.
    sql = "SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address"

    tables = get_query_tables(sql)
    assert tables == ["users1", "users2"]

    aliases = get_query_table_aliases(sql)
    assert aliases == {"a": "users1", "b": "users2"}

    columns = get_query_columns(sql)
    assert columns == [
        "users1.*",
        "users1.ip_address",
        "users2.ip_address",
    ], "Should resolve table aliases"
def get_columns(sql_query: str) -> list:
    """
    Return the columns selected by *sql_query*.

    A bare ``SELECT *`` is expanded to the full column list of the queried
    table (store table vs. user table).

    :param sql_query: SQL text, or None
    :return: list of column names (empty when sql_query is None)
    """
    columns: list = []
    if sql_query is not None:
        tables: list = sql_metadata.get_query_tables(sql_query)
        columns = sql_metadata.get_query_columns(sql_query)
        # Expand only a genuine "SELECT *": the old check on len(columns) == 1
        # alone also clobbered queries selecting a single named column, and
        # tables[0] raised IndexError when no table could be parsed.
        if len(columns) == 1 and columns[0] == '*' and tables:
            if tables[0] == 'public.store':
                columns = TABLE_STORE_COLUMNS
            else:
                columns = TABLE_USER_COLUMNS
    return columns
def test_sql_server_cte():
    """
    Tests support for SQL Server's common table expression (CTE).

    @see https://www.sqlservertutorial.net/sql-server-basics/sql-server-cte/
    """
    # CTE consumed by a SELECT (upper- and lower-case keyword)
    assert get_query_tables("""
WITH x AS (
    SELECT * FROM n
)
SELECT * FROM x JOIN y ON x.a = y.a
""".strip()) == ['n', 'x', 'y']

    assert get_query_tables("""
WITH x AS (
    SELECT * FROM n
)
select * FROM x JOIN y ON x.a = y.a
""".strip()) == ['n', 'x', 'y']

    # CTE consumed by an UPDATE
    assert get_query_tables("""
WITH foo AS (
    SELECT * FROM n
)
update z from foo set z.q = fpp.y
""".strip()) == ['n', 'z', 'foo']

    # CTE consumed by a DELETE
    assert get_query_tables("""
WITH foo AS (
    SELECT * FROM tab
)
DELETE FROM z JOIN foo ON z.a = foo.a
""".strip()) == ['tab', 'z', 'foo']
def get_used_tables_from_queries(queries):
    """
    Collect the distinct primary tables referenced by the given queries.

    Only SELECT queries are considered, and only the first table of each
    query is recorded; insertion order is preserved.

    :type queries list[str]
    :rtype: list[str]
    """
    used_tables = []
    # parse each query from the log
    for query in filter(is_select_query, queries):
        tables = get_query_tables(query)
        if not tables:
            continue
        first_table = tables[0]
        if first_table not in used_tables:
            used_tables.append(first_table)
    return used_tables
def _get_sql_table_names(cls, sql: str) -> List[str]:
    """
    Extract the table names referenced by *sql*, excluding CTE aliases
    introduced by WITH clauses, with double quotes stripped.
    """
    candidates: List[str] = get_query_tables(sql)

    def _is_cte_alias(table: str) -> bool:
        # a name introduced by a WITH <name> [(cols)] AS ( ... ) clause
        pattern = fr"WITH(.*,)?\s+{table}(\s*\([\w\s,]+\))?\s+AS\s+\("
        return re.search(pattern, sql, re.IGNORECASE | re.DOTALL) is not None

    # drop temporary tables from WITH statements, then remove quoting
    return [t.replace('"', "") for t in candidates if not _is_cte_alias(t)]