def test_sanity_ctas(target, sources, sql):
    """Check the target and source tables a visitor resolves from ``sql``.

    The statement is parsed, its node is walked by a ``SelectSourceVisitor``
    and the visitor is resolved; the resulting ``target_table`` and
    ``source_tables`` must equal ``target`` and ``sources``.
    """
    statement = parse(sql)
    ctas_visitor = SelectSourceVisitor("test_sanity_ctas")
    statement.node.accept(ctas_visitor)
    ctas_visitor.resolve()

    assert ctas_visitor.target_table == target
    assert ctas_visitor.source_tables == sources
def post(self):
    """Parse a SQL query and report the tables and columns it selects.

    Reads ``query`` and ``source_id`` from the request arguments, parses the
    query under the name ``"parse_api"`` and binds it against the catalog
    source looked up by ``source_id``.

    Returns:
        A ``(body, 200)`` tuple where ``body`` maps ``"select_tables"`` to
        the names of the bound tables and ``"select_columns"`` to the aliases
        of the bound columns.

    Raises:
        ParseErrorHTTP: ``parse`` raised ``ParseError``.
        TableNotFoundHTTP: the lookup/binding step raised ``TableNotFound``.
        ColumnNotFoundHTTP: the lookup/binding step raised ``ColumnNotFound``.
        SemanticErrorHTTP: the lookup/binding step raised ``SemanticError``.
    """
    args = self._parser.parse_args()
    logging.debug("Parse query: {}".format(args["query"]))

    try:
        parsed = parse(args["query"], "parse_api")
    except ParseError as error:
        # Chain explicitly so the original parser error stays attached as
        # the cause of the HTTP-level error.
        raise ParseErrorHTTP(description=str(error)) from error

    try:
        source = self._catalog.get_source_by_id(args["source_id"])
        logging.debug("Parsing query for source {}".format(source))
        binder = parse_dml_query(
            catalog=self._catalog, parsed=parsed, source=source
        )
        return (
            {
                "select_tables": [table.name for table in binder.tables],
                "select_columns": [context.alias for context in binder.columns],
            },
            200,
        )
    except TableNotFound as table_error:
        raise TableNotFoundHTTP(description=str(table_error)) from table_error
    except ColumnNotFound as column_error:
        raise ColumnNotFoundHTTP(description=str(column_error)) from column_error
    except SemanticError as semantic_error:
        raise SemanticErrorHTTP(description=str(semantic_error)) from semantic_error
    finally:
        # Runs on success and on every error path.
        self._catalog.scoped_session.remove()
def test_copy(target, query):
    """Check the target table a ``CopyFromVisitor`` resolves from ``query``.

    The query is parsed, its node is walked by the visitor and the visitor
    is resolved; its ``target_table`` must equal ``target``.
    """
    statement = parse(query)
    copy_visitor = CopyFromVisitor("test_copy")
    statement.node.accept(copy_visitor)
    copy_visitor.resolve()

    assert copy_visitor.target_table == target
def test_sanity_ctas(target, sources, sql):
    """Check the target and source tables a ``CTASVisitor`` resolves.

    The visitor is called on the parsed node; ``resolve()`` is expected to
    return a 3-item result whose first two items must equal ``target`` and
    ``sources``. The third item is not checked.
    """
    statement = parse(sql)
    ctas_visitor = CTASVisitor("test_sanity_ctas")
    ctas_visitor(statement.node)

    resolved_target, resolved_tables, _ = ctas_visitor.resolve()

    assert resolved_target == target
    assert resolved_tables == sources
def test_sanity_select_into(target, sources, sql):
    """Check the target and source tables a ``SelectIntoVisitor`` resolves.

    The visitor is called on the parsed node; ``resolve()`` is expected to
    return a 3-item result whose first two items must equal ``target`` and
    ``sources``. The third item is not checked.
    """
    statement = parse(sql)
    select_into_visitor = SelectIntoVisitor("test_sanity_select_into")
    select_into_visitor(statement.node)

    resolved_target, resolved_tables, _ = select_into_visitor.resolve()

    assert resolved_target == target
    assert resolved_tables == sources
def test_sanity_insert(target, sources, sql):
    """Check the target and source tables a visitor resolves from ``sql``.

    The statement is parsed, its node is walked by a ``SelectSourceVisitor``
    and the visitor is resolved; the resulting ``target_table`` and
    ``source_tables`` must equal ``target`` and ``sources``.
    """
    statement = parse(sql)
    visitor = SelectSourceVisitor("test_sanity_insert")
    statement.node.accept(visitor)
    visitor.resolve()

    assert visitor.target_table == target
    assert visitor.source_tables == sources
def test_sanity_insert(target, sources, sql):
    """Check the target and source tables a ``SelectSourceVisitor`` resolves.

    The visitor is called on the parsed node; ``resolve()`` is expected to
    return a 3-item result whose first two items must equal ``target`` and
    ``sources``. The third item is not checked.
    """
    statement = parse(sql)
    visitor = SelectSourceVisitor("test_sanity_insert")
    visitor(statement.node)

    resolved_target, resolved_tables, _ = visitor.resolve()

    assert resolved_target == target
    assert resolved_tables == sources
def test_no_insert_column_graph(save_catalog):
    """Check the column graph built for an INSERT without a column list.

    Parses the INSERT ... SELECT, binds the visitor to the catalog, builds
    the graph and compares its nodes and edges, plus the ``ColumnLineage``
    rows found through the catalog session, with the expected values.
    """

    def page(column):
        # Fully qualified name of a column in the source table.
        return ("test", "default", "page", column)

    def lookup(column):
        # Fully qualified name of a column in the target table.
        return ("test", "default", "page_lookup_nonredirect", column)

    catalog = save_catalog
    query = """ INSERT INTO page_lookup_nonredirect SELECT page.page_id as redirect_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page """

    parsed = parse(query)
    visitor = SelectSourceVisitor(parsed.name)
    parsed.node.accept(visitor)
    visitor.bind(catalog)

    graph = create_graph(catalog, [visitor])

    node_fqdns = [node.fqdn for node in sorted(nodes(graph.graph))]
    assert node_fqdns == [
        page("page_id"),
        page("page_latest"),
        page("page_title"),
        lookup("redirect_id"),
        lookup("redirect_title"),
        lookup("true_title"),
        lookup("page_id"),
        lookup("page_version"),
    ]

    expected_edges = [
        (page("page_id"), lookup("redirect_id")),
        (page("page_id"), lookup("page_id")),
        (page("page_title"), lookup("redirect_title")),
        (page("page_title"), lookup("true_title")),
        (page("page_latest"), lookup("page_version")),
    ]
    graph_edges = [(edge[0].fqdn, edge[1].fqdn) for edge in edges(graph.graph)]
    assert graph_edges == expected_edges

    session = catalog.scoped_session
    stored = session.query(ColumnLineage).all()
    stored_edges = {(row.source.fqdn, row.target.fqdn) for row in stored}
    assert stored_edges == set(expected_edges)
def test_insert(save_catalog, query):
    """Analyze ``query`` against the ``test`` source and check what it binds.

    Expects five target columns on ``page_lookup``, five source columns and
    a single source table, ``page_lookup_redirect``.
    """
    test_source = save_catalog.get_source("test")
    statement = parse(query)
    visitor = analyze_dml_query(save_catalog, statement, test_source)

    assert visitor is not None
    assert len(visitor.target_columns) == 5
    assert visitor.target_table.fqdn == ("test", "default", "page_lookup")
    assert len(visitor.source_columns) == 5

    source_fqdns = [tbl.fqdn for tbl in visitor.source_tables]
    assert source_fqdns == [("test", "default", "page_lookup_redirect")]
def test_insert_cols():
    """Check what a visitor resolves for an INSERT with an explicit column list.

    No catalog is involved: the expected target and source tables are
    ``(None, <table name>)`` tuples, and two columns are expected on each side.
    """
    query = "INSERT INTO page_lookup_nonredirect(page_id, latest) SELECT page.page_id, page.page_latest FROM page"

    statement = parse(query)
    insert_visitor = SelectSourceVisitor("test_insert_cols")
    statement.node.accept(insert_visitor)
    insert_visitor.resolve()

    assert len(insert_visitor.target_columns) == 2
    assert insert_visitor.target_table == (None, "page_lookup_nonredirect")
    assert len(insert_visitor.source_columns) == 2
    assert insert_visitor.source_tables == [(None, "page")]
def test_insert_cols(save_catalog):
    """Analyze an INSERT with an explicit column list against the catalog.

    Expects two target columns on ``page_lookup_nonredirect``, two source
    columns and a single source table, ``page``.
    """
    test_source = save_catalog.get_source("test")
    query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"

    statement = parse(query)
    visitor = analyze_dml_query(save_catalog, statement, test_source)

    assert visitor is not None
    assert len(visitor.target_columns) == 2
    assert visitor.target_table.fqdn == ("test", "default", "page_lookup_nonredirect")
    assert len(visitor.source_columns) == 2

    source_fqdns = [tbl.fqdn for tbl in visitor.source_tables]
    assert source_fqdns == [("test", "default", "page")]
def test_insert_with_join(save_catalog):
    """Analyze an INSERT whose SELECT joins ``page`` twice with ``redirect``.

    Expects five target columns on ``page_lookup_redirect``, five source
    columns, and the sorted source tables to be ``page`` and ``redirect``.
    """
    test_source = save_catalog.get_source("test")
    query = "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"

    statement = parse(query)
    visitor = analyze_dml_query(save_catalog, statement, test_source)

    assert visitor is not None
    assert len(visitor.target_columns) == 5
    assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
    assert len(visitor.source_columns) == 5

    source_fqdns = sorted(tbl.fqdn for tbl in visitor.source_tables)
    assert source_fqdns == [
        ("test", "default", "page"),
        ("test", "default", "redirect"),
    ]
def post(self):
    """Parse a query, extract its lineage and describe the job execution.

    Reads ``query``, ``name``, ``source_id``, ``start_time`` and ``end_time``
    from the request arguments. The query is parsed under ``name``, analyzed
    against the catalog source looked up by ``source_id``, and passed to
    ``extract_lineage`` together with the two timestamps, which are parsed
    with ``datetime.datetime.fromisoformat``.

    Returns:
        A ``(body, 200)`` tuple. ``body["data"]`` holds the job execution's
        ``id``, the type ``"job_executions"`` and an ``attributes`` mapping
        with ``job_id``, ``started_at`` / ``ended_at`` (formatted as
        ``%Y-%m-%d %H:%M:%S``) and the ``status`` name.

    Raises:
        ParseErrorHTTP: ``parse`` raised ``ParseError``.
        TableNotFoundHTTP: the analysis/lineage step raised ``TableNotFound``.
        ColumnNotFoundHTTP: the analysis/lineage step raised ``ColumnNotFound``.
        SemanticErrorHTTP: the analysis/lineage step raised ``SemanticError``.
    """
    args = self._parser.parse_args()
    logging.debug("Parse query: {}".format(args["query"]))

    try:
        parsed = parse(args["query"], args["name"])
    except ParseError as error:
        # Chain explicitly so the original parser error stays attached as
        # the cause of the HTTP-level error.
        raise ParseErrorHTTP(description=str(error)) from error

    try:
        source = self._catalog.get_source_by_id(args["source_id"])
        logging.debug("Parsing query for source {}".format(source))
        chosen_visitor = analyze_dml_query(self._catalog, parsed, source)
        job_execution = extract_lineage(
            catalog=self._catalog,
            visited_query=chosen_visitor,
            source=source,
            parsed=parsed,
            start_time=datetime.datetime.fromisoformat(args["start_time"]),
            end_time=datetime.datetime.fromisoformat(args["end_time"]),
        )

        return (
            {
                "data": {
                    "id": job_execution.id,
                    "type": "job_executions",
                    "attributes": {
                        "job_id": job_execution.job_id,
                        "started_at": job_execution.started_at.strftime(
                            "%Y-%m-%d %H:%M:%S"
                        ),
                        "ended_at": job_execution.ended_at.strftime(
                            "%Y-%m-%d %H:%M:%S"
                        ),
                        "status": job_execution.status.name,
                    },
                }
            },
            200,
        )
    except TableNotFound as table_error:
        raise TableNotFoundHTTP(description=str(table_error)) from table_error
    except ColumnNotFound as column_error:
        raise ColumnNotFoundHTTP(description=str(column_error)) from column_error
    except SemanticError as semantic_error:
        raise SemanticErrorHTTP(description=str(semantic_error)) from semantic_error
    finally:
        # Runs on success and on every error path.
        self._catalog.scoped_session.remove()
def test_basic_column_graph(save_catalog):
    """Check the column graph built for a two-column INSERT ... SELECT.

    Parses the statement, binds the visitor to the catalog, builds the graph
    and compares its nodes and edges with the expected values. It also checks
    the ``ColumnLineage`` rows whose target is one of the two inserted columns.
    """

    def page(column):
        # Fully qualified name of a column in the source table.
        return ("test", "default", "page", column)

    def lookup(column):
        # Fully qualified name of a column in the target table.
        return ("test", "default", "page_lookup_nonredirect", column)

    catalog = save_catalog
    query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"

    parsed = parse(query, "basic_column_graph")
    visitor = SelectSourceVisitor(parsed.name)
    parsed.node.accept(visitor)
    visitor.bind(catalog)

    graph = create_graph(catalog, [visitor])

    node_fqdns = [node.fqdn for node in sorted(nodes(graph.graph))]
    assert node_fqdns == [
        page("page_id"),
        page("page_latest"),
        lookup("page_id"),
        lookup("page_version"),
    ]

    expected_edges = [
        (page("page_id"), lookup("page_id")),
        (page("page_latest"), lookup("page_version")),
    ]
    graph_edges = [(edge[0].fqdn, edge[1].fqdn) for edge in edges(graph.graph)]
    assert graph_edges == expected_edges

    table = catalog.get_table(
        source_name="test",
        schema_name="default",
        table_name="page_lookup_nonredirect",
    )
    columns = catalog.get_columns_for_table(
        table, column_names=["page_id", "page_version"]
    )
    assert len(columns) == 2

    session = catalog.scoped_session
    target_ids = [column.id for column in columns]
    stored = (
        session.query(ColumnLineage)
        .filter(ColumnLineage.target_id.in_(target_ids))
        .all()
    )
    stored_edges = {(row.source.fqdn, row.target.fqdn) for row in stored}
    assert stored_edges == set(expected_edges)
def test_ctas(save_catalog):
    """Analyze a CREATE TEMP TABLE ... AS SELECT statement against the catalog.

    Before parsing, the ``test`` source is updated with the ``default``
    schema via ``update_source``. Expects one target column on
    ``temp_table_x``, one source column and a single source table,
    ``page_lookup_nonredirect``.
    """
    query = """ CREATE TEMP TABLE temp_table_x(page_title) AS select redirect_title from page_lookup_nonredirect where redirect_title is not null """

    test_source = save_catalog.get_source("test")
    default_schema = save_catalog.get_schema("test", "default")
    save_catalog.update_source(test_source, default_schema)

    statement = parse(query)
    visitor = analyze_dml_query(save_catalog, statement, test_source)

    assert visitor is not None
    assert len(visitor.target_columns) == 1
    assert visitor.target_table.fqdn == ("test", "default", "temp_table_x")
    assert len(visitor.source_columns) == 1

    source_fqdns = [tbl.fqdn for tbl in visitor.source_tables]
    assert source_fqdns == [("test", "default", "page_lookup_nonredirect")]
def test_col_exprs(save_catalog):
    """Analyze an INSERT whose single selected column is a nested expression.

    Expects one target column on ``page_lookup_redirect``, one source column
    and a single source table, ``page_lookup_nonredirect``.
    """
    query = """ INSERT INTO page_lookup_redirect(true_title) SELECT BTRIM(TO_CHAR(DATEADD (MONTH,-1,('20' ||MAX ("redirect_id") || '-01')::DATE)::DATE,'YY-MM')) AS "max_month" FROM page_lookup_nonredirect; """

    test_source = save_catalog.get_source("test")
    statement = parse(query)
    visitor = analyze_dml_query(
        catalog=save_catalog, parsed=statement, source=test_source
    )

    assert visitor is not None
    assert len(visitor.target_columns) == 1
    assert visitor.target_table.fqdn == ("test", "default", "page_lookup_redirect")
    assert len(visitor.source_columns) == 1

    source_fqdns = [tbl.fqdn for tbl in visitor.source_tables]
    assert source_fqdns == [("test", "default", "page_lookup_nonredirect")]
def test_parse_query(save_catalog):
    """Check the column aliases bound for a SELECT ... INTO statement.

    The statement is parsed and bound against the ``test`` source; the
    aliases of the binder's columns must match the five output names, in order.
    """
    query = """ SELECT BTRIM(TO_CHAR(DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE,\'YY-MM\')) AS "max_month", DATEADD(YEAR,-1,DATEADD (MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE)::DATE AS "min_date", DATEADD(MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE AS "max_date", page_title, bytes_sent as mb_sent INTO "new_table" FROM pagecounts; """

    test_source = save_catalog.get_source("test")
    statement = parse(query)
    binder = parse_dml_query(
        catalog=save_catalog, parsed=statement, source=test_source
    )

    aliases = [column.alias for column in binder.columns]
    assert aliases == [
        "max_month",
        "min_date",
        "max_date",
        "page_title",
        "mb_sent",
    ]
def parse_queries_fixture(load_queries):
    """Yield one list holding the parsed form of every entry in ``load_queries``.

    Each entry is indexed by ``"query"`` and ``"name"``; both are passed to
    ``parse`` as the ``sql`` and ``name`` keyword arguments.
    """
    parsed_queries = []
    for entry in load_queries:
        parsed_queries.append(parse(sql=entry["query"], name=entry["name"]))
    yield parsed_queries
def test_basic_column_graph(save_catalog, graph_sdk):
    """Check the lineage graph loaded for a two-column INSERT ... SELECT.

    The statement is parsed, bound to the ``test`` source and passed to
    ``extract_lineage``; the graph is then loaded for the resulting job id.
    The test compares the sorted node names and the edge list of that graph,
    and the ``ColumnLineage`` rows targeting the two inserted columns.
    """
    catalog = save_catalog
    query = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"

    parsed = parse(query, "basic_column_graph")
    visitor = SelectSourceVisitor(parsed.name)
    visitor(parsed.node)

    test_source = catalog.get_source("test")
    visitor.bind(catalog, test_source)

    job_execution = extract_lineage(
        catalog,
        visitor,
        test_source,
        parsed,
        datetime.datetime.now(),
        datetime.datetime.now(),
    )
    graph = load_graph(graph_sdk, [job_execution.job_id])

    node_names = sorted(
        entry[1]["name"] for entry in graph.graph.nodes(data=True)
    )
    assert node_names == [
        "basic_column_graph",
        "test.default.page.page_id",
        "test.default.page.page_latest",
        "test.default.page_lookup_nonredirect.page_id",
        "test.default.page_lookup_nonredirect.page_version",
    ]

    expected_edges = [
        ("column:5", "task:2"),
        ("task:2", "column:13"),
        ("task:2", "column:14"),
        ("column:6", "task:2"),
    ]
    graph_edges = [(edge[0], edge[1]) for edge in edges(graph.graph)]
    assert graph_edges == expected_edges

    table = catalog.get_table(
        source_name="test",
        schema_name="default",
        table_name="page_lookup_nonredirect",
    )
    columns = catalog.get_columns_for_table(
        table, column_names=["page_id", "page_version"]
    )
    assert len(columns) == 2

    session = catalog.scoped_session
    expected_db_edges = [
        (
            ("test", "default", "page", "page_id"),
            ("test", "default", "page_lookup_nonredirect", "page_id"),
        ),
        (
            ("test", "default", "page", "page_latest"),
            ("test", "default", "page_lookup_nonredirect", "page_version"),
        ),
    ]

    target_ids = [column.id for column in columns]
    stored = (
        session.query(ColumnLineage)
        .filter(ColumnLineage.target_id.in_(target_ids))
        .all()
    )
    stored_edges = {(row.source.fqdn, row.target.fqdn) for row in stored}
    assert stored_edges == set(expected_db_edges)
def test_no_insert_column_graph(save_catalog, graph_sdk):
    """Check the lineage graph loaded for an INSERT without a column list.

    The statement is parsed, bound to the ``test`` source and passed to
    ``extract_lineage``; the graph is then loaded for the resulting job id.
    The test compares the sorted node names and the edge list of that graph,
    and every ``ColumnLineage`` row found through the catalog session.
    """

    def page(column):
        # Fully qualified name of a column in the source table.
        return ("test", "default", "page", column)

    def lookup(column):
        # Fully qualified name of a column in the target table.
        return ("test", "default", "page_lookup_nonredirect", column)

    catalog = save_catalog
    query = """ INSERT INTO page_lookup_nonredirect SELECT page.page_id as redirect_id, page.page_title as redirect_title, page.page_title true_title, page.page_id, page.page_latest FROM page """

    parsed = parse(
        query, name="LOAD page_lookup_nonredirect-test_no_insert_column_graph"
    )
    visitor = SelectSourceVisitor(parsed.name)
    visitor(parsed.node)

    test_source = catalog.get_source("test")
    visitor.bind(catalog, test_source)

    job_execution = extract_lineage(
        catalog,
        visitor,
        test_source,
        parsed,
        datetime.datetime.now(),
        datetime.datetime.now(),
    )
    graph = load_graph(graph_sdk, [job_execution.job_id])

    node_names = sorted(
        entry[1]["name"] for entry in graph.graph.nodes(data=True)
    )
    assert node_names == [
        "LOAD page_lookup_nonredirect-test_no_insert_column_graph",
        "test.default.page.page_id",
        "test.default.page.page_latest",
        "test.default.page.page_title",
        "test.default.page_lookup_nonredirect.page_id",
        "test.default.page_lookup_nonredirect.page_version",
        "test.default.page_lookup_nonredirect.redirect_id",
        "test.default.page_lookup_nonredirect.redirect_title",
        "test.default.page_lookup_nonredirect.true_title",
    ]

    expected_edges = [
        ("column:5", "task:1"),
        ("task:1", "column:10"),
        ("task:1", "column:11"),
        ("task:1", "column:12"),
        ("task:1", "column:13"),
        ("task:1", "column:14"),
        ("column:7", "task:1"),
        ("column:6", "task:1"),
    ]
    graph_edges = [(edge[0], edge[1]) for edge in edges(graph.graph)]
    assert graph_edges == expected_edges

    expected_db_edges = [
        (page("page_id"), lookup("redirect_id")),
        (page("page_id"), lookup("page_id")),
        (page("page_title"), lookup("redirect_title")),
        (page("page_title"), lookup("true_title")),
        (page("page_latest"), lookup("page_version")),
    ]

    session = catalog.scoped_session
    stored = session.query(ColumnLineage).all()
    stored_edges = {(row.source.fqdn, row.target.fqdn) for row in stored}
    assert stored_edges == set(expected_db_edges)