Esempio n. 1
0
def test_sanity_ctas(target, sources, sql):
    """Check the tables a ``SelectSourceVisitor`` resolves for ``sql``.

    The statement is parsed, its node accepts the visitor, and the visitor
    is resolved. Afterwards its target table must equal ``target`` and its
    source tables must equal ``sources``.
    """
    statement = parse(sql)
    ctas_visitor = SelectSourceVisitor("test_sanity_ctas")

    statement.node.accept(ctas_visitor)
    ctas_visitor.resolve()

    assert ctas_visitor.target_table == target
    assert ctas_visitor.source_tables == sources
Esempio n. 2
0
    def post(self):
        """Parse the posted query and report the tables and columns it selects.

        Reads ``query`` and ``source_id`` from the request arguments, parses
        the query under the fixed name ``"parse_api"`` and binds it against
        the catalog source with ``parse_dml_query``.

        Returns:
            A ``(payload, 200)`` tuple; the payload lists the names of the
            bound tables and the aliases of the bound columns.

        Raises:
            ParseErrorHTTP: ``parse`` raised ``ParseError``.
            TableNotFoundHTTP: binding raised ``TableNotFound``.
            ColumnNotFoundHTTP: binding raised ``ColumnNotFound``.
            SemanticErrorHTTP: binding raised ``SemanticError``.
        """
        args = self._parser.parse_args()
        logging.debug("Parse query: {}".format(args["query"]))
        try:
            parsed = parse(args["query"], "parse_api")
        except ParseError as error:
            # Chain explicitly so the original parser failure stays attached
            # as the cause of the HTTP error.
            raise ParseErrorHTTP(description=str(error)) from error

        try:
            source = self._catalog.get_source_by_id(args["source_id"])
            logging.debug("Parsing query for source {}".format(source))
            binder = parse_dml_query(
                catalog=self._catalog, parsed=parsed, source=source
            )

            return (
                {
                    "select_tables": [table.name for table in binder.tables],
                    "select_columns": [context.alias for context in binder.columns],
                },
                200,
            )
        except TableNotFound as table_error:
            raise TableNotFoundHTTP(description=str(table_error)) from table_error
        except ColumnNotFound as column_error:
            raise ColumnNotFoundHTTP(description=str(column_error)) from column_error
        except SemanticError as semantic_error:
            raise SemanticErrorHTTP(
                description=str(semantic_error)
            ) from semantic_error
        finally:
            # Always release the scoped session, on success and on error.
            self._catalog.scoped_session.remove()
Esempio n. 3
0
def test_copy(target, query):
    """Check the target table a ``CopyFromVisitor`` resolves for ``query``.

    The statement is parsed, its node accepts the visitor, and after
    ``resolve()`` the visitor's target table must equal ``target``.
    """
    statement = parse(query)
    copy_visitor = CopyFromVisitor("test_copy")

    statement.node.accept(copy_visitor)
    copy_visitor.resolve()

    assert copy_visitor.target_table == target
Esempio n. 4
0
def test_sanity_ctas(target, sources, sql):
    """Check the target and source tables a ``CTASVisitor`` resolves.

    The visitor is called on the parsed node; ``resolve()`` must return a
    3-tuple whose first two items equal ``target`` and ``sources``. The
    third item is not checked.
    """
    statement = parse(sql)
    ctas_visitor = CTASVisitor("test_sanity_ctas")
    ctas_visitor(statement.node)

    resolved_target, resolved_sources, _ = ctas_visitor.resolve()

    assert resolved_target == target
    assert resolved_sources == sources
Esempio n. 5
0
def test_sanity_select_into(target, sources, sql):
    """Check the target and source tables a ``SelectIntoVisitor`` resolves.

    The visitor is called on the parsed node; ``resolve()`` must return a
    3-tuple whose first two items equal ``target`` and ``sources``. The
    third item is not checked.
    """
    statement = parse(sql)
    into_visitor = SelectIntoVisitor("test_sanity_select_into")
    into_visitor(statement.node)

    resolved_target, resolved_sources, _ = into_visitor.resolve()

    assert resolved_target == target
    assert resolved_sources == sources
Esempio n. 6
0
def test_sanity_insert(target, sources, sql):
    """Check the tables a ``SelectSourceVisitor`` resolves for an insert.

    The statement is parsed, its node accepts the visitor, and after
    ``resolve()`` the visitor's target table and source tables must equal
    ``target`` and ``sources``.
    """
    statement = parse(sql)
    visitor = SelectSourceVisitor("test_sanity_insert")

    statement.node.accept(visitor)
    visitor.resolve()

    assert visitor.target_table == target
    assert visitor.source_tables == sources
Esempio n. 7
0
def test_sanity_insert(target, sources, sql):
    """Check the target and source tables resolved for an insert statement.

    A ``SelectSourceVisitor`` is called on the parsed node; ``resolve()``
    must return a 3-tuple whose first two items equal ``target`` and
    ``sources``. The third item is not checked.
    """
    statement = parse(sql)
    visitor = SelectSourceVisitor("test_sanity_insert")
    visitor(statement.node)

    resolved_target, resolved_sources, _ = visitor.resolve()

    assert resolved_target == target
    assert resolved_sources == sources
Esempio n. 8
0
def test_no_insert_column_graph(save_catalog):
    """Build the column graph for an INSERT ... SELECT with no column list.

    The statement is parsed, visited and bound against the catalog, and a
    graph is created from the visitor. The test then compares the graph's
    sorted node fqdns, its edge fqdn pairs, and the ``ColumnLineage`` rows
    stored in the catalog session against the expected values.
    """

    def page(column):
        # fqdn of a column of the source table
        return ("test", "default", "page", column)

    def lookup(column):
        # fqdn of a column of the target table
        return ("test", "default", "page_lookup_nonredirect", column)

    catalog = save_catalog
    sql = """
        INSERT INTO page_lookup_nonredirect
        SELECT page.page_id as redirect_id, page.page_title as redirect_title,
            page.page_title true_title, page.page_id, page.page_latest
        FROM page
    """

    statement = parse(sql)
    select_visitor = SelectSourceVisitor(statement.name)
    statement.node.accept(select_visitor)
    select_visitor.bind(catalog)

    lineage_graph = create_graph(catalog, [select_visitor])

    node_fqdns = [item.fqdn for item in sorted(nodes(lineage_graph.graph))]
    assert node_fqdns == [
        page("page_id"),
        page("page_latest"),
        page("page_title"),
        lookup("redirect_id"),
        lookup("redirect_title"),
        lookup("true_title"),
        lookup("page_id"),
        lookup("page_version"),
    ]

    expected_edges = [
        (page("page_id"), lookup("redirect_id")),
        (page("page_id"), lookup("page_id")),
        (page("page_title"), lookup("redirect_title")),
        (page("page_title"), lookup("true_title")),
        (page("page_latest"), lookup("page_version")),
    ]
    edge_fqdns = [
        (pair[0].fqdn, pair[1].fqdn) for pair in edges(lineage_graph.graph)
    ]
    assert edge_fqdns == expected_edges

    stored_edges = catalog.scoped_session.query(ColumnLineage).all()
    stored_pairs = {(row.source.fqdn, row.target.fqdn) for row in stored_edges}
    assert stored_pairs == set(expected_edges)
Esempio n. 9
0
def test_insert(save_catalog, query):
    """Analyze ``query`` against the "test" source and check what was bound.

    Expects a visitor with five target columns, five source columns, the
    target table ``page_lookup`` and the single source table
    ``page_lookup_redirect``.
    """
    test_source = save_catalog.get_source("test")
    statement = parse(query)

    analyzed = analyze_dml_query(save_catalog, statement, test_source)
    assert analyzed is not None

    assert len(analyzed.target_columns) == 5
    assert analyzed.target_table.fqdn == ("test", "default", "page_lookup")
    assert len(analyzed.source_columns) == 5

    source_fqdns = [tbl.fqdn for tbl in analyzed.source_tables]
    assert source_fqdns == [("test", "default", "page_lookup_redirect")]
Esempio n. 10
0
def test_insert_cols():
    """Resolve an INSERT with an explicit column list through the visitor.

    Expects two target columns, two source columns, and the target and
    source tables reported as ``(None, <table name>)`` tuples.
    """
    sql = "INSERT INTO page_lookup_nonredirect(page_id, latest) SELECT page.page_id, page.page_latest FROM page"
    statement = parse(sql)
    insert_visitor = SelectSourceVisitor("test_insert_cols")

    statement.node.accept(insert_visitor)
    insert_visitor.resolve()

    assert len(insert_visitor.target_columns) == 2
    assert insert_visitor.target_table == (None, "page_lookup_nonredirect")
    assert len(insert_visitor.source_columns) == 2
    assert insert_visitor.source_tables == [(None, "page")]
Esempio n. 11
0
def test_insert_cols(save_catalog):
    """Analyze an INSERT with an explicit column list against the catalog.

    Expects two target columns, two source columns, the target table
    ``page_lookup_nonredirect`` and the single source table ``page``.
    """
    test_source = save_catalog.get_source("test")
    sql = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"
    statement = parse(sql)

    analyzed = analyze_dml_query(save_catalog, statement, test_source)
    assert analyzed is not None

    assert len(analyzed.target_columns) == 2
    expected_target = ("test", "default", "page_lookup_nonredirect")
    assert analyzed.target_table.fqdn == expected_target
    assert len(analyzed.source_columns) == 2

    source_fqdns = [tbl.fqdn for tbl in analyzed.source_tables]
    assert source_fqdns == [("test", "default", "page")]
Esempio n. 12
0
def test_insert_with_join(save_catalog):
    """Analyze an INSERT whose SELECT joins ``page`` twice with ``redirect``.

    Expects five target columns, five source columns, the target table
    ``page_lookup_redirect`` and, once sorted, the source tables ``page``
    and ``redirect``.
    """
    test_source = save_catalog.get_source("test")
    sql = "insert into page_lookup_redirect select original_page.page_id redirect_id, original_page.page_title redirect_title, final_page.page_title as true_title, final_page.page_id, final_page.page_latest from page final_page join redirect on (redirect.page_title = final_page.page_title) join page original_page on (redirect.rd_from = original_page.page_id)"
    statement = parse(sql)

    analyzed = analyze_dml_query(save_catalog, statement, test_source)
    assert analyzed is not None

    assert len(analyzed.target_columns) == 5
    expected_target = ("test", "default", "page_lookup_redirect")
    assert analyzed.target_table.fqdn == expected_target
    assert len(analyzed.source_columns) == 5

    source_fqdns = sorted(tbl.fqdn for tbl in analyzed.source_tables)
    assert source_fqdns == [
        ("test", "default", "page"),
        ("test", "default", "redirect"),
    ]
Esempio n. 13
0
    def post(self):
        """Parse the posted query, extract its lineage and describe the run.

        Reads ``query``, ``name``, ``source_id``, ``start_time`` and
        ``end_time`` from the request arguments. The query is parsed under
        the given name and analyzed against the catalog source, then
        ``extract_lineage`` is called with the start and end times converted
        via ``datetime.fromisoformat``.

        Returns:
            A ``(payload, 200)`` tuple; the payload's ``data`` entry carries
            the job execution id, the type ``"job_executions"`` and the
            attributes job id, start/end timestamps and status name.

        Raises:
            ParseErrorHTTP: ``parse`` raised ``ParseError``.
            TableNotFoundHTTP: analysis raised ``TableNotFound``.
            ColumnNotFoundHTTP: analysis raised ``ColumnNotFound``.
            SemanticErrorHTTP: analysis raised ``SemanticError``.
        """
        args = self._parser.parse_args()
        logging.debug("Parse query: {}".format(args["query"]))
        try:
            parsed = parse(args["query"], args["name"])
        except ParseError as error:
            # Chain explicitly so the original parser failure stays attached
            # as the cause of the HTTP error.
            raise ParseErrorHTTP(description=str(error)) from error

        # Single definition of the timestamp layout used in the response.
        timestamp_format = "%Y-%m-%d %H:%M:%S"

        try:
            source = self._catalog.get_source_by_id(args["source_id"])
            logging.debug("Parsing query for source {}".format(source))
            chosen_visitor = analyze_dml_query(self._catalog, parsed, source)
            job_execution = extract_lineage(
                catalog=self._catalog,
                visited_query=chosen_visitor,
                source=source,
                parsed=parsed,
                start_time=datetime.datetime.fromisoformat(args["start_time"]),
                end_time=datetime.datetime.fromisoformat(args["end_time"]),
            )

            return (
                {
                    "data": {
                        "id": job_execution.id,
                        "type": "job_executions",
                        "attributes": {
                            "job_id": job_execution.job_id,
                            "started_at": job_execution.started_at.strftime(
                                timestamp_format
                            ),
                            "ended_at": job_execution.ended_at.strftime(
                                timestamp_format
                            ),
                            "status": job_execution.status.name,
                        },
                    }
                },
                200,
            )
        except TableNotFound as table_error:
            raise TableNotFoundHTTP(description=str(table_error)) from table_error
        except ColumnNotFound as column_error:
            raise ColumnNotFoundHTTP(description=str(column_error)) from column_error
        except SemanticError as semantic_error:
            raise SemanticErrorHTTP(
                description=str(semantic_error)
            ) from semantic_error
        finally:
            # Always release the scoped session, on success and on error.
            self._catalog.scoped_session.remove()
Esempio n. 14
0
def test_basic_column_graph(save_catalog):
    """Build the column graph for an INSERT with an explicit column list.

    The statement is parsed, visited and bound against the catalog, and a
    graph is created from the visitor. The test compares the graph's sorted
    node fqdns and edge fqdn pairs with the expected values, then checks
    the ``ColumnLineage`` rows that target the two inserted columns.
    """

    def page(column):
        # fqdn of a column of the source table
        return ("test", "default", "page", column)

    def lookup(column):
        # fqdn of a column of the target table
        return ("test", "default", "page_lookup_nonredirect", column)

    catalog = save_catalog

    sql = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"
    statement = parse(sql, "basic_column_graph")
    select_visitor = SelectSourceVisitor(statement.name)
    statement.node.accept(select_visitor)
    select_visitor.bind(catalog)

    lineage_graph = create_graph(catalog, [select_visitor])

    node_fqdns = [item.fqdn for item in sorted(nodes(lineage_graph.graph))]
    assert node_fqdns == [
        page("page_id"),
        page("page_latest"),
        lookup("page_id"),
        lookup("page_version"),
    ]

    expected_edges = [
        (page("page_id"), lookup("page_id")),
        (page("page_latest"), lookup("page_version")),
    ]

    edge_fqdns = [
        (pair[0].fqdn, pair[1].fqdn) for pair in edges(lineage_graph.graph)
    ]
    assert edge_fqdns == expected_edges

    target_table = catalog.get_table(
        source_name="test",
        schema_name="default",
        table_name="page_lookup_nonredirect",
    )
    target_columns = catalog.get_columns_for_table(
        target_table, column_names=["page_id", "page_version"]
    )

    assert len(target_columns) == 2
    db_session = catalog.scoped_session

    # Only lineage rows pointing at the two target columns are considered.
    lineage_query = db_session.query(ColumnLineage).filter(
        ColumnLineage.target_id.in_([col.id for col in target_columns])
    )
    stored_edges = lineage_query.all()
    stored_pairs = {(row.source.fqdn, row.target.fqdn) for row in stored_edges}
    assert stored_pairs == set(expected_edges)
Esempio n. 15
0
def test_ctas(save_catalog):
    """Analyze a CREATE TEMP TABLE ... AS SELECT statement.

    The "test" source and its "default" schema are fetched and passed to
    ``update_source`` before the statement is analyzed. Expects one target
    column, one source column, the target table ``temp_table_x`` and the
    single source table ``page_lookup_nonredirect``.
    """
    sql = """
        CREATE TEMP TABLE temp_table_x(page_title) AS select redirect_title from page_lookup_nonredirect
        where redirect_title is not null
    """
    test_source = save_catalog.get_source("test")
    default_schema = save_catalog.get_schema("test", "default")
    save_catalog.update_source(test_source, default_schema)

    statement = parse(sql)
    analyzed = analyze_dml_query(save_catalog, statement, test_source)
    assert analyzed is not None

    assert len(analyzed.target_columns) == 1
    assert analyzed.target_table.fqdn == ("test", "default", "temp_table_x")
    assert len(analyzed.source_columns) == 1

    source_fqdns = [tbl.fqdn for tbl in analyzed.source_tables]
    assert source_fqdns == [("test", "default", "page_lookup_nonredirect")]
Esempio n. 16
0
def test_col_exprs(save_catalog):
    """Analyze an INSERT whose single selected column is a nested expression.

    Expects one target column, one source column, the target table
    ``page_lookup_redirect`` and the single source table
    ``page_lookup_nonredirect``.
    """
    sql = """
        INSERT INTO page_lookup_redirect(true_title)
        SELECT
            BTRIM(TO_CHAR(DATEADD (MONTH,-1,('20' ||MAX ("redirect_id") || '-01')::DATE)::DATE,'YY-MM')) AS "max_month"
        FROM page_lookup_nonredirect;
    """
    test_source = save_catalog.get_source("test")
    statement = parse(sql)

    analyzed = analyze_dml_query(
        catalog=save_catalog, parsed=statement, source=test_source
    )
    assert analyzed is not None

    assert len(analyzed.target_columns) == 1
    expected_target = ("test", "default", "page_lookup_redirect")
    assert analyzed.target_table.fqdn == expected_target
    assert len(analyzed.source_columns) == 1

    source_fqdns = [tbl.fqdn for tbl in analyzed.source_tables]
    assert source_fqdns == [("test", "default", "page_lookup_nonredirect")]
Esempio n. 17
0
def test_parse_query(save_catalog):
    """Bind a SELECT ... INTO query and check the bound column aliases.

    The aliases of the binder's columns must match, in order, the five
    output names written in the select list.
    """
    sql = """
    SELECT BTRIM(TO_CHAR(DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE,\'YY-MM\')) AS "max_month",
        DATEADD(YEAR,-1,DATEADD (MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE)::DATE AS "min_date",
        DATEADD(MONTH,-3,LAST_DAY (DATEADD (MONTH,-1,(\'20\' ||MAX ("group") || \'-01\')::DATE)::DATE))::DATE AS "max_date",
        page_title,
        bytes_sent as mb_sent
    INTO "new_table"
    FROM pagecounts;
    """
    test_source = save_catalog.get_source("test")
    statement = parse(sql)

    bound = parse_dml_query(
        catalog=save_catalog, parsed=statement, source=test_source
    )

    aliases = [column.alias for column in bound.columns]
    assert aliases == [
        "max_month",
        "min_date",
        "max_date",
        "page_title",
        "mb_sent",
    ]
Esempio n. 18
0
def parse_queries_fixture(load_queries):
    """Yield one list holding the parsed form of every entry in ``load_queries``.

    Each entry is read through its ``"query"`` and ``"name"`` keys, which
    are passed to ``parse`` as ``sql`` and ``name``.
    """
    parsed_queries = []
    for entry in load_queries:
        parsed_queries.append(parse(sql=entry["query"], name=entry["name"]))
    yield parsed_queries
Esempio n. 19
0
def test_basic_column_graph(save_catalog, graph_sdk):
    """Extract lineage for an INSERT with a column list and load its graph.

    The statement is parsed, visited and bound to the "test" source, lineage
    is extracted into a job execution, and the graph for that job is loaded
    through ``graph_sdk``. The test compares the sorted node names and the
    edge id pairs of the graph with the expected values, then checks the
    ``ColumnLineage`` rows that target the two inserted columns.
    """
    catalog = save_catalog

    sql = "INSERT INTO page_lookup_nonredirect(page_id, page_version) SELECT page.page_id, page.page_latest FROM page"
    statement = parse(sql, "basic_column_graph")
    select_visitor = SelectSourceVisitor(statement.name)
    select_visitor(statement.node)
    test_source = catalog.get_source("test")
    select_visitor.bind(catalog, test_source)

    execution = extract_lineage(
        catalog,
        select_visitor,
        test_source,
        statement,
        datetime.datetime.now(),
        datetime.datetime.now(),
    )
    lineage_graph = load_graph(graph_sdk, [execution.job_id])

    node_names = sorted(
        item[1]["name"] for item in lineage_graph.graph.nodes(data=True)
    )
    assert node_names == [
        "basic_column_graph",
        "test.default.page.page_id",
        "test.default.page.page_latest",
        "test.default.page_lookup_nonredirect.page_id",
        "test.default.page_lookup_nonredirect.page_version",
    ]

    expected_edges = [
        ("column:5", "task:2"),
        ("task:2", "column:13"),
        ("task:2", "column:14"),
        ("column:6", "task:2"),
    ]

    edge_pairs = [(pair[0], pair[1]) for pair in edges(lineage_graph.graph)]
    assert edge_pairs == expected_edges

    target_table = catalog.get_table(
        source_name="test",
        schema_name="default",
        table_name="page_lookup_nonredirect",
    )
    target_columns = catalog.get_columns_for_table(
        target_table, column_names=["page_id", "page_version"]
    )

    assert len(target_columns) == 2
    db_session = catalog.scoped_session

    expected_db_edges = [
        (
            ("test", "default", "page", "page_id"),
            ("test", "default", "page_lookup_nonredirect", "page_id"),
        ),
        (
            ("test", "default", "page", "page_latest"),
            ("test", "default", "page_lookup_nonredirect", "page_version"),
        ),
    ]

    # Only lineage rows pointing at the two target columns are considered.
    lineage_query = db_session.query(ColumnLineage).filter(
        ColumnLineage.target_id.in_([col.id for col in target_columns])
    )
    stored_edges = lineage_query.all()
    stored_pairs = {(row.source.fqdn, row.target.fqdn) for row in stored_edges}
    assert stored_pairs == set(expected_db_edges)
Esempio n. 20
0
def test_no_insert_column_graph(save_catalog, graph_sdk):
    """Extract lineage for an INSERT ... SELECT with no column list.

    The statement is parsed, visited and bound to the "test" source, lineage
    is extracted into a job execution, and the graph for that job is loaded
    through ``graph_sdk``. The test compares the sorted node names and the
    edge id pairs of the graph with the expected values, then checks every
    ``ColumnLineage`` row stored in the catalog session.
    """

    def page(column):
        # fqdn of a column of the source table
        return ("test", "default", "page", column)

    def lookup(column):
        # fqdn of a column of the target table
        return ("test", "default", "page_lookup_nonredirect", column)

    catalog = save_catalog
    sql = """
        INSERT INTO page_lookup_nonredirect
        SELECT page.page_id as redirect_id, page.page_title as redirect_title,
            page.page_title true_title, page.page_id, page.page_latest
        FROM page
    """

    statement = parse(
        sql, name="LOAD page_lookup_nonredirect-test_no_insert_column_graph"
    )
    select_visitor = SelectSourceVisitor(statement.name)
    select_visitor(statement.node)
    test_source = catalog.get_source("test")
    select_visitor.bind(catalog, test_source)

    execution = extract_lineage(
        catalog,
        select_visitor,
        test_source,
        statement,
        datetime.datetime.now(),
        datetime.datetime.now(),
    )
    lineage_graph = load_graph(graph_sdk, [execution.job_id])

    node_names = sorted(
        item[1]["name"] for item in lineage_graph.graph.nodes(data=True)
    )
    assert node_names == [
        "LOAD page_lookup_nonredirect-test_no_insert_column_graph",
        "test.default.page.page_id",
        "test.default.page.page_latest",
        "test.default.page.page_title",
        "test.default.page_lookup_nonredirect.page_id",
        "test.default.page_lookup_nonredirect.page_version",
        "test.default.page_lookup_nonredirect.redirect_id",
        "test.default.page_lookup_nonredirect.redirect_title",
        "test.default.page_lookup_nonredirect.true_title",
    ]

    expected_edges = [
        ("column:5", "task:1"),
        ("task:1", "column:10"),
        ("task:1", "column:11"),
        ("task:1", "column:12"),
        ("task:1", "column:13"),
        ("task:1", "column:14"),
        ("column:7", "task:1"),
        ("column:6", "task:1"),
    ]

    edge_pairs = [(pair[0], pair[1]) for pair in edges(lineage_graph.graph)]
    assert edge_pairs == expected_edges

    expected_db_edges = [
        (page("page_id"), lookup("redirect_id")),
        (page("page_id"), lookup("page_id")),
        (page("page_title"), lookup("redirect_title")),
        (page("page_title"), lookup("true_title")),
        (page("page_latest"), lookup("page_version")),
    ]
    stored_edges = catalog.scoped_session.query(ColumnLineage).all()
    stored_pairs = {(row.source.fqdn, row.target.fqdn) for row in stored_edges}
    assert stored_pairs == set(expected_db_edges)