def test_afdb_lineage(test_meta_adapter: BaseMetaAdapter,
                      test_db_hook: DbApiHook):
    """
    Write a single lineage record through the meta adapter and verify that
    it can be found in the lineage table.

    :param test_meta_adapter: the metadata adapter used to write the lineage
    :param test_db_hook: the DB hook the verification query is run with
    """
    # the lineage to log: DUMMY_TABLE is the source, DUMMY_TABLE2 the target
    logged_lineage = Lineage(
        data_target=ShellDataAsset(DUMMY_TABLE2),
        data_sources=[ShellDataAsset(DUMMY_TABLE)],
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )
    test_meta_adapter.write_lineage(logged_lineage)

    # fetch source and metadata time of every lineage row logged for the target
    lookup_sql = f"""
    select
        {SQLMetaAdapter.FN_DATA_ASSET_SRC},
        {SQLMetaAdapter.FN_METADATA_TIME}
    from {SQLMetaAdapter.TN_LINEAGE}
    where {SQLMetaAdapter.FN_DATA_ASSET_TRG} = '{DUMMY_TABLE2}'
    """
    rows = test_utils.run_on_db(test_db_hook=test_db_hook, sql=lookup_sql)

    # exactly one row is expected and it has to name DUMMY_TABLE as the source
    assert len(rows) == 1
    source_column = rows.loc[:, SQLMetaAdapter.FN_DATA_ASSET_SRC]
    assert source_column.iloc[0] == DUMMY_TABLE
def test_lineage(test_sql1, test_sql2):
    """
    Check that the lineage derived from each SQL statement equals the expected lineage.

    :param test_sql1: pair of (SQL statement, expected lineage)
    :param test_sql2: pair of (SQL statement, expected lineage)
    """
    cases = (test_sql1, test_sql2)
    for statement, expected in cases:
        # the names table0 ... table9 are offered as known data assets
        known_assets = [f"table{i}" for i in range(10)]
        derived = Lineage.lineage_from_sql_statement(
            statement, known_data_assets=known_assets)
        assert derived == expected

    # the expected lineages of the two cases must not compare equal:
    first_expected = test_sql1[1]
    second_expected = test_sql2[1]
    assert first_expected != second_expected
def lineage() -> Lineage:
    """
    Yield a sample lineage with DUMMY_TABLE as source and DUMMY_TABLE2 as target.

    NOTE(review): the body is a generator (it yields) although the annotation
    says ``Lineage`` - presumably this is used as a pytest fixture; confirm.
    """
    sources = [ShellDataAsset(DUMMY_TABLE)]
    target = ShellDataAsset(DUMMY_TABLE2)
    sample = Lineage(
        data_sources=sources,
        data_target=target,
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )
    yield sample
def test_sql1() -> Tuple[str, Lineage]:
    """
    Provide a simple insert-select statement together with its expected lineage.

    :return: a pair of (SQL statement, expected lineage), where the expected
             lineage has table3 as its only source and table5 as its target
    """
    statement = """
       insert into table table5
       select * from table3

    """
    expected = Lineage(
        data_sources=[ShellDataAsset("table3")],
        data_target=ShellDataAsset("table5"),
    )
    return statement, expected
def test_sql2() -> Tuple[str, Lineage]:
    """
    Provide an insert-overwrite statement (join plus sub-select) together with
    its expected lineage.

    :return: a pair of (SQL statement, expected lineage), where the expected
             lineage has table1, table2 and table3 as sources and table4 as target
    """
    # NOTE(review): the sub-select's parenthesis is never closed in this
    # statement - presumably on purpose to exercise lenient parsing; confirm.
    statement = """
        INSERT OVERWRITE TABLE table4
        select
            from
             table1 t1 join table2 t2
            on t1.key = t2.key
        where exist (select 1 from table3 as t3 where t3.key = t2.fkey


    """
    source_names = ["table1", "table2", "table3"]
    expected = Lineage(
        data_sources=ShellDataAsset.from_names(source_names),
        data_target=ShellDataAsset("table4"),
    )
    return statement, expected
def test_lineage_from_script() -> None:
    """
    Derive lineage from a SQL script file.

    Nothing is asserted on the result; the test passes when the call does not raise.
    """
    script_path = "/dml/test_schema/test_table.sql"
    Lineage.lineage_from_sql_script(script_file_relative_path=script_path)
Exemple #7
0
        def _get_lineage_sources_for_target(
                target_name: str) -> Iterable[Lineage]:
            """
            Build the lineage entities logged for the data target ``target_name``.

            ``self``, ``dag_id`` and ``dag_exec_date`` are not parameters; they are
            read from the enclosing scope.

            :param target_name: name of the data target to look up in the lineage table
            :return: a list with one lineage entity per run of rows sharing the same
                     dag id and task id, each carrying all sources of that run
            """
            dag_col = self.FN_DAG_ID
            task_col = self.FN_TASK_ID
            exec_col = self.FN_EXEC_DATE

            # load every lineage row that has the requested target
            query = self.t_lineage.select().where(
                self.t_lineage.c[self.FN_DATA_ASSET_TRG] == target_name)
            records = pd.read_sql(sql=query, con=self._connection())

            # optional restrictions on DAG and execution date
            if dag_id is not None:
                records = records.loc[records[dag_col] == dag_id]
            if dag_exec_date is not None:
                records = records.loc[records[exec_col] == dag_exec_date]

            # per DAG only the rows of its maximum execution date are kept:
            # compute the maximum per DAG, then merge it back on both keys
            latest_exec_per_dag = (
                records.loc[:, [dag_col, exec_col]].groupby(dag_col).max())
            latest_records: pd.DataFrame = records.merge(
                right=latest_exec_per_dag, on=[dag_col, exec_col])

            # sorting makes rows of the same (dag id, task id) adjacent
            latest_records = latest_records.sort_values(by=[dag_col, task_col])

            def _as_lineage(sources, dag, task, exec_date):
                # one lineage entity for a finished run of rows
                return Lineage(
                    data_sources=sources,
                    data_target=ShellDataAsset(target_name),
                    dag_id=dag,
                    task_id=task,
                    dag_exec_date=exec_date,
                )

            grouped = []
            pending_sources = []
            prev_dag = prev_task = prev_exec = None

            for _, record in latest_records.iterrows():
                dag, task, exec_date = (
                    record[dag_col],
                    record[task_col],
                    record[exec_col],
                )
                # None marks "no previous row seen yet"
                have_previous = prev_dag is not None and prev_task is not None
                if have_previous and (dag != prev_dag or task != prev_task):
                    # dag id / task id changed: close the run collected so far
                    grouped.append(
                        _as_lineage(pending_sources, prev_dag, prev_task,
                                    prev_exec))
                    pending_sources = []

                pending_sources.append(
                    ShellDataAsset(record[self.FN_DATA_ASSET_SRC]))
                prev_dag, prev_task, prev_exec = dag, task, exec_date

            # close the last run, if any row was seen at all
            if pending_sources:
                grouped.append(
                    _as_lineage(pending_sources, prev_dag, prev_task,
                                prev_exec))

            return grouped
Exemple #8
0
    def read_lineage(
        self,
        for_target: BaseDataAsset,
        dag_id: Optional[str] = None,
        dag_exec_date: Optional[datetime] = None,
    ) -> List[Tuple[Lineage, int]]:
        """
        Read previously logged lineage metadata.

        Starting at ``for_target`` the lineage table is walked breadth-first:
        the lineage logged for the target itself is reported at level 0, the
        lineage logged for each of its sources at level 1, and so on. Every
        data asset name is queried at most once, so cyclic or diamond-shaped
        lineage neither loops forever nor yields duplicates.

        :param for_target: the data asset for which to retrieve lineage information for, here the data target.
        :param dag_id: the DAG ID for which to restrict the lineage search on (optional)
        :param dag_exec_date: the DAG execution datetime for which to restrict the lineage search on (optional)
        :return: the lineage information captured as a list of tuples, in which the first element is a lineage metadata
                 entity and the second an int of the upstream level in the lineage chain
        """
        # imported locally so this method does not depend on a file-level import
        from collections import deque

        collected_lineage = []

        self._setup()

        def _get_lineage_sources_for_target(
                target_name: str) -> Iterable[Lineage]:
            """Return the lineage entities logged for the data target ``target_name``."""
            select = self.t_lineage.select().where(
                self.t_lineage.c[self.FN_DATA_ASSET_TRG] == target_name)

            lineage_for_target = pd.read_sql(sql=select,
                                             con=self._connection())

            # optional restrictions requested by the caller of read_lineage
            if dag_id is not None:
                lineage_for_target = lineage_for_target.loc[lineage_for_target[
                    self.FN_DAG_ID] == dag_id]
            if dag_exec_date is not None:
                lineage_for_target = lineage_for_target.loc[lineage_for_target[
                    self.FN_EXEC_DATE] == dag_exec_date]

            # per DAG keep only the rows of its maximum execution date:
            # compute the maximum per DAG, then merge it back on both keys
            max_execution_date_per_dag = (
                lineage_for_target.loc[:, [self.FN_DAG_ID, self.FN_EXEC_DATE]].
                groupby(self.FN_DAG_ID).max())

            lineage_for_target_latest: pd.DataFrame = lineage_for_target.merge(
                right=max_execution_date_per_dag,
                on=[self.FN_DAG_ID, self.FN_EXEC_DATE])

            # we create a lineage entity for each unique dag_id, task_id combination for this source:
            # sort by dag_id, task_id:
            lineage_for_target_latest = lineage_for_target_latest.sort_values(
                by=[self.FN_DAG_ID, self.FN_TASK_ID])

            # now we aggregate each sources per dag_id, task_id, dag_exec_date combination:
            last_dag_id, last_task_id, last_exec_date = None, None, None
            found_sources = []
            lineage_returned = []

            for _, row in lineage_for_target_latest.iterrows():
                cur_dag_id, cur_task_id, cur_exec_date = (
                    row[self.FN_DAG_ID],
                    row[self.FN_TASK_ID],
                    row[self.FN_EXEC_DATE],
                )
                # None marks "no previous row seen yet"
                if last_dag_id is not None and last_task_id is not None:
                    if cur_dag_id != last_dag_id or cur_task_id != last_task_id:
                        # dag_id / task_id changed: close the group collected so far
                        lineage_returned.append(
                            Lineage(
                                data_sources=found_sources,
                                data_target=ShellDataAsset(target_name),
                                dag_id=last_dag_id,
                                task_id=last_task_id,
                                dag_exec_date=last_exec_date,
                            ))
                        found_sources = []

                found_sources.append(
                    ShellDataAsset(row[self.FN_DATA_ASSET_SRC]))
                last_dag_id, last_task_id, last_exec_date = (
                    cur_dag_id,
                    cur_task_id,
                    cur_exec_date,
                )
            # close the last group, if any row was seen at all
            if found_sources:
                lineage_returned.append(
                    Lineage(
                        data_sources=found_sources,
                        data_target=ShellDataAsset(target_name),
                        dag_id=last_dag_id,
                        task_id=last_task_id,
                        dag_exec_date=last_exec_date,
                    ))

            return lineage_returned

        # Breadth-first walk upstream. Every lineage entity returned for a target
        # has that very target as its data_target, so re-enqueueing the entity
        # itself (as was done before) only ever re-queried the same target and
        # never reached the sources. The sources of each entity are enqueued
        # instead, one level further upstream.
        targets_to_query = deque([(for_target.name, 0)])
        # names already scheduled; guards against cycles and repeated look-ups
        seen_names = {for_target.name}

        while targets_to_query:
            target_name, level = targets_to_query.popleft()

            for s in _get_lineage_sources_for_target(target_name):
                collected_lineage.append((s, level))
                for source in s.data_sources:
                    if source.name not in seen_names:
                        seen_names.add(source.name)
                        targets_to_query.append((source.name, level + 1))

        return collected_lineage