def test_afdb_lineage(test_meta_adapter: BaseMetaAdapter, test_db_hook: DbApiHook):
    """Write one lineage record through the adapter and check it landed in the lineage table."""
    lineage_record = Lineage(
        data_target=ShellDataAsset(DUMMY_TABLE2),
        data_sources=[ShellDataAsset(DUMMY_TABLE)],
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )
    test_meta_adapter.write_lineage(lineage_record)

    # Read back only the rows whose target is the asset we just wrote lineage for.
    query = f""" select {SQLMetaAdapter.FN_DATA_ASSET_SRC}, {SQLMetaAdapter.FN_METADATA_TIME} from {SQLMetaAdapter.TN_LINEAGE} where {SQLMetaAdapter.FN_DATA_ASSET_TRG} = '{DUMMY_TABLE2}' """
    rows = test_utils.run_on_db(test_db_hook=test_db_hook, sql=query)

    # Exactly one source was declared, so exactly one row is expected.
    assert len(rows) == 1
    first_source = rows.loc[:, SQLMetaAdapter.FN_DATA_ASSET_SRC].iloc[0]
    assert first_source == DUMMY_TABLE
def lineage() -> Lineage:
    """
    Yield a sample lineage record with DUMMY_TABLE as the only source and DUMMY_TABLE2 as the target.

    This is a generator function (it yields instead of returning).
    NOTE(review): presumably used as a pytest fixture; the decorator is not visible here.
    """
    sources = [ShellDataAsset(DUMMY_TABLE)]
    target = ShellDataAsset(DUMMY_TABLE2)
    sample = Lineage(
        data_sources=sources,
        data_target=target,
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )
    yield sample
def test_sql1() -> Tuple[str, Lineage]:
    """
    Provide a simple INSERT ... SELECT statement together with the lineage it describes.

    :return: a tuple of the SQL text and a Lineage with ``table3`` as source and ``table5`` as target
    """
    statement = """ insert into table table5 select * from table3 """
    expected_lineage = Lineage(
        data_sources=[ShellDataAsset("table3")],
        data_target=ShellDataAsset("table5"),
    )
    return statement, expected_lineage
def test_sql2() -> Tuple[str, Lineage]:
    """
    Provide an INSERT OVERWRITE statement with a join and a sub-select, plus the lineage it describes.

    :return: a tuple of the SQL text and a Lineage with ``table1``, ``table2`` and ``table3`` as sources
        and ``table4`` as target
    """
    # NOTE(review): the statement has an empty select list, uses "exist", and never closes the
    # sub-select's parenthesis - confirm this malformed input is intentional before touching it.
    statement = """ INSERT OVERWRITE TABLE table4 select from table1 t1 join table2 t2 on t1.key = t2.key where exist (select 1 from table3 as t3 where t3.key = t2.fkey """
    source_names = ["table1", "table2", "table3"]
    expected_lineage = Lineage(
        data_sources=ShellDataAsset.from_names(source_names),
        data_target=ShellDataAsset("table4"),
    )
    return statement, expected_lineage
def _get_lineage_sources_for_target(
        target_name: str) -> Iterable[Lineage]:
    """
    Load the lineage rows stored for ``target_name`` and fold them into Lineage entities.

    Relies on ``self``, ``dag_id`` and ``dag_exec_date`` from the enclosing scope; they are not
    parameters of this function.

    :param target_name: name of the data asset whose lineage rows are selected as target
    :return: a list with one Lineage per consecutive (dag_id, task_id) run of the sorted rows
    """
    query = self.t_lineage.select().where(
        self.t_lineage.c[self.FN_DATA_ASSET_TRG] == target_name)
    rows = pd.read_sql(sql=query, con=self._connection())

    # Optional narrowing by the filters captured from the enclosing scope.
    if dag_id is not None:
        rows = rows.loc[rows[self.FN_DAG_ID] == dag_id]
    if dag_exec_date is not None:
        rows = rows.loc[rows[self.FN_EXEC_DATE] == dag_exec_date]

    # Keep, per DAG, only the rows that belong to its most recent execution date.
    key_columns = [self.FN_DAG_ID, self.FN_EXEC_DATE]
    newest_per_dag = rows.loc[:, key_columns].groupby(self.FN_DAG_ID).max()
    latest_rows: pd.DataFrame = rows.merge(right=newest_per_dag, on=key_columns)

    # Sorting makes rows of the same (dag_id, task_id) adjacent so they can be folded in one pass.
    latest_rows = latest_rows.sort_values(by=[self.FN_DAG_ID, self.FN_TASK_ID])

    prev_dag, prev_task, prev_date = None, None, None
    pending_sources = []
    results = []

    for _, record in latest_rows.iterrows():
        dag_now = record[self.FN_DAG_ID]
        task_now = record[self.FN_TASK_ID]
        date_now = record[self.FN_EXEC_DATE]

        # Close the running group as soon as the (dag_id, task_id) pair changes.
        have_previous = not (prev_dag is None or prev_task is None)
        if have_previous and (dag_now != prev_dag or task_now != prev_task):
            results.append(
                Lineage(
                    data_sources=pending_sources,
                    data_target=ShellDataAsset(target_name),
                    dag_id=prev_dag,
                    task_id=prev_task,
                    dag_exec_date=prev_date,
                ))
            pending_sources = []

        pending_sources.append(ShellDataAsset(record[self.FN_DATA_ASSET_SRC]))
        prev_dag, prev_task, prev_date = dag_now, task_now, date_now

    # The last group is still open once the rows are exhausted.
    if pending_sources:
        results.append(
            Lineage(
                data_sources=pending_sources,
                data_target=ShellDataAsset(target_name),
                dag_id=prev_dag,
                task_id=prev_task,
                dag_exec_date=prev_date,
            ))

    return results
def read_lineage(
    self,
    for_target: BaseDataAsset,
    dag_id: Optional[str] = None,
    dag_exec_date: Optional[datetime] = None,
) -> List[Tuple[Lineage, int]]:
    """
    Read previously logged lineage metadata.

    The search starts at ``for_target`` (level 0) and walks upstream breadth-first: every data
    source found at one level is looked up as a target on the next level. Each asset name is
    queried at most once, so cyclic lineage cannot loop forever. The optional ``dag_id`` and
    ``dag_exec_date`` filters are applied on every level of the search.

    :param for_target: the data asset for which to retrieve lineage information for, here the data target.
    :param dag_id: the DAG ID for which to restrict the lineage search on (optional)
    :param dag_exec_date: the DAG execution datetime for which to restrict the lineage search on (optional)
    :return: the lineage information captured as a list of tuples, in which the first element is a lineage
        metadata entity and the second an int of the upstream level in the lineage chain
    """
    # Local import keeps this method self-contained; deque gives O(1) pops from the left.
    from collections import deque

    self._setup()

    def _build_entry(source_names: List[str], run_dag_id, run_task_id,
                     run_exec_date) -> Tuple[Lineage, List[str]]:
        # Pair the lineage entity with the plain source names so the traversal below
        # does not need to read attributes back from the entity.
        return (
            Lineage(
                data_sources=[ShellDataAsset(name) for name in source_names],
                data_target=ShellDataAsset(target_name_of_entry[0]),
                dag_id=run_dag_id,
                task_id=run_task_id,
                dag_exec_date=run_exec_date,
            ),
            source_names,
        )

    # Single-slot holder for the target currently being aggregated (read by _build_entry).
    target_name_of_entry = [for_target.name]

    def _get_lineage_sources_for_target(
            target_name: str) -> List[Tuple[Lineage, List[str]]]:
        # Returns one (Lineage, source names) pair per (dag_id, task_id) group logged for target_name.
        target_name_of_entry[0] = target_name

        select = self.t_lineage.select().where(
            self.t_lineage.c[self.FN_DATA_ASSET_TRG] == target_name)
        lineage_for_target = pd.read_sql(sql=select, con=self._connection())

        if dag_id is not None:
            lineage_for_target = lineage_for_target.loc[
                lineage_for_target[self.FN_DAG_ID] == dag_id]
        if dag_exec_date is not None:
            lineage_for_target = lineage_for_target.loc[
                lineage_for_target[self.FN_EXEC_DATE] == dag_exec_date]

        # Keep, per DAG, only the rows of its most recent execution date.
        key_columns = [self.FN_DAG_ID, self.FN_EXEC_DATE]
        max_execution_date_per_dag = (
            lineage_for_target.loc[:, key_columns].groupby(self.FN_DAG_ID).max())
        lineage_for_target_latest: pd.DataFrame = lineage_for_target.merge(
            right=max_execution_date_per_dag, on=key_columns)

        # Sorting makes rows of the same (dag_id, task_id) adjacent, so one pass can fold them.
        lineage_for_target_latest = lineage_for_target_latest.sort_values(
            by=[self.FN_DAG_ID, self.FN_TASK_ID])

        last_dag_id, last_task_id, last_exec_date = None, None, None
        found_sources: List[str] = []
        entries = []

        for _, row in lineage_for_target_latest.iterrows():
            cur_dag_id = row[self.FN_DAG_ID]
            cur_task_id = row[self.FN_TASK_ID]
            cur_exec_date = row[self.FN_EXEC_DATE]

            have_previous = last_dag_id is not None and last_task_id is not None
            if have_previous and (cur_dag_id != last_dag_id
                                  or cur_task_id != last_task_id):
                entries.append(
                    _build_entry(found_sources, last_dag_id, last_task_id,
                                 last_exec_date))
                found_sources = []

            found_sources.append(row[self.FN_DATA_ASSET_SRC])
            last_dag_id, last_task_id, last_exec_date = (
                cur_dag_id, cur_task_id, cur_exec_date)

        # The last group is still open once the rows are exhausted.
        if found_sources:
            entries.append(
                _build_entry(found_sources, last_dag_id, last_task_id,
                             last_exec_date))

        return entries

    collected_lineage: List[Tuple[Lineage, int]] = []

    # Breadth-first walk upstream. The previous loop re-queued the lineage entity it had just
    # read (whose target is always the asset that was queried), so it either never moved
    # upstream or kept querying the same target; here the *sources* are followed instead.
    visited = {for_target.name}
    pending = deque([(for_target.name, 0)])

    while pending:
        name_to_query, level = pending.popleft()
        for lineage_entity, source_names in _get_lineage_sources_for_target(
                name_to_query):
            collected_lineage.append((lineage_entity, level))
            for source_name in source_names:
                if source_name not in visited:
                    visited.add(source_name)
                    pending.append((source_name, level + 1))

    return collected_lineage