def test_afdb_lineage(test_meta_adapter: BaseMetaAdapter, test_db_hook: DbApiHook):
    test_meta_adapter.write_lineage(
        Lineage(
            data_target=ShellDataAsset(DUMMY_TABLE2),
            data_sources=[ShellDataAsset(DUMMY_TABLE)],
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        )
    )

    results = test_utils.run_on_db(
        test_db_hook=test_db_hook,
        sql=f"""
        select {SQLMetaAdapter.FN_DATA_ASSET_SRC}, {SQLMetaAdapter.FN_METADATA_TIME}
        from {SQLMetaAdapter.TN_LINEAGE}
        where {SQLMetaAdapter.FN_DATA_ASSET_TRG} = '{DUMMY_TABLE2}'
        """,
    )

    # exactly one lineage row should exist for the target, pointing back at the source
    assert len(results) == 1
    assert results.loc[:, SQLMetaAdapter.FN_DATA_ASSET_SRC].iloc[0] == DUMMY_TABLE

@pytest.fixture
def lineage() -> Lineage:
    yield Lineage(
        data_sources=[ShellDataAsset(DUMMY_TABLE)],
        data_target=ShellDataAsset(DUMMY_TABLE2),
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )

@pytest.fixture
def test_sql1() -> Tuple[str, Lineage]:
    return (
        """
        insert into table table5
        select * from table3
        """,
        Lineage(
            data_sources=[ShellDataAsset("table3")],
            data_target=ShellDataAsset("table5"),
        ),
    )

def test_afdb_load_status(test_meta_adapter: BaseMetaAdapter, test_db_hook: DbApiHook):
    test_meta_adapter.write_load_status(
        LoadStatus(
            for_asset=ShellDataAsset(DUMMY_TABLE),
            load_time=datetime.now(),
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        )
    )
    test_meta_adapter.write_load_status(
        LoadStatus(
            for_asset=ShellDataAsset(DUMMY_TABLE),
            load_time=datetime.now() + timedelta(days=1),
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        )
    )
    test_meta_adapter.write_load_status(
        LoadStatus(
            for_asset=ShellDataAsset(DUMMY_TABLE),
            load_time=datetime.now() + timedelta(days=2),
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        )
    )

    # check row-counts after two updates to the load status metadata table
    predicate = f"{SQLMetaAdapter.FN_DATA_ASSET} = '{DUMMY_TABLE}'"

    rc = table_rowcount(
        test_db_hook=test_db_hook,
        table=SQLMetaAdapter.TN_LOAD_STATUS,
        predicate=predicate,
    )
    # we expect exactly one entry in the master load status table:
    assert rc == 1

    rc_hist = table_rowcount(
        test_db_hook=test_db_hook,
        table=SQLMetaAdapter.TN_LOAD_STATUS_HIST,
        predicate=predicate,
    )
    # we expect at least 2 entries here now:
    assert rc_hist > 1

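# table_rowcount is a helper defined elsewhere in the test suite. The sketch
# below is a hypothetical reconstruction (name and body assumed from the calls
# above, not taken from the source), built on Airflow's DbApiHook.get_first:
def _table_rowcount_sketch(
    test_db_hook: DbApiHook, table: str, predicate: Optional[str] = None
) -> int:
    # count rows in the given table, optionally narrowed by a where-predicate
    where = f" where {predicate}" if predicate else ""
    return test_db_hook.get_first(f"select count(*) from {table}{where}")[0]
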
@pytest.fixture
def load_status() -> LoadStatus:
    yield LoadStatus(
        for_asset=ShellDataAsset(DUMMY_TABLE2),
        load_time=datetime.now(),
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )

@pytest.fixture
def test_sql2() -> Tuple[str, Lineage]:
    # NB: the statement below is deliberately rough (missing select list,
    # unbalanced parenthesis); the lineage parser only tokenizes known asset names
    return (
        """
        INSERT OVERWRITE TABLE table4
        select from table1 t1 join table2 t2 on t1.key = t2.key
        where exist (select 1 from table3 as t3 where t3.key = t2.fkey
        """,
        Lineage(
            data_sources=ShellDataAsset.from_names(["table1", "table2", "table3"]),
            data_target=ShellDataAsset("table4"),
        ),
    )

def lineage_from_sql_statement(
    statement: str, known_data_assets: Optional[Iterable[str]] = None
) -> "Lineage":
    """
    Extract the lineage metadata from a (simple) SQL statement.

    Note: this is a simplistic algorithm with room for improvement; it would,
    for instance, not support WITH-style CTEs.

    :param statement: the string with the SQL statement
    :param known_data_assets: iterable of known data asset names to restrict the
        search space of lineage sources and targets; if not given, all known data
        assets will be fetched from the declaration store
    :return: the lineage entity with ShellDataAssets as data source(s) and data
        target and without additional context (DAG ID, task ID, DAG execution datetime)
    """
    # ensure we either have received a list of known data assets
    # or can look them up from the declaration store
    if known_data_assets is None:
        known_data_assets = [
            d.asset_name for d in declaration_store.fetch_all_declarations()
        ]

    # parse the affected data assets using their known tokens,
    # focusing on "whole word" tokens
    tokens = [
        t.strip().lower()
        for t in re.compile(r"\s|,|=|\(|\)").split(
            statement.replace("\n", " ").replace("\r", "")
        )
        if len(t) > 2
    ]

    # the first known asset encountered is the target; every further one is a source
    target, sources = None, []
    for tok in tokens:
        if tok in known_data_assets:
            if target is None:
                target = ShellDataAsset(tok)
            else:
                sources.append(ShellDataAsset(tok))

    return Lineage(data_sources=sources, data_target=target)

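# A minimal usage sketch for lineage_from_sql_statement; the asset names are
# hypothetical examples. Passing known_data_assets explicitly skips the
# declaration-store lookup:
#
#   extracted = lineage_from_sql_statement(
#       statement="insert into table table5 select * from table3",
#       known_data_assets=["table3", "table5"],
#   )
#   extracted.data_target.name                # -> "table5"
#   [s.name for s in extracted.data_sources]  # -> ["table3"]
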
@pytest.fixture
def ingested_file_metadata() -> IngestedFileMetadata:
    yield IngestedFileMetadata(
        for_asset=ShellDataAsset(DUMMY_TABLE),
        filepath="test_path.csv",
        filesize=100,
        file_mod_time=datetime.now(),
        file_create_time=datetime.now(),
        dag_id="test_dag",
        dag_exec_date=datetime.now(),
        task_id="test_task_id",
    )

def test_shell_data_asset(fake_airflow_context: Dict) -> None:
    shell_data_asset = ShellDataAsset("test_parquet_in_asset")

    # a ShellDataAsset only carries a name; physical store operations are not implemented
    with pytest.raises(NotImplementedError):
        shell_data_asset.rebuild_for_store(fake_airflow_context)
    with pytest.raises(NotImplementedError):
        shell_data_asset.retrieve_from_store()

    pandas_data_asset = shell_data_asset.to_full_data_asset(target_type=PandasDataAsset)
    assert isinstance(pandas_data_asset, PandasDataAsset)

def test_pandas_data_asset(
    fake_airflow_context: Dict,
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
) -> None:
    # neither optional argument set
    d1 = test_parquet_asset.retrieve_from_store()
    # only airflow_context set
    d2 = test_parquet_asset.retrieve_from_store(airflow_context=fake_airflow_context)
    # only consuming_asset set
    d3 = test_parquet_asset.retrieve_from_store(
        consuming_asset=ShellDataAsset(name="test_consumer")
    )
    # both parameters set
    d4 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context,
        consuming_asset=ShellDataAsset(name="test_consumer"),
    )
    # regardless of the optional arguments, the retrieved DataFrame is identical
    assert d1.equals(d2) and d2.equals(d3) and d3.equals(d4)

def test_generic_metadata(test_meta_adapter: BaseMetaAdapter):
    payload = {"my_data": "my_value", "test_numeric": 42}
    test_meta_adapter.write_generic_metadata(
        for_asset=ShellDataAsset(DUMMY_TABLE), payload=payload
    )

def _get_lineage_sources_for_target(
    self,
    target_name: str,
    dag_id: Optional[str] = None,
    dag_exec_date: Optional[datetime] = None,
) -> Iterable[Lineage]:
    select = self.t_lineage.select().where(
        self.t_lineage.c[self.FN_DATA_ASSET_TRG] == target_name
    )
    lineage_for_target = pd.read_sql(sql=select, con=self._connection())

    # optionally narrow down to a specific DAG and/or execution date
    if dag_id is not None:
        lineage_for_target = lineage_for_target.loc[
            lineage_for_target[self.FN_DAG_ID] == dag_id
        ]
    if dag_exec_date is not None:
        lineage_for_target = lineage_for_target.loc[
            lineage_for_target[self.FN_EXEC_DATE] == dag_exec_date
        ]

    # keep only the rows belonging to the latest execution date per DAG
    max_execution_date_per_dag = (
        lineage_for_target.loc[:, [self.FN_DAG_ID, self.FN_EXEC_DATE]]
        .groupby(self.FN_DAG_ID)
        .max()
    )
    lineage_for_target_latest: pd.DataFrame = lineage_for_target.merge(
        right=max_execution_date_per_dag, on=[self.FN_DAG_ID, self.FN_EXEC_DATE]
    )

    # we create a lineage entity for each unique dag_id, task_id combination
    # for this target, so sort by dag_id, task_id first:
    lineage_for_target_latest = lineage_for_target_latest.sort_values(
        by=[self.FN_DAG_ID, self.FN_TASK_ID]
    )

    # now we aggregate the sources per dag_id, task_id, dag_exec_date combination:
    last_dag_id, last_task_id, last_exec_date = None, None, None
    found_sources = []
    lineage_returned = []

    for idx, row in lineage_for_target_latest.iterrows():
        cur_dag_id, cur_task_id, cur_exec_date = (
            row[self.FN_DAG_ID],
            row[self.FN_TASK_ID],
            row[self.FN_EXEC_DATE],
        )
        if last_dag_id is not None and last_task_id is not None:
            # a new (dag_id, task_id) group begins: flush the sources collected so far
            if cur_dag_id != last_dag_id or cur_task_id != last_task_id:
                lineage_returned.append(
                    Lineage(
                        data_sources=found_sources,
                        data_target=ShellDataAsset(target_name),
                        dag_id=last_dag_id,
                        task_id=last_task_id,
                        dag_exec_date=last_exec_date,
                    )
                )
                found_sources = []
        found_sources.append(ShellDataAsset(row[self.FN_DATA_ASSET_SRC]))
        last_dag_id, last_task_id, last_exec_date = (
            cur_dag_id,
            cur_task_id,
            cur_exec_date,
        )

    # flush the final group
    if len(found_sources) > 0:
        lineage_returned.append(
            Lineage(
                data_sources=found_sources,
                data_target=ShellDataAsset(target_name),
                dag_id=last_dag_id,
                task_id=last_task_id,
                dag_exec_date=last_exec_date,
            )
        )

    return lineage_returned
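
# Illustrative sketch with hypothetical data: given lineage rows for target
# "table_t" written by (dag_a, task_1) with two sources and by (dag_b, task_2)
# with one source, the grouping loop above emits one Lineage entity per
# (dag_id, task_id) pair, each carrying that pair's latest execution date:
#
#   adapter._get_lineage_sources_for_target("table_t")
#   # -> [Lineage(data_sources=[ShellDataAsset("src_1"), ShellDataAsset("src_2")],
#   #             data_target=ShellDataAsset("table_t"),
#   #             dag_id="dag_a", task_id="task_1", dag_exec_date=...),
#   #     Lineage(data_sources=[ShellDataAsset("src_3")],
#   #             data_target=ShellDataAsset("table_t"),
#   #             dag_id="dag_b", task_id="task_2", dag_exec_date=...)]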