Example No. 1
def test_afdb_lineage(test_meta_adapter: BaseMetaAdapter,
                      test_db_hook: DbApiHook):
    test_meta_adapter.write_lineage(
        Lineage(
            data_target=ShellDataAsset(DUMMY_TABLE2),
            data_sources=[ShellDataAsset(DUMMY_TABLE)],
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        ))

    results = test_utils.run_on_db(
        test_db_hook=test_db_hook,
        sql=f"""
    select
        {SQLMetaAdapter.FN_DATA_ASSET_SRC},
        {SQLMetaAdapter.FN_METADATA_TIME}
    from {SQLMetaAdapter.TN_LINEAGE}
    where {SQLMetaAdapter.FN_DATA_ASSET_TRG} = '{DUMMY_TABLE2}'
    """,
    )

    assert len(results) == 1
    assert results.loc[:,
                       SQLMetaAdapter.FN_DATA_ASSET_SRC].iloc[0] == DUMMY_TABLE
Example No. 2
def lineage() -> Lineage:
    yield Lineage(
        data_sources=[ShellDataAsset(DUMMY_TABLE)],
        data_target=ShellDataAsset(DUMMY_TABLE2),
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )
Example No. 3
def test_sql1() -> Tuple[str, Lineage]:
    return (
        """
       insert into table table5
       select * from table3

    """,
        Lineage(
            data_sources=[ShellDataAsset("table3")],
            data_target=ShellDataAsset("table5"),
        ),
    )
Example No. 4
def test_afdb_load_status(test_meta_adapter: BaseMetaAdapter,
                          test_db_hook: DbApiHook):
    test_meta_adapter.write_load_status(
        LoadStatus(
            for_asset=ShellDataAsset(DUMMY_TABLE),
            load_time=datetime.now(),
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        ))

    test_meta_adapter.write_load_status(
        LoadStatus(
            for_asset=ShellDataAsset(DUMMY_TABLE),
            load_time=datetime.now() + timedelta(days=1),
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        ))

    test_meta_adapter.write_load_status(
        LoadStatus(
            for_asset=ShellDataAsset(DUMMY_TABLE),
            load_time=datetime.now() + timedelta(days=2),
            dag_id="test_dag",
            task_id="test_task",
            dag_exec_date=datetime.now(),
        ))

    # check row-counts after two updates to the load status metadata table
    predicate = f"{SQLMetaAdapter.FN_DATA_ASSET} = '{DUMMY_TABLE}'"
    rc = table_rowcount(
        test_db_hook=test_db_hook,
        table=SQLMetaAdapter.TN_LOAD_STATUS,
        predicate=predicate,
    )

    # we expect exactly one entry in the master load status table:
    assert rc == 1

    rc_hist = table_rowcount(
        test_db_hook=test_db_hook,
        table=SQLMetaAdapter.TN_LOAD_STATUS_HIST,
        predicate=predicate,
    )
    # we expect at least 2 entries here now:
    assert rc_hist > 1
Example No. 5
def load_status() -> LoadStatus:
    yield LoadStatus(
        for_asset=ShellDataAsset(DUMMY_TABLE2),
        load_time=datetime.now(),
        dag_id="test_dag",
        task_id="test_task",
        dag_exec_date=datetime.now(),
    )
Example No. 6
def test_sql2() -> Tuple[str, Lineage]:
    return (
        """
        INSERT OVERWRITE TABLE table4
        select
            from
             table1 t1 join table2 t2
            on t1.key = t2.key
        where exist (select 1 from table3 as t3 where t3.key = t2.fkey


    """,
        Lineage(
            data_sources=ShellDataAsset.from_names(
                ["table1", "table2", "table3"]),
            data_target=ShellDataAsset("table4"),
        ),
    )
Example No. 7
    def lineage_from_sql_statement(
        statement: str, known_data_assets: Optional[Iterable[str]] = None
    ) -> "Lineage":
        """
        Extract the lineage metadata from a (simple) SQL statement.

        :param statement: the string with the SQL statement
        :param known_data_assets: iterable of known data asset names to restrict the search space for lineage sources
                                  and targets; if not given, all known data assets will be fetched from the
                                  declaration store
        :return: the lineage entity with ShellDataAssets as data source(s) and data target and without
                 additional context (DAG ID, task ID, DAG execution datetime)
        """

        """ Note: simplistic algorithm with room for improvement! (i.e. would not support WITH style CTEs) """

        # ensure we either have received a list of known data assets or can look them up from the declaration store
        if known_data_assets is None:
            known_data_assets = [
                d.asset_name for d in declaration_store.fetch_all_declarations()
            ]

        # parse the affected data assets using their known tokens, focusing on "whole word" tokens

        tokens = [
            t.strip().lower()
            for t in re.compile(r"\s|,|=|\(|\)").split(
                statement.replace("\n", " ").replace("\r", "")
            )
            if len(t) > 2
        ]

        target, sources = None, []

        for tok in tokens:
            if tok in known_data_assets:
                if target is None:
                    target = ShellDataAsset(tok)
                else:
                    sources.append(ShellDataAsset(tok))

        return Lineage(data_sources=sources, data_target=target)
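A minimal usage sketch of the extraction above, under a few assumptions: that lineage_from_sql_statement is exposed as a static method on Lineage (as the indentation and the quoted "Lineage" return annotation suggest), that ShellDataAsset exposes the asset name via .name (as ShellDataAsset(name=...) in the later examples suggests), and that Lineage and ShellDataAsset are already imported as in the surrounding snippets. The table names are made up, and known_data_assets is passed explicitly so no declaration-store lookup is needed; per the note in the function, WITH-style CTEs are not supported, so the sketch sticks to a plain INSERT ... SELECT.

# hypothetical table names; known_data_assets passed explicitly so the declaration store is not touched
sql = """
    insert into table sales_summary
    select s.*, p.category
    from sales s join products p
    on s.product_id = p.product_id
"""

lineage = Lineage.lineage_from_sql_statement(
    statement=sql,
    known_data_assets=["sales", "products", "sales_summary"],
)

# the first known token found becomes the target, every later known token a source
assert lineage.data_target.name == "sales_summary"
assert {src.name for src in lineage.data_sources} == {"sales", "products"}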
Example No. 8
def ingested_file_metadata() -> IngestedFileMetadata:
    yield IngestedFileMetadata(
        for_asset=ShellDataAsset(DUMMY_TABLE),
        filepath="test_path.csv",
        filesize=100,
        file_mod_time=datetime.now(),
        file_create_time=datetime.now(),
        dag_id="test_dag",
        dag_exec_date=datetime.now(),
        task_id="test_task_id",
    )
Example No. 9
def test_shell_data_asset(fake_airflow_context: Dict) -> None:
    shell_data_asset = ShellDataAsset("test_parquet_in_asset")

    with pytest.raises(NotImplementedError):
        shell_data_asset.rebuild_for_store(fake_airflow_context)
    with pytest.raises(NotImplementedError):
        shell_data_asset.retrieve_from_store()

    pandas_data_asset = shell_data_asset.to_full_data_asset(
        target_type=PandasDataAsset)
    assert isinstance(pandas_data_asset, PandasDataAsset)
Example No. 10
def test_pandas_data_asset(
    fake_airflow_context: Dict,
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
) -> None:

    # none, none
    d1 = test_parquet_asset.retrieve_from_store()

    # only airflow_context
    d2 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context)

    # only consuming asset
    d3 = test_parquet_asset.retrieve_from_store(consuming_asset=ShellDataAsset(
        name="test_consumer"))

    # both parameters set
    d4 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context,
        consuming_asset=ShellDataAsset(name="test_consumer"),
    )

    assert d1.equals(d2) and d2.equals(d3) and d3.equals(d4)
Example No. 11
def test_generic_metadata(test_meta_adapter: BaseMetaAdapter):
    payload = {"my_data": "my_value", "test_numerid": 42}
    test_meta_adapter.write_generic_metadata(
        for_asset=ShellDataAsset(DUMMY_TABLE), payload=payload)
Example No. 12
        def _get_lineage_sources_for_target(
                target_name: str) -> Iterable[Lineage]:
            select = self.t_lineage.select().where(
                self.t_lineage.c[self.FN_DATA_ASSET_TRG] == target_name)

            lineage_for_target = pd.read_sql(sql=select,
                                             con=self._connection())

            if dag_id is not None:
                lineage_for_target = lineage_for_target.loc[lineage_for_target[
                    self.FN_DAG_ID] == dag_id]
            if dag_exec_date is not None:
                lineage_for_target = lineage_for_target.loc[lineage_for_target[
                    self.FN_EXEC_DATE] == dag_exec_date]

            max_execution_date_per_dag = (
                lineage_for_target.loc[:, [self.FN_DAG_ID, self.FN_EXEC_DATE]].
                groupby(self.FN_DAG_ID).max())

            lineage_for_target_latest: pd.DataFrame = lineage_for_target.merge(
                right=max_execution_date_per_dag,
                on=[self.FN_DAG_ID, self.FN_EXEC_DATE])

            # we create one lineage entity per unique (dag_id, task_id) combination for this target;
            # sort by dag_id, task_id:
            lineage_for_target_latest = lineage_for_target_latest.sort_values(
                by=[self.FN_DAG_ID, self.FN_TASK_ID])

            # now we aggregate the sources per (dag_id, task_id, dag_exec_date) combination:
            last_dag_id, last_task_id, last_exec_date = None, None, None
            found_sources = []
            lineage_returned = []

            for idx, row in lineage_for_target_latest.iterrows():
                cur_dag_id, cur_task_id, cur_exec_date = (
                    row[self.FN_DAG_ID],
                    row[self.FN_TASK_ID],
                    row[self.FN_EXEC_DATE],
                )
                if last_dag_id is not None and last_task_id is not None:
                    if cur_dag_id != last_dag_id or cur_task_id != last_task_id:
                        lineage_returned.append(
                            Lineage(
                                data_sources=found_sources,
                                data_target=ShellDataAsset(target_name),
                                dag_id=last_dag_id,
                                task_id=last_task_id,
                                dag_exec_date=last_exec_date,
                            ))
                        found_sources = []

                found_sources.append(
                    ShellDataAsset(row[self.FN_DATA_ASSET_SRC]))
                last_dag_id, last_task_id, last_exec_date = (
                    cur_dag_id,
                    cur_task_id,
                    cur_exec_date,
                )
            if len(found_sources) > 0:
                lineage_returned.append(
                    Lineage(
                        data_sources=found_sources,
                        data_target=ShellDataAsset(target_name),
                        dag_id=last_dag_id,
                        task_id=last_task_id,
                        dag_exec_date=last_exec_date,
                    ))

            return lineage_returned
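The comments above capture the core idea: first keep only the rows carrying the latest execution date of each DAG, then collapse the remaining rows into one set of sources per (dag_id, task_id). A minimal pandas-only sketch of that grouping, using made-up column names and toy data instead of the adapter's field-name constants, could look like this:

import pandas as pd

# toy lineage rows for a single target; column names are illustrative stand-ins
lineage_for_target = pd.DataFrame(
    {
        "dag_id": ["dag_a", "dag_a", "dag_a"],
        "task_id": ["load", "load", "load"],
        "dag_exec_date": ["2024-01-01", "2024-01-02", "2024-01-02"],
        "data_asset_source": ["table1", "table1", "table2"],
    }
)

# latest execution date per DAG, then keep only the matching rows
latest_per_dag = (
    lineage_for_target.loc[:, ["dag_id", "dag_exec_date"]]
    .groupby("dag_id")
    .max()
    .reset_index()
)
latest_rows = lineage_for_target.merge(latest_per_dag, on=["dag_id", "dag_exec_date"])

# one list of sources per (dag_id, task_id) combination
sources_per_task = latest_rows.groupby(["dag_id", "task_id"])["data_asset_source"].apply(list)

print(sources_per_task)
# dag_a / load -> ['table1', 'table2']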