def test_lineage_backend(mock_emit):
    """End-to-end check that the DataHub lineage backend emits valid MCEs.

    Configures the lineage backend through environment variables, runs the
    pre/post lineage hooks around a dummy task with one inlet and one outlet,
    and verifies that a batch of four valid MCEs was emitted.
    """
    # Airflow 2.x does not have lineage backend support merged back in yet.
    # As such, we must protect these imports.
    from airflow.lineage import apply_lineage, prepare_lineage

    from datahub.integrations.airflow.entities import Dataset

    run_date = days_ago(2)

    backend_env = {
        "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
        "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
    }
    with mock.patch.dict(os.environ, backend_env):
        with patch_airflow_connection(datahub_rest_connection_config):
            task_callable = mock.Mock()
            task_callable.__name__ = "foo"

            dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=run_date)

            with dag:
                op1 = DummyOperator(task_id="task1")

            consumed = Dataset("snowflake", "mydb.schema.tableConsumed")
            produced = Dataset("snowflake", "mydb.schema.tableProduced")
            op1.inlets.append(consumed)
            op1.outlets.append(produced)

            ti = TI(task=op1, execution_date=run_date)
            task_context = {
                "dag": dag,
                "task": op1,
                "ti": ti,
                "task_instance": ti,
                "execution_date": run_date,
                "ts": "2021-04-08T00:54:25.771575+00:00",
            }

            # Mirror what Airflow does around task execution: run the
            # pre-execution and post-execution lineage hooks in order.
            prepare_lineage(task_callable)(op1, task_context)
            apply_lineage(task_callable)(op1, task_context)

            # The backend should have emitted exactly one batch of four
            # valid MCEs.
            mock_emit.assert_called_once()
            emitted_mces = mock_emit.call_args[0][0]
            assert len(emitted_mces) == 4
            assert all(mce.validate() for mce in emitted_mces)
# Shared defaults applied to the tasks of the demo DAG below via the DAG's
# `default_args` parameter.
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "execution_timeout": timedelta(minutes=5),
}

# Example DAG showing how task-level lineage (inlets/outlets) is declared so
# that DataHub's Airflow lineage backend can pick it up.
with DAG(
    "datahub_lineage_backend_demo",
    default_args=default_args,
    description="An example DAG demonstrating the usage of DataHub's Airflow lineage backend.",
    schedule_interval=timedelta(days=1),
    start_date=days_ago(2),
    catchup=False,
) as dag:
    task1 = BashOperator(
        task_id="run_data_task",
        dag=dag,
        bash_command="echo 'This is where you might run your data tooling.'",
        # Dictionary form with a "datasets" key — the structure Airflow 1.10.x
        # uses for inlets/outlets (Airflow 2.x also accepts a flat list).
        inlets={
            "datasets": [
                Dataset("snowflake", "mydb.schema.tableA"),
                Dataset("snowflake", "mydb.schema.tableB"),
            ],
        },
        outlets={"datasets": [Dataset("snowflake", "mydb.schema.tableC")]},
    )
# in the Airflow UI, where it will be even more clear if something # is wrong. hook.get_connection_form_widgets() hook.get_ui_field_behaviour() @pytest.mark.parametrize( "inlets,outlets", [ ( # Airflow 1.10.x uses a dictionary structure for inlets and outlets. # We want the lineage backend to support this structure for backwards # compatibility reasons, so this test is not conditional. { "datasets": [Dataset("snowflake", "mydb.schema.tableConsumed")] }, { "datasets": [Dataset("snowflake", "mydb.schema.tableProduced")] }, ), pytest.param( # Airflow 2.x also supports a flattened list for inlets and outlets. # We want to test this capability. [Dataset("snowflake", "mydb.schema.tableConsumed")], [Dataset("snowflake", "mydb.schema.tableProduced")], marks=pytest.mark.skipif( airflow.version.version.startswith("1"), reason="list-style lineage is only supported in Airflow 2.x", ),
def test_hook_airflow_ui(hook):
    """Smoke-test the hook's Airflow UI integration points (no assertions)."""
    # Simply ensure that these run without issue. These will also show up
    # in the Airflow UI, where it will be even more clear if something
    # is wrong.
    hook.get_connection_form_widgets()
    hook.get_ui_field_behaviour()


# Parametrize over the two accepted inlet/outlet declaration styles; the
# decorator continues beyond this chunk.
@pytest.mark.parametrize(
    ["inlets", "outlets"],
    [
        (
            # Airflow 1.10.x uses a dictionary structure for inlets and outlets.
            # We want the lineage backend to support this structure for backwards
            # compatibility reasons, so this test is not conditional.
            {"datasets": [Dataset("snowflake", "mydb.schema.tableConsumed")]},
            {"datasets": [Dataset("snowflake", "mydb.schema.tableProduced")]},
        ),
        pytest.param(
            # Airflow 2.x also supports a flattened list for inlets and outlets.
            # We want to test this capability.
            [Dataset("snowflake", "mydb.schema.tableConsumed")],
            [Dataset("snowflake", "mydb.schema.tableProduced")],
            marks=pytest.mark.skipif(
                airflow.version.version.startswith("1"),
                reason="list-style lineage is only supported in Airflow 2.x",
            ),
        ),
    ],
    ids=[
        "airflow-1-10-x-decl",