def test_lineage_is_sent_to_backend(self, mock_get_backend):
    """The lineage backend must receive exactly one inlet and one outlet."""

    class TestBackend(LineageBackend):
        # The verification happens inside the backend callback itself:
        # apply_lineage below triggers this with the resolved lets.
        def send_lineage(self, operator, inlets=None, outlets=None, context=None):
            assert len(inlets) == 1
            assert len(outlets) == 1

    wrapped = mock.Mock()
    wrapped.__name__ = 'foo'
    mock_get_backend.return_value = TestBackend()

    dag = DAG(dag_id='test_lineage_is_sent_to_backend', start_date=DEFAULT_DATE)
    with dag:
        op1 = DummyOperator(task_id='task1')

    # The same File entity serves as both inlet and outlet.
    shared_file = File("/tmp/some_file")
    op1.inlets.append(shared_file)
    op1.outlets.append(shared_file)

    context = {
        "ti": TI(task=op1, execution_date=DEFAULT_DATE),
        "execution_date": DEFAULT_DATE,
    }

    # Drive both lineage phases; the second one invokes send_lineage above.
    prepare_lineage(wrapped)(op1, context)
    apply_lineage(wrapped)(op1, context)
def test_lineage_backend(mock_emit, inlets, outlets):
    """End-to-end check that the DataHub lineage backend emits four valid MCEs."""
    DEFAULT_DATE = days_ago(2)

    lineage_env = {
        "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
        "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
        "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({"graceful_exceptions": False}),
    }

    with mock.patch.dict(os.environ, lineage_env), \
            mock.patch("airflow.models.BaseOperator.xcom_pull", autospec=True), \
            mock.patch("airflow.models.BaseOperator.xcom_push", autospec=True), \
            patch_airflow_connection(datahub_rest_connection_config):
        wrapped = mock.Mock()
        wrapped.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)
        with dag:
            op1 = DummyOperator(
                task_id="task1",
                inlets=inlets,
                outlets=outlets,
            )

        task_instance = TI(task=op1, execution_date=DEFAULT_DATE)
        context = {
            "dag": dag,
            "task": op1,
            "ti": task_instance,
            "task_instance": task_instance,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        # Drive both halves of Airflow's lineage machinery.
        prepare_lineage(wrapped)(op1, context)
        apply_lineage(wrapped)(op1, context)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op1.inlets) == 1
        assert len(op1.outlets) == 1
        assert all(isinstance(entity, Dataset) for entity in op1.inlets)
        assert all(isinstance(entity, Dataset) for entity in op1.outlets)

        # Check that the right things were emitted: one call, four valid MCEs.
        mock_emit.assert_called_once()
        emitted = mock_emit.call_args[0][0]
        assert len(emitted) == 4
        assert all(mce.validate() for mce in emitted)
def test_lineage_backend(mock_emit, mock_xcom_push, inlets, outlets):
    """The lineage backend must push lets via xcom and emit four valid MCEs."""
    DEFAULT_DATE = days_ago(2)

    lineage_env = {
        "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
        "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
    }

    with mock.patch.dict(os.environ, lineage_env), \
            patch_airflow_connection(datahub_rest_connection_config):
        wrapped = mock.Mock()
        wrapped.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)
        with dag:
            op1 = DummyOperator(
                task_id="task1",
                inlets=inlets,
                outlets=outlets,
            )

        task_instance = TI(task=op1, execution_date=DEFAULT_DATE)
        context = {
            "dag": dag,
            "task": op1,
            "ti": task_instance,
            "task_instance": task_instance,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        # Drive both halves of Airflow's lineage machinery.
        prepare_lineage(wrapped)(op1, context)
        apply_lineage(wrapped)(op1, context)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op1.inlets) == 1
        assert len(op1.outlets) == 1
        assert all(isinstance(entity, Dataset) for entity in op1.inlets)
        assert all(isinstance(entity, Dataset) for entity in op1.outlets)

        # Verify xcom push calls are correct:
        # two calls, one for inlets and the other for outlets.
        assert mock_xcom_push.call_count == 2

        # Check that the right things were emitted: one call, four valid MCEs.
        mock_emit.assert_called_once()
        emitted = mock_emit.call_args[0][0]
        assert len(emitted) == 4
        assert all(mce.validate() for mce in emitted)
def test_lineage_backend(mock_emit):
    """Manually attached Dataset lets must be emitted as four valid MCEs."""
    # Airflow 2.x does not have lineage backend support merged back in yet.
    # As such, we must protect these imports.
    from airflow.lineage import apply_lineage, prepare_lineage
    from datahub.integrations.airflow.entities import Dataset

    DEFAULT_DATE = days_ago(2)

    lineage_env = {
        "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
        "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
    }

    with mock.patch.dict(os.environ, lineage_env), \
            patch_airflow_connection(datahub_rest_connection_config):
        wrapped = mock.Mock()
        wrapped.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)
        with dag:
            op1 = DummyOperator(task_id="task1")

        # Attach one consumed and one produced snowflake table by hand.
        consumed = Dataset("snowflake", "mydb.schema.tableConsumed")
        produced = Dataset("snowflake", "mydb.schema.tableProduced")
        op1.inlets.append(consumed)
        op1.outlets.append(produced)

        task_instance = TI(task=op1, execution_date=DEFAULT_DATE)
        context = {
            "dag": dag,
            "task": op1,
            "ti": task_instance,
            "task_instance": task_instance,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        # Drive both halves of Airflow's lineage machinery.
        prepare_lineage(wrapped)(op1, context)
        apply_lineage(wrapped)(op1, context)

        # Check that the right things were emitted: one call, four valid MCEs.
        mock_emit.assert_called_once()
        emitted = mock_emit.call_args[0][0]
        assert len(emitted) == 4
        assert all(mce.validate() for mce in emitted)
def test_lineage(self, _get_backend):
    """Exercise prepare_lineage/apply_lineage over a small DAG covering all
    three inlet declaration styles: explicit "datasets", "auto", and "task_ids".

    DAG shape: leave1/leave2 -> upstream_level_1 -> upstream_level_2 ->
    upstream_level_3, with f1..f3 as fake File entities (the paths never
    need to exist on disk).
    """
    # Replace the configured lineage backend with a mock so we can count
    # send_lineage invocations without touching any real service.
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock
    _get_backend.return_value = backend

    dag = DAG(
        dag_id='test_prepare_lineage',
        start_date=DEFAULT_DATE
    )
    f1 = File("/tmp/does_not_exist_1")
    f2 = File("/tmp/does_not_exist_2")
    f3 = File("/tmp/does_not_exist_3")

    with dag:
        op1 = DummyOperator(task_id='leave1',
                            inlets={"datasets": [f1, ]},
                            outlets={"datasets": [f2, ]})
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1',
                            inlets={"auto": True},
                            outlets={"datasets": [f3, ]})
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3',
                            inlets={"task_ids": ["leave1", "upstream_level_1"]})
        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op3.set_downstream(op4)
        op4.set_downstream(op5)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}

    func = mock.Mock()
    func.__name__ = 'foo'

    # prepare with manual inlets and outlets: op1 resolves to exactly the
    # File entities it declared.
    prep = prepare_lineage(func)
    prep(op1, ctx1)
    self.assertEqual(len(op1.inlets), 1)
    self.assertEqual(op1.inlets[0], f1)
    self.assertEqual(len(op1.outlets), 1)
    self.assertEqual(op1.outlets[0], f2)

    # apply_lineage forwards to the (mocked) backend exactly once per task.
    post = apply_lineage(func)
    post(op1, ctx1)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # op2 declares nothing, so prepare resolves no inlets for it — but the
    # backend is still invoked once.
    prep(op2, ctx2)
    self.assertEqual(len(op2.inlets), 0)
    post(op2, ctx2)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # "auto" on op3 picks up its upstream's outlet (op1's f2); op2
    # contributes nothing since it has no outlets.
    prep(op3, ctx3)
    self.assertEqual(len(op3.inlets), 1)
    self.assertEqual(op3.inlets[0].qualified_name, f2.qualified_name)
    post(op3, ctx3)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # skip 4
    # "task_ids" on op5 pulls the outlets of the two named tasks
    # (leave1 -> f2, upstream_level_1 -> f3), hence two inlets.
    prep(op5, ctx5)
    self.assertEqual(len(op5.inlets), 2)
    post(op5, ctx5)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()
def test_lineage_auto_branching(self, _get_backend):
    """Auto-inlet resolution must skip tasks that do not affect state.

    DAG diagram:
        1--->2---->4
             ▼     ▲
             3-----+
    """
    # Swap in a mocked backend so send_lineage calls can be counted.
    send_mock = mock.Mock()
    backend = mock.Mock()
    backend.send_lineage = send_mock
    _get_backend.return_value = backend

    dag = DAG(
        dag_id='test_prepare_lineage_auto_branching',
        start_date=DEFAULT_DATE
    )
    f1 = File("/tmp/does_not_exist_1")

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='branch_1', outlets={"datasets": [f1, ]})
        op3 = DummyOperator(task_id='branch_2')
        op4 = DummyOperator(task_id='upstream_level_2', inlets={"auto": True})
        op1.set_downstream(op2)
        op2.set_downstream(op3)
        op2.set_downstream(op4)
        op3.set_downstream(op4)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx4 = {"ti": TI(task=op4, execution_date=DEFAULT_DATE)}

    wrapped = mock.Mock()
    wrapped.__name__ = 'foo'

    prepared = prepare_lineage(wrapped)
    applied = apply_lineage(wrapped)

    # op1 declares nothing, so prepare resolves no inlets.
    prepared(op1, ctx1)
    self.assertEqual(len(op1.inlets), 0)
    applied(op1, ctx1)
    send_mock.reset_mock()

    # op2 only declares an outlet; its inlets stay empty.
    prepared(op2, ctx2)
    self.assertEqual(len(op2.inlets), 0)
    applied(op2, ctx2)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # op3 sits between op2 and op4 but has no lets of its own.
    prepared(op3, ctx3)
    self.assertEqual(len(op3.inlets), 0)
    applied(op3, ctx3)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # op4 uses "auto": it must pick up op2's outlet, skipping op3.
    prepared(op4, ctx4)
    self.assertEqual(len(op4.inlets), 1)
    self.assertEqual(op4.inlets[0].name, f1.name)
    applied(op4, ctx4)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()
def test_lineage_complicated_dag(self, _get_backend):
    """Auto-inlet resolution across a branching DAG with multiple sources.

    Tests the ability for the auto feature to skip non state affecting
    operators while still retrieving data from multiple outlet sources.
    When a task declares no outlets, the auto feature keeps traversing up
    the DAG until input sources are found.
    """
    # DAG diagram:
    #  1-----------+
    #              |
    #              ▼
    #              4 ----------+
    #              ▲           ▼
    #              |           5+-------->6
    #  2-----------+           ▲
    #                          |
    #  3-----------------------+
    #
    # Swap in a mocked backend so send_lineage calls go nowhere real.
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock
    _get_backend.return_value = backend
    dag = DAG(
        dag_id='test_prepare_lineage_auto_complicated_dag',
        start_date=DEFAULT_DATE
    )
    f1 = File("/tmp/does_not_exist_1")
    f2 = File("/tmp/does_not_exist_2")
    f3 = File("/tmp/does_not_exist_3")
    with dag:
        op1 = DummyOperator(task_id='leave1',
                            outlets={"datasets": [f1, ]},
                            inlets={"auto": True})
        op2 = DummyOperator(task_id='leave2',
                            outlets={"datasets": [f2, ]})
        op3 = DummyOperator(task_id='leave3',
                            outlets={"datasets": [f3, ]})
        op4 = DummyOperator(task_id='upstream_level_1')
        op5 = DummyOperator(task_id='upstream_level_2', inlets={"auto": True})
        op6 = DummyOperator(task_id='upstream_level_3', inlets={"auto": True})
        op1.set_downstream(op4)
        op2.set_downstream(op4)
        op3.set_downstream(op5)
        op4.set_downstream(op5)
        op5.set_downstream(op6)
    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx4 = {"ti": TI(task=op4, execution_date=DEFAULT_DATE)}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}
    ctx6 = {"ti": TI(task=op6, execution_date=DEFAULT_DATE)}
    func = mock.Mock()
    func.__name__ = 'foo'
    # prepare with manual inlets and outlets: op1 is a root, so "auto"
    # resolves to no inlets, but its declared outlet sticks.
    prep = prepare_lineage(func)
    prep(op1, ctx1)
    self.assertEqual(len(op1.outlets), 1)
    self.assertEqual(op1.outlets[0], f1)
    self.assertEqual(len(op1.inlets), 0)
    # post process with the mocked backend
    post = apply_lineage(func)
    post(op1, ctx1)
    # op2 and op3 just keep the single outlet each declared.
    prep(op2, ctx2)
    self.assertEqual(len(op2.outlets), 1)
    post(op2, ctx2)
    prep(op3, ctx3)
    self.assertEqual(len(op3.outlets), 1)
    post(op3, ctx3)
    # op4 declares nothing at all, so nothing is resolved for it.
    prep(op4, ctx4)
    self.assertEqual(len(op4.inlets), 0)
    post(op4, ctx4)
    # op5's "auto" gathers f3 from op3 directly, and f1/f2 through the
    # outlet-less op4 (traversal continues past it).
    prep(op5, ctx5)
    self.assertEqual(len(op5.inlets), 3)
    self.assertEqual({file.qualified_name for file in op5.inlets},
                     {'file:///tmp/does_not_exist_1',
                      'file:///tmp/does_not_exist_2',
                      'file:///tmp/does_not_exist_3'})
    post(op5, ctx5)
    # op6's "auto" likewise traverses past op5 (no outlets) and sees the
    # same three files.
    prep(op6, ctx6)
    self.assertEqual(len(op6.inlets), 3)
    self.assertEqual({file.qualified_name for file in op6.inlets},
                     {'file:///tmp/does_not_exist_1',
                      'file:///tmp/does_not_exist_2',
                      'file:///tmp/does_not_exist_3'})
    post(op6, ctx6)
def test_lineage_backend(mock_emit, inlets, outlets):
    """Lineage of the downstream task in a two-task DAG emits four valid MCEs."""
    DEFAULT_DATE = days_ago(2)

    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    lineage_env = {
        "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
        "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
        "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({"graceful_exceptions": False}),
    }

    with mock.patch.dict(os.environ, lineage_env), \
            mock.patch("airflow.models.BaseOperator.xcom_pull"), \
            mock.patch("airflow.models.BaseOperator.xcom_push"), \
            patch_airflow_connection(datahub_rest_connection_config):
        wrapped = mock.Mock()
        wrapped.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)
        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow <= 2.1 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        old_airflow = any(
            airflow.version.version.startswith(prefix)
            for prefix in ["1", "2.0", "2.1"])
        if old_airflow:
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
        else:
            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")

        context = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        # Drive both halves of Airflow's lineage machinery on the downstream task.
        prepare_lineage(wrapped)(op2, context)
        apply_lineage(wrapped)(op2, context)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(isinstance(entity, Dataset) for entity in op2.inlets)
        assert all(isinstance(entity, Dataset) for entity in op2.outlets)

        # Check that the right things were emitted: one call, four valid MCEs.
        mock_emit.assert_called_once()
        emitted = mock_emit.call_args[0][0]
        assert len(emitted) == 4
        assert all(mce.validate() for mce in emitted)
def test_lineage_backend_capture_executions(mock_emit, inlets, outlets):
    """With capture_executions enabled, the backend must emit the full set of
    flow/job/dataset/run aspects (17 emitter calls) for a completed task run.
    """
    DEFAULT_DATE = datetime.datetime(2020, 5, 17)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter
    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({
                "graceful_exceptions": False,
                "capture_executions": True
            }),
        },
    ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch(
            "airflow.models.BaseOperator.xcom_push"), patch_airflow_connection(
            datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"
        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)
        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2
        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
            # Ignoring type here because DagRun state is just a string at Airflow 1
            dag_run = DagRun(
                state="success",
                run_id=f"scheduled_{DEFAULT_DATE}")  # type: ignore
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE
        else:
            from airflow.utils.state import DagRunState
            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
            dag_run = DagRun(state=DagRunState.SUCCESS,
                             run_id=f"scheduled_{DEFAULT_DATE}")
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE
        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "dag_run": dag_run,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }
        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)
        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))
        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 17
        # Running further checks based on python version because args only exists in python 3.7+
        # NOTE(review): mock call objects gained the .args property in Python 3.8,
        # not 3.7 — confirm whether this guard (and comment) should say 3.8.
        if sys.version_info[:3] > (3, 7):
            # Calls 0-2: dataFlow-level aspects (info, ownership, tags).
            assert mock_emitter.method_calls[0].args[
                0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[1].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[2].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            # Calls 3-4: dataJob info plus its input/output lineage (upstream
            # job, consumed and produced snowflake tables).
            assert mock_emitter.method_calls[3].args[
                0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (mock_emitter.method_calls[4].args[0].aspectName ==
                    "dataJobInputOutput")
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0] ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            # Calls 5-6: status aspects for both datasets.
            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            # Calls 7-8: job-level ownership and tags.
            assert mock_emitter.method_calls[7].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[8].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            # Calls 9-12: dataProcessInstance aspects for the captured run
            # (properties, relationships, input, output).
            assert (mock_emitter.method_calls[9].args[0].aspectName ==
                    "dataProcessInstanceProperties")
            assert (
                mock_emitter.method_calls[9].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[10].args[0].aspectName ==
                    "dataProcessInstanceRelationships")
            assert (
                mock_emitter.method_calls[10].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[11].args[0].aspectName ==
                    "dataProcessInstanceInput")
            assert (
                mock_emitter.method_calls[11].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[12].args[0].aspectName ==
                    "dataProcessInstanceOutput")
            assert (
                mock_emitter.method_calls[12].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            # Calls 13-14: dataset status aspects repeated for the run.
            assert mock_emitter.method_calls[13].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[13].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[14].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[14].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            # Calls 15-16: run-event aspects for the process instance.
            assert (mock_emitter.method_calls[15].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[15].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[16].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[16].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")