def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,  # unused
    outlets: Optional[List] = None,  # unused
    context: Optional[Dict] = None,
) -> None:
    config = get_lineage_config()

    try:
        # This is necessary to avoid issues with circular imports.
        from airflow.lineage import prepare_lineage

        from datahub_provider.hooks.datahub import AIRFLOW_1

        # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
        # convert to the newer version. This code path will only be triggered
        # when 2.x receives a 1.10.x inlet/outlet config.
        needs_repeat_preparation = False
        if (
            not AIRFLOW_1
            and isinstance(operator._inlets, list)
            and len(operator._inlets) == 1
            and isinstance(operator._inlets[0], dict)
        ):
            from airflow.lineage import AUTO

            operator._inlets = [
                # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
                *operator._inlets[0].get(
                    "datasets", []
                ),  # assumes these are attr-annotated
                *operator._inlets[0].get("task_ids", []),
                *([AUTO] if operator._inlets[0].get("auto", False) else []),
            ]
            needs_repeat_preparation = True
        if (
            not AIRFLOW_1
            and isinstance(operator._outlets, list)
            and len(operator._outlets) == 1
            and isinstance(operator._outlets[0], dict)
        ):
            operator._outlets = [*operator._outlets[0].get("datasets", [])]
            needs_repeat_preparation = True
        if needs_repeat_preparation:
            # Rerun the lineage preparation routine, now that the old format
            # has been translated to the new one.
            prepare_lineage(lambda self, ctx: None)(operator, context)

        context = context or {}  # ensure not None to satisfy mypy

        send_lineage_to_datahub(
            config, operator, operator.inlets, operator.outlets, context
        )
    except Exception as e:
        if config.graceful_exceptions:
            operator.log.error(e)
            operator.log.info(
                "Suppressing error because graceful_exceptions is set"
            )
        else:
            raise
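# A minimal, self-contained sketch (no Airflow required) of the dict-to-list
# translation that send_lineage performs above. Here AUTO stands in for
# airflow.lineage.AUTO (whose value is "auto"), and the dataset/task names are
# hypothetical examples, not part of the backend itself.
AUTO = "auto"

legacy_inlets = [
    # Airflow 1.10.x-style inlet config, as documented at
    # https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
    {"datasets": ["mydb.schema.tableConsumed"], "task_ids": ["upstream_task"], "auto": True}
]

# The same spread expressions used in the translation branch above.
converted_inlets = [
    *legacy_inlets[0].get("datasets", []),
    *legacy_inlets[0].get("task_ids", []),
    *([AUTO] if legacy_inlets[0].get("auto", False) else []),
]
# The flat list form is what Airflow 2.x's prepare_lineage expects.
assert converted_inlets == ["mydb.schema.tableConsumed", "upstream_task", AUTO]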
def test_lineage_is_sent_to_backend(self, mock_get_backend):
    class TestBackend(LineageBackend):
        def send_lineage(self, operator, inlets=None, outlets=None, context=None):
            assert len(inlets) == 1
            assert len(outlets) == 1

    func = mock.Mock()
    func.__name__ = 'foo'

    mock_get_backend.return_value = TestBackend()

    dag = DAG(dag_id='test_lineage_is_sent_to_backend', start_date=DEFAULT_DATE)

    with dag:
        op1 = DummyOperator(task_id='task1')

    file1 = File("/tmp/some_file")

    op1.inlets.append(file1)
    op1.outlets.append(file1)

    ctx1 = {
        "ti": TI(task=op1, execution_date=DEFAULT_DATE),
        "execution_date": DEFAULT_DATE,
    }

    prep = prepare_lineage(func)
    prep(op1, ctx1)
    post = apply_lineage(func)
    post(op1, ctx1)
def test_lineage_backend(mock_emit, inlets, outlets):
    DEFAULT_DATE = days_ago(2)

    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({"graceful_exceptions": False}),
        },
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_pull", autospec=True
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_push", autospec=True
    ), patch_airflow_connection(
        datahub_rest_connection_config
    ):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1",
                inlets=inlets,
                outlets=outlets,
            )

        ti = TI(task=op1, execution_date=DEFAULT_DATE)
        ctx1 = {
            "dag": dag,
            "task": op1,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces it to.
        assert len(op1.inlets) == 1
        assert len(op1.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op1.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op1.outlets))

        # Check that the right things were emitted.
        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
def test_lineage_backend(mock_emit, mock_xcom_push, inlets, outlets):
    DEFAULT_DATE = days_ago(2)

    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
        },
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1",
                inlets=inlets,
                outlets=outlets,
            )

        ti = TI(task=op1, execution_date=DEFAULT_DATE)
        ctx1 = {
            "dag": dag,
            "task": op1,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces it to.
        assert len(op1.inlets) == 1
        assert len(op1.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op1.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op1.outlets))

        # Verify that the xcom push calls are correct:
        # two calls, one for inlets and the other for outlets.
        assert mock_xcom_push.call_count == 2

        # Check that the right things were emitted.
        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
def test_lineage_backend(mock_emit):
    # Airflow 2.x does not have lineage backend support merged back in yet.
    # As such, we must protect these imports.
    from airflow.lineage import apply_lineage, prepare_lineage

    from datahub.integrations.airflow.entities import Dataset

    DEFAULT_DATE = days_ago(2)

    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub.integrations.airflow.DatahubAirflowLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
        },
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(task_id="task1")

        upstream = Dataset("snowflake", "mydb.schema.tableConsumed")
        downstream = Dataset("snowflake", "mydb.schema.tableProduced")

        op1.inlets.append(upstream)
        op1.outlets.append(downstream)

        ti = TI(task=op1, execution_date=DEFAULT_DATE)
        ctx1 = {
            "dag": dag,
            "task": op1,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op1, ctx1)
        post = apply_lineage(func)
        post(op1, ctx1)

        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
def test_lineage(self, _get_backend):
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock

    _get_backend.return_value = backend

    dag = DAG(
        dag_id='test_prepare_lineage',
        start_date=DEFAULT_DATE
    )

    f1 = File("/tmp/does_not_exist_1")
    f2 = File("/tmp/does_not_exist_2")
    f3 = File("/tmp/does_not_exist_3")

    with dag:
        op1 = DummyOperator(task_id='leave1',
                            inlets={"datasets": [f1, ]},
                            outlets={"datasets": [f2, ]})
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1',
                            inlets={"auto": True},
                            outlets={"datasets": [f3, ]})
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(task_id='upstream_level_3',
                            inlets={"task_ids": ["leave1", "upstream_level_1"]})

        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op3.set_downstream(op4)
        op4.set_downstream(op5)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}

    func = mock.Mock()
    func.__name__ = 'foo'

    # prepare with manual inlets and outlets
    prep = prepare_lineage(func)
    prep(op1, ctx1)

    self.assertEqual(len(op1.inlets), 1)
    self.assertEqual(op1.inlets[0], f1)

    self.assertEqual(len(op1.outlets), 1)
    self.assertEqual(op1.outlets[0], f2)

    # post process with no backend
    post = apply_lineage(func)
    post(op1, ctx1)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op2, ctx2)
    self.assertEqual(len(op2.inlets), 0)
    post(op2, ctx2)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op3, ctx3)
    self.assertEqual(len(op3.inlets), 1)
    self.assertEqual(op3.inlets[0].qualified_name, f2.qualified_name)
    post(op3, ctx3)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # skip 4

    prep(op5, ctx5)
    self.assertEqual(len(op5.inlets), 2)
    post(op5, ctx5)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()
def test_lineage_auto_branching(self, _get_backend):
    # Tests the ability of the auto feature to skip non-state-affecting operators.
    # DAG diagram:
    # 1--->2---->4
    #      ▼     ▲
    #      3-----+
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock

    _get_backend.return_value = backend

    dag = DAG(
        dag_id='test_prepare_lineage_auto_branching',
        start_date=DEFAULT_DATE
    )

    f1 = File("/tmp/does_not_exist_1")

    with dag:
        op1 = DummyOperator(task_id='leave1')
        op2 = DummyOperator(task_id='branch_1', outlets={"datasets": [f1, ]})
        op3 = DummyOperator(task_id='branch_2')
        op4 = DummyOperator(task_id='upstream_level_2', inlets={"auto": True})

        op1.set_downstream(op2)
        op2.set_downstream(op3)
        op2.set_downstream(op4)
        op3.set_downstream(op4)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx4 = {"ti": TI(task=op4, execution_date=DEFAULT_DATE)}

    func = mock.Mock()
    func.__name__ = 'foo'

    # prepare with manual inlets and outlets
    prep = prepare_lineage(func)
    prep(op1, ctx1)

    self.assertEqual(len(op1.inlets), 0)

    # post process with no backend
    post = apply_lineage(func)
    post(op1, ctx1)
    send_mock.reset_mock()

    prep(op2, ctx2)
    self.assertEqual(len(op2.inlets), 0)
    post(op2, ctx2)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op3, ctx3)
    self.assertEqual(len(op3.inlets), 0)
    post(op3, ctx3)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op4, ctx4)
    self.assertEqual(len(op4.inlets), 1)
    self.assertEqual(op4.inlets[0].name, f1.name)
    post(op4, ctx4)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()
def test_lineage_complicated_dag(self, _get_backend):
    # Tests the ability of the auto feature to skip non-state-affecting
    # operators while still retrieving data from multiple outlet sources.
    # Notice how, if outlets are not specified, the auto feature continues to
    # traverse down the DAG until input sources are found.
    # DAG diagram:
    # 1-----------+
    #             |
    #             ▼
    #             4 ----------+
    #             ▲           ▼
    #             |           5+-------->6
    # 2-----------+           ▲
    #                         |
    #                         |
    #                         |
    # 3-----------------------+
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock

    _get_backend.return_value = backend

    dag = DAG(
        dag_id='test_prepare_lineage_auto_complicated_dag',
        start_date=DEFAULT_DATE
    )

    f1 = File("/tmp/does_not_exist_1")
    f2 = File("/tmp/does_not_exist_2")
    f3 = File("/tmp/does_not_exist_3")

    with dag:
        op1 = DummyOperator(task_id='leave1',
                            outlets={"datasets": [f1, ]},
                            inlets={"auto": True})
        op2 = DummyOperator(task_id='leave2',
                            outlets={"datasets": [f2, ]})
        op3 = DummyOperator(task_id='leave3',
                            outlets={"datasets": [f3, ]})
        op4 = DummyOperator(task_id='upstream_level_1')
        op5 = DummyOperator(task_id='upstream_level_2', inlets={"auto": True})
        op6 = DummyOperator(task_id='upstream_level_3', inlets={"auto": True})

        op1.set_downstream(op4)
        op2.set_downstream(op4)
        op3.set_downstream(op5)
        op4.set_downstream(op5)
        op5.set_downstream(op6)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx4 = {"ti": TI(task=op4, execution_date=DEFAULT_DATE)}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}
    ctx6 = {"ti": TI(task=op6, execution_date=DEFAULT_DATE)}

    func = mock.Mock()
    func.__name__ = 'foo'

    # prepare with manual inlets and outlets
    prep = prepare_lineage(func)
    prep(op1, ctx1)

    self.assertEqual(len(op1.outlets), 1)
    self.assertEqual(op1.outlets[0], f1)
    self.assertEqual(len(op1.inlets), 0)

    # post process with no backend
    post = apply_lineage(func)
    post(op1, ctx1)

    prep(op2, ctx2)
    self.assertEqual(len(op2.outlets), 1)
    post(op2, ctx2)

    prep(op3, ctx3)
    self.assertEqual(len(op3.outlets), 1)
    post(op3, ctx3)

    prep(op4, ctx4)
    self.assertEqual(len(op4.inlets), 0)
    post(op4, ctx4)

    prep(op5, ctx5)
    self.assertEqual(len(op5.inlets), 3)
    self.assertEqual({file.qualified_name for file in op5.inlets},
                     {'file:///tmp/does_not_exist_1',
                      'file:///tmp/does_not_exist_2',
                      'file:///tmp/does_not_exist_3'})
    post(op5, ctx5)

    prep(op6, ctx6)
    self.assertEqual(len(op6.inlets), 3)
    self.assertEqual({file.qualified_name for file in op6.inlets},
                     {'file:///tmp/does_not_exist_1',
                      'file:///tmp/does_not_exist_2',
                      'file:///tmp/does_not_exist_3'})
    post(op6, ctx6)
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.lineage import prepare_lineage
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    from datahub.integrations.airflow.hooks import AIRFLOW_1

    # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
    # convert to the newer version. This code path will only be triggered
    # when 2.x receives a 1.10.x inlet/outlet config.
    needs_repeat_preparation = False
    if (
        not AIRFLOW_1
        and isinstance(operator._inlets, list)
        and len(operator._inlets) == 1
        and isinstance(operator._inlets[0], dict)
    ):
        from airflow.lineage import AUTO

        operator._inlets = [
            # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
            *operator._inlets[0].get(
                "datasets", []
            ),  # assumes these are attr-annotated
            *operator._inlets[0].get("task_ids", []),
            *([AUTO] if operator._inlets[0].get("auto", False) else []),
        ]
        needs_repeat_preparation = True
    if (
        not AIRFLOW_1
        and isinstance(operator._outlets, list)
        and len(operator._outlets) == 1
        and isinstance(operator._outlets[0], dict)
    ):
        operator._outlets = [*operator._outlets[0].get("datasets", [])]
        needs_repeat_preparation = True
    if needs_repeat_preparation:
        # Rerun the lineage preparation routine, now that the old format
        # has been translated to the new one.
        prepare_lineage(lambda self, ctx: None)(operator, context)

    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]

    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)

    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )
    # operator.log.info(f"{ownership=}")

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )
    # operator.log.info(f"{tags=}")

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
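# For reference, a sketch of the URN shapes produced by the builder calls
# above. The dag/task ids here are hypothetical examples; the expected strings
# match the assertions in the capture_executions test further below.
#
#   import datahub.emitter.mce_builder as builder
#   flow_urn = builder.make_data_flow_urn("airflow", "test_lineage_is_sent_to_backend")
#   # -> "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
#   job_urn = builder.make_data_job_urn_with_flow(flow_urn, "task2")
#   # -> "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"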
def test_lineage_backend(mock_emit, inlets, outlets):
    DEFAULT_DATE = days_ago(2)

    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps({"graceful_exceptions": False}),
        },
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_pull"
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_push"
    ), patch_airflow_connection(
        datahub_rest_connection_config
    ):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow <= 2.1 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if any(
            airflow.version.version.startswith(prefix)
            for prefix in ["1", "2.0", "2.1"]
        ):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
        else:
            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        mock_emit.assert_called_once()
        assert len(mock_emit.call_args[0][0]) == 4
        assert all(mce.validate() for mce in mock_emit.call_args[0][0])
def test_lineage_backend_capture_executions(mock_emit, inlets, outlets):
    DEFAULT_DATE = datetime.datetime(2020, 5, 17)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter

    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps(
                {"graceful_exceptions": False, "capture_executions": True}
            ),
        },
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_pull"
    ), mock.patch(
        "airflow.models.BaseOperator.xcom_push"
    ), patch_airflow_connection(
        datahub_rest_connection_config
    ):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
            # Ignoring type here because DagRun state is just a string in Airflow 1.
            dag_run = DagRun(
                state="success", run_id=f"scheduled_{DEFAULT_DATE}"
            )  # type: ignore
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE
        else:
            from airflow.utils.state import DagRunState

            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
            dag_run = DagRun(
                state=DagRunState.SUCCESS, run_id=f"scheduled_{DEFAULT_DATE}"
            )
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "dag_run": dag_run,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by
        # Airflow correctly, or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 17

        # Running further checks based on Python version because `args` only
        # exists in Python 3.7+.
        if sys.version_info[:3] > (3, 7):
            assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[1].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[2].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[3].args[0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
            )
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0]
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[7].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[9].args[0].aspectName
                == "dataProcessInstanceProperties"
            )
            assert (
                mock_emitter.method_calls[9].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert (
                mock_emitter.method_calls[10].args[0].aspectName
                == "dataProcessInstanceRelationships"
            )
            assert (
                mock_emitter.method_calls[10].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert (
                mock_emitter.method_calls[11].args[0].aspectName
                == "dataProcessInstanceInput"
            )
            assert (
                mock_emitter.method_calls[11].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert (
                mock_emitter.method_calls[12].args[0].aspectName
                == "dataProcessInstanceOutput"
            )
            assert (
                mock_emitter.method_calls[12].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert mock_emitter.method_calls[13].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[13].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[14].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[14].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert (
                mock_emitter.method_calls[15].args[0].aspectName
                == "dataProcessInstanceRunEvent"
            )
            assert (
                mock_emitter.method_calls[15].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert (
                mock_emitter.method_calls[16].args[0].aspectName
                == "dataProcessInstanceRunEvent"
            )
            assert (
                mock_emitter.method_calls[16].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )