def serialize_subprocess(queue, dag_folder):
    """Serialize DAGs to JSON in a subprocess."""
    dags = collect_dags(dag_folder)
    for dag in dags.values():
        queue.put(SerializedDAG.to_json(dag))
    queue.put(None)
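
# For context, a minimal sketch of how a helper like the one above might be driven
# from the parent process. The multiprocessing wiring and the drain loop are
# assumptions for illustration, not part of the original snippet.
import json
import multiprocessing


def drain_serialized_dags(dag_folder):
    queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=serialize_subprocess, args=(queue, dag_folder))
    proc.start()

    serialized = []
    while True:
        item = queue.get()
        if item is None:  # sentinel pushed by serialize_subprocess when it is done
            break
        serialized.append(json.loads(item))  # each item is a JSON string from SerializedDAG.to_json
    proc.join()
    return serialized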
def test_roundtrip_relativedelta(self, val, expected):
    serialized = SerializedDAG._serialize(val)
    self.assertDictEqual(serialized, expected)
    round_tripped = SerializedDAG._deserialize(serialized)
    self.assertEqual(val, round_tripped)
def test_extra_serialized_field_and_multiple_operator_links(self):
    """
    Assert that the extra field exists & that OperatorLinks are defined both in
    Plugins and as inbuilt Operator Links.

    This test also depends on GoogleLink() being registered as a plugin
    in tests/plugins/test_plugin.py.

    The function tests that if extra operator links are registered in a plugin
    via ``operator_extra_links`` and the same is also defined on the Operator in
    ``BaseOperator.operator_extra_links``, it has the correct extra link.
    """
    test_date = datetime(2019, 8, 1)
    dag = DAG(dag_id='simple_dag', start_date=test_date)
    CustomOperator(task_id='simple_task', dag=dag, bash_command=["echo", "true"])

    serialized_dag = SerializedDAG.to_dict(dag)
    self.assertIn("bash_command", serialized_dag["dag"]["tasks"][0])

    dag = SerializedDAG.from_dict(serialized_dag)
    simple_task = dag.task_dict["simple_task"]
    self.assertEqual(getattr(simple_task, "bash_command"), ["echo", "true"])

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################
    # Check that the serialized version of the operator link only contains the inbuilt Op Link
    self.assertEqual(
        serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
        [
            {'tests.test_utils.mock_operators.CustomBaseIndexOpLink': {'index': 0}},
            {'tests.test_utils.mock_operators.CustomBaseIndexOpLink': {'index': 1}},
        ])

    # Test that all the extra_links are set
    self.assertCountEqual(simple_task.extra_links, [
        'BigQuery Console #1', 'BigQuery Console #2', 'airflow', 'github', 'google'])

    ti = TaskInstance(task=simple_task, execution_date=test_date)
    ti.xcom_push('search_query', ["dummy_value_1", "dummy_value_2"])

    # Test deserialized inbuilt link #1
    custom_inbuilt_link = simple_task.get_extra_links(test_date, "BigQuery Console #1")
    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=dummy_value_1', custom_inbuilt_link)

    # Test deserialized inbuilt link #2
    custom_inbuilt_link = simple_task.get_extra_links(test_date, "BigQuery Console #2")
    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=dummy_value_2', custom_inbuilt_link)

    # Test deserialized link registered via Airflow Plugin
    google_link_from_plugin = simple_task.get_extra_links(test_date, GoogleLink.name)
    self.assertEqual("https://www.google.com", google_link_from_plugin)
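
# The test above relies on GoogleLink() being registered through a plugin. For
# reference, a plugin-registered operator link is a BaseOperatorLink subclass
# exposed via an AirflowPlugin. The sketch below is consistent with the
# assertions in the test (name "google", URL https://www.google.com), but the
# plugin class name is illustrative rather than the actual test_plugin.py contents.
from airflow.models.baseoperator import BaseOperatorLink
from airflow.plugins_manager import AirflowPlugin


class GoogleLink(BaseOperatorLink):
    """Operator extra link that always points at google.com."""

    name = "google"

    def get_link(self, operator, dttm):
        return "https://www.google.com"


class ExampleLinkPlugin(AirflowPlugin):
    name = "example_link_plugin"
    operator_extra_links = [GoogleLink()]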
def test_deserialization_with_dag_context(self):
    with DAG(dag_id='simple_dag', start_date=datetime(2019, 8, 1, tzinfo=timezone.utc)) as dag:
        BaseOperator(task_id='simple_task')
        # should not raise RuntimeError: dictionary changed size during iteration
        SerializedDAG.to_dict(dag)
def test_roundtrip_relativedelta(self, val, expected):
    serialized = SerializedDAG._serialize(val)
    assert serialized == expected
    round_tripped = SerializedDAG._deserialize(serialized)
    assert val == round_tripped
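
# Standalone sketch of the relativedelta roundtrip exercised by the two tests
# above, outside any test class. The serialized form is an assumption based on
# Airflow's "__type"/"__var" encoding; verify it against the installed version.
from dateutil import relativedelta

from airflow.serialization.serialized_objects import SerializedDAG

val = relativedelta.relativedelta(days=-1)
serialized = SerializedDAG._serialize(val)
assert serialized == {"__type": "relativedelta", "__var": {"days": -1}}  # assumed encoding
assert SerializedDAG._deserialize(serialized) == val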
def __init__(self, dag):
    self.dag_id = dag.dag_id
    self.fileloc = dag.full_filepath
    self.fileloc_hash = self.dag_fileloc_hash(self.fileloc)
    self.data = SerializedDAG.to_dict(dag)
    self.last_updated = timezone.utcnow()
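
# Usage sketch for the constructor above, assuming it belongs to something like
# Airflow's SerializedDagModel: build the record from a live DAG object and the
# JSON-able payload lands in .data. The DAG here is illustrative.
from datetime import datetime

from airflow import DAG

dag = DAG(dag_id="example", start_date=datetime(2019, 8, 1))
record = SerializedDagModel(dag)  # assumes the class name; constructor shown above
assert record.dag_id == "example"
assert "dag" in record.data  # serialized payload is a plain dict with a "dag" key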
def test_serialized_objects_are_sorted(self, object_to_serialized, expected_output):
    """Test Serialized Lists, Sets and Tuples are sorted"""
    serialized_obj = SerializedDAG._serialize(object_to_serialized)
    if isinstance(serialized_obj, dict) and "__type" in serialized_obj:
        serialized_obj = serialized_obj["__var"]
    assert serialized_obj == expected_output
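
# Example of the sorting behaviour exercised above: a set is serialized with a
# "__type"/"__var" wrapper and the "__var" payload is sorted, which keeps the
# serialization deterministic. The exact shape is assumed from Airflow's
# encoding and should be checked locally.
from airflow.serialization.serialized_objects import SerializedDAG

serialized = SerializedDAG._serialize({"b", "a", "c"})
# Expected to look roughly like {"__type": "set", "__var": ["a", "b", "c"]}
assert serialized["__var"] == ["a", "b", "c"]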
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Dict = None,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.lineage import prepare_lineage
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )
    from datahub.integrations.airflow.hooks import AIRFLOW_1

    # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
    # convert to the newer version. This code path will only be triggered
    # when 2.x receives a 1.10.x inlet/outlet config.
    needs_repeat_preparation = False
    if (
        not AIRFLOW_1
        and isinstance(operator._inlets, list)
        and len(operator._inlets) == 1
        and isinstance(operator._inlets[0], dict)
    ):
        from airflow.lineage import AUTO

        operator._inlets = [
            # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
            *operator._inlets[0].get("datasets", []),  # assumes these are attr-annotated
            *operator._inlets[0].get("task_ids", []),
            *([AUTO] if operator._inlets[0].get("auto", False) else []),
        ]
        needs_repeat_preparation = True
    if (
        not AIRFLOW_1
        and isinstance(operator._outlets, list)
        and len(operator._outlets) == 1
        and isinstance(operator._outlets[0], dict)
    ):
        operator._outlets = [*operator._outlets[0].get("datasets", [])]
        needs_repeat_preparation = True
    if needs_repeat_preparation:
        # Rerun the lineage preparation routine, now that the old format
        # has been translated to the new one.
        prepare_lineage(lambda self, ctx: None)(operator, context)

    context = context or {}  # ensure not None to satisfy mypy
    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)

    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )
    # operator.log.info(f"{ownership=}")

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )
    # operator.log.info(f"{tags=}")

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    allowed_task_keys = [
        "_downstream_task_ids", "_inlets", "_outlets", "_task_type",
        "_task_module", "depends_on_past", "email", "label",
        "execution_timeout", "end_date", "start_date", "sla", "sql",
        "task_id", "trigger_rule", "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }

    allowed_flow_keys = [
        "_access_control", "_concurrency", "_default_view", "catchup",
        "fileloc", "is_paused_upon_creation", "start_date", "tags", "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def _process_message(self, message):
    self.log.debug("Received message of type %s", type(message).__name__)
    if isinstance(message, DagParsingStat):
        self._sync_metadata(message)
    else:
        self._collected_dag_buffer.append(SerializedDAG.from_dict(message))
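
# The non-stat messages buffered above are presumably plain dicts produced on
# the sending side by SerializedDAG.to_dict(). A minimal sketch of that round
# trip (the DAG constructed here is illustrative):
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.serialization.serialized_objects import SerializedDAG

with DAG(dag_id="roundtrip_example", start_date=datetime(2019, 8, 1)) as dag:
    DummyOperator(task_id="only_task")

message = SerializedDAG.to_dict(dag)         # what a producer would send
restored = SerializedDAG.from_dict(message)  # what _process_message buffers
assert restored.dag_id == "roundtrip_example"
assert "only_task" in restored.task_dict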
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # Resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # If the upstream task is not a subdag, then skip it.
        if upstream_task.subdag is None:
            continue

        # Else, link the leaf tasks of the upstream subdag as upstream tasks.
        upstream_subdag = upstream_task.subdag
        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster
        )

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[upstream_subdag_task_id]
            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id
            )

            # If the subdag task is a leaf task, then link it as an upstream task.
            if len(upstream_subdag_task._downstream_task_ids) == 0:
                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # Resolve URNs for upstream nodes that trigger the subdag containing the
    # current task (if it is in a subdag at all).
    upstream_subdag_triggers: List[str] = []

    # Subdags are always named with 'parent.child' style or Airflow won't run them.
    # Add a connection from the subdag trigger(s) if the subdag task has no upstreams.
    if (
        dag.is_subdag
        and dag.parent_dag is not None
        and len(task._upstream_task_ids) == 0
    ):
        # Filter through the parent dag's tasks and find the subdag trigger(s).
        subdags = [
            x for x in dag.parent_dag.task_dict.values() if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn(
            "airflow", dag.parent_dag.dag_id, config.cluster
        )

        # Iterate through the parent dag's tasks and find the ones that trigger the subdag.
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]
            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id
            )

            # If the task triggers the subdag, link it to this node in the subdag.
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    allowed_task_keys = [
        "_downstream_task_ids", "_inlets", "_outlets", "_task_type",
        "_task_module", "depends_on_past", "email", "label",
        "execution_timeout", "end_date", "start_date", "sla", "sql",
        "task_id", "trigger_rule", "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }

    allowed_flow_keys = [
        "_access_control", "_concurrency", "_default_view", "catchup",
        "fileloc", "is_paused_upon_creation", "start_date", "tags", "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    # Exclude subdag operator tasks since these are not emitted, resulting in empty metadata.
    upstream_tasks = (
        [
            builder.make_data_job_urn_with_flow(flow_urn, task_id)
            for task_id in task.upstream_task_ids
            if dag.task_dict[task_id].subdag is None
        ]
        + upstream_subdag_task_urns
        + upstream_subdag_triggers
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Dict = None,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    context = context or {}  # ensure not None to satisfy mypy
    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)

    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )
    # operator.log.info(f"{ownership=}")

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )
    # operator.log.info(f"{tags=}")

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
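
# How a send_lineage() entry point like the ones above is typically wired in:
# Airflow calls a configured lineage backend for each task at runtime. A minimal
# sketch under that assumption; the backend class and module names here are
# illustrative, while airflow.lineage.backend.LineageBackend and the [lineage]
# backend config option are standard Airflow.
from airflow.lineage.backend import LineageBackend


class DatahubLineageBackend(LineageBackend):
    def send_lineage(self, operator, inlets=None, outlets=None, context=None):
        # Delegate to the module-level helper defined above.
        send_lineage(operator, inlets=inlets, outlets=outlets, context=context)


# airflow.cfg (or the AIRFLOW__LINEAGE__BACKEND environment variable):
#
#   [lineage]
#   backend = my_plugin.lineage.DatahubLineageBackend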