def make_sagemaker_job_urn(job_type: str, job_name: str, arn: str, env: str) -> str:
    """Build a DataJob URN for a SageMaker job, keyed by its ARN.

    SageMaker exposes no global grouping property for jobs, so each job is
    given its own dedicated flow to act as the DataJob's parent.
    """
    parent_flow_urn = make_sagemaker_flow_urn(job_type, job_name, env)
    return mce_builder.make_data_job_urn_with_flow(flow_urn=parent_flow_urn, job_id=arn)
def construct_job_workunits(
        self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:
    """Yield one DataJob snapshot workunit per lineage entry of *connector*.

    Each lineage (source dataset -> target dataset) becomes a DataJob under
    the connector's DataFlow, with the source dataset as input and the
    target dataset as output.
    """
    connector_name = connector.name
    flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                          self.config.env)

    # No custom job properties are captured by this source (yet).
    job_property_bag: Optional[Dict[str, str]] = None

    for lineage in connector.lineages or []:
        src_dataset = lineage.source_dataset
        tgt_dataset = lineage.target_dataset
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, src_dataset)

        inlets = [
            builder.make_dataset_urn(lineage.source_platform, src_dataset)
        ]
        outlets = [
            builder.make_dataset_urn(lineage.target_platform, tgt_dataset)
        ]

        job_info = models.DataJobInfoClass(
            name=f"{connector_name}:{src_dataset}",
            type="COMMAND",
            description=None,
            customProperties=job_property_bag,
            # externalUrl=job_url,
        )
        job_io = models.DataJobInputOutputClass(
            inputDatasets=inlets or [],
            outputDatasets=outlets or [],
        )
        mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    job_info,
                    job_io,
                    # ownership,
                    # tags,
                ],
            ))

        wu = MetadataWorkUnit(id=src_dataset, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    """Emit DataHub MCEs (flow, job, dataset lineage) for an Airflow task.

    Builds a DataFlow for the DAG, a DataJob for the task (with its
    inlets/outlets as input/output datasets), plus one lineage MCE per
    outlet, and emits them all through the configured DataHub emitter hook.

    :param operator: the operator being executed (used for logging).
    :param inlets: upstream lineage entities; URNs resolved via
        ``_entities_to_urn_list``.
    :param outlets: downstream lineage entities.
    :param context: Airflow task context; must contain "dag", "task", "ts".
    """
    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]
    # task_instance: "TaskInstance" = context["task_instance"]

    # TODO: verify if task and operator are the same?
    # TODO: use dag serialization to just save the whole thing.
    # TODO: save context.get("conf")
    # TODO: save DAG tags
    # TODO: save context.get("dag_run")
    # TODO: save all the data from task_instance
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    # Execution timestamp ("ts" in the context) in epoch milliseconds.
    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)

    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                # FIX: OwnerClass.owner must be a corpuser URN, not the bare
                # username string (matches the other lineage senders).
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")),
    )

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    # FIX: guard against doc_md being None, which would
                    # otherwise render a literal "None" in the description.
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                ),
                ownership,
            ],
        ))

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,  # TODO: add datajob description
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
            ],
        ))

    # One fine-grained lineage MCE per outlet, fed by all inlets.
    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    """Emit DataFlow/DataJob metadata for an Airflow task to DataHub.

    In addition to the task's own inlets/outlets, this resolves task-level
    lineage across subdag boundaries: leaf tasks of upstream subdags, and
    the parent-DAG tasks that trigger the subdag containing this task.
    Aspects for ownership and tags are gated by the provided *config*.
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag
        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster)

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[
                upstream_subdag_task_id]

            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id)

            # if subdag task is a leaf task, then link it as an upstream task
            # (reads Airflow's private _downstream_task_ids attribute)
            if len(upstream_subdag_task._downstream_task_ids) == 0:
                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the current task.
    # (if it is in a subdag at all)
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (dag.is_subdag and dag.parent_dag is not None
            and len(task._upstream_task_ids) == 0):

        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values() if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn("airflow",
                                                    dag.parent_dag.dag_id,
                                                    config.cluster)

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]
            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id)

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    # Property bags: start from Airflow's own serialized form, then backfill
    # any serialized fields that the serializer omitted.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    # Whitelist of task properties worth surfacing in DataHub.
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    # Whitelist of DAG properties worth surfacing in DataHub.
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            # NOTE(review): lastModified uses time=0 here, while other
            # senders in this file derive it from context["ts"] — confirm
            # this is intentional.
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = ([
        builder.make_data_job_urn_with_flow(flow_urn, task_id)
        for task_id in task.upstream_task_ids
        if dag.task_dict[task_id].subdag is None
    ] + upstream_subdag_task_urns + upstream_subdag_triggers)

    # Operator docs only exist on Airflow 2.x; first non-empty variant wins.
    job_doc = ((operator.doc or operator.doc_md or operator.doc_json
                or operator.doc_yaml or operator.doc_rst)
               if not AIRFLOW_1 else None)

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # Ensure every referenced dataset entity exists (removed=False) so
    # lineage edges don't point at missing entities.
    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Dict[str, Any]:
    """Resolve one node of a Glue job's dataflow graph to a DataHub URN.

    DataSource/DataSink nodes become dataset URNs (Glue table or S3 path);
    every other node is a transformation and becomes a DataJob URN under
    *flow_urn*.  S3-backed datasets additionally get a snapshot MCE appended
    to *new_dataset_mces* and their id to *new_dataset_ids*, since they are
    not covered by regular Glue table ingestion.

    Returns the node dict augmented with its "urn" and empty lineage slots,
    which are populated later while traversing edges.
    """
    node_type = node["NodeType"]

    if node_type not in ["DataSource", "DataSink"]:
        # Any non-dataset node represents a transformation step.
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
        )
    else:
        # Dataset nodes carry their configuration as JSON-encoded args.
        node_args = {x["Name"]: json.loads(x["Value"]) for x in node["Args"]}

        if "database" in node_args and "table_name" in node_args:
            # Glue-table-backed node: the table itself is already emitted
            # during Glue table ingestion, so only the URN is needed here.
            full_table_name = (
                f"{node_args['database']}.{node_args['table_name']}"
            )
            node_urn = (
                f"urn:li:dataset:(urn:li:dataPlatform:glue,"
                f"{full_table_name},{self.env})"
            )
        elif node_args.get("connection_type") == "s3":
            s3_uri = node_args["connection_options"]["path"]

            # Disambiguate with the format suffix only when multiple
            # formats were observed for the same S3 path.
            if len(s3_formats[s3_uri]) > 1:
                node_urn = make_s3_urn(
                    s3_uri,
                    self.env,
                    suffix=node_args.get("format"),
                )
            else:
                node_urn = make_s3_urn(s3_uri, self.env)

            snapshot = DatasetSnapshot(urn=node_urn, aspects=[])
            snapshot.aspects.append(Status(removed=False))
            snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in node_args.items()},
                    tags=[],
                )
            )

            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=snapshot)
            )
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")
        else:
            raise ValueError(f"Unrecognized Glue data object type: {node_args}")

    return {
        **node,
        "urn": node_urn,
        # to be filled in after traversing edges
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
def construct_job_workunits(
        self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:
    """Yield dataJobInfo and dataJobInputOutput MCP workunits for each
    lineage entry of *connector*.

    Unlike the snapshot-based variant, this emits two
    MetadataChangeProposalWrapper workunits per lineage and resolves
    dataset URNs with optional per-platform instances from
    ``config.platform_instance_map``.
    """
    connector_name = connector.name
    flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                          self.config.env)

    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            # Optional platform instance, looked up per platform.
            source_platform_instance = (
                self.config.platform_instance_map.get(source_platform)
                if self.config.platform_instance_map else None)
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform
            target_platform_instance = (
                self.config.platform_instance_map.get(target_platform)
                if self.config.platform_instance_map else None)
            job_property_bag = lineage.job_property_bag

            # Fall back to a synthetic id when the source dataset is unknown,
            # so the job URN is still unique per target.
            job_id = (source_dataset
                      if source_dataset else f"unknown_source.{target_dataset}")
            job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)

            # No inlet when the source dataset could not be determined.
            inlets = ([
                builder.make_dataset_urn_with_platform_instance(
                    source_platform,
                    source_dataset,
                    platform_instance=source_platform_instance,
                    env=self.config.env,
                )
            ] if source_dataset else [])
            outlets = [
                builder.make_dataset_urn_with_platform_instance(
                    target_platform,
                    target_dataset,
                    platform_instance=target_platform_instance,
                    env=self.config.env,
                )
            ]

            mcp = MetadataChangeProposalWrapper(
                entityType="dataJob",
                entityUrn=job_urn,
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataJobInfo",
                aspect=models.DataJobInfoClass(
                    name=f"{connector_name}:{job_id}",
                    type="COMMAND",
                    description=None,
                    customProperties=job_property_bag
                    # externalUrl=job_url,
                ),
            )

            wu = MetadataWorkUnit(
                id=
                f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            yield wu

            mcp = MetadataChangeProposalWrapper(
                entityType="dataJob",
                entityUrn=job_urn,
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataJobInputOutput",
                aspect=models.DataJobInputOutputClass(
                    inputDatasets=inlets,
                    outputDatasets=outlets,
                ),
            )

            wu = MetadataWorkUnit(
                id=
                f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            yield wu
def construct_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    """Emit workunits for the whole NiFi flow: the root DataFlow, a DataJob
    per component, datasets for component inlets/outlets, and datasets for
    remotely accessible ports.

    Remote input/output ports are translated into external datasets keyed by
    the remote site's configured name; intra-flow connections become
    inputJobs edges between component DataJobs.
    """
    rootpg = self.nifi_flow.root_process_group
    flow_name = rootpg.name  # self.config.site_name
    flow_urn = builder.make_data_flow_urn(NIFI, rootpg.id, self.config.env)
    flow_properties = dict()
    if self.nifi_flow.clustered is not None:
        flow_properties["clustered"] = str(self.nifi_flow.clustered)
    if self.nifi_flow.version is not None:
        flow_properties["version"] = str(self.nifi_flow.version)
    yield from self.construct_flow_workunits(
        flow_urn, flow_name, self.make_external_url(rootpg.id), flow_properties
    )

    for component in self.nifi_flow.components.values():
        job_name = component.name
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, component.id)

        # connections are (source_id, destination_id) pairs
        incoming = list(
            filter(lambda x: x[1] == component.id, self.nifi_flow.connections)
        )
        outgoing = list(
            filter(lambda x: x[0] == component.id, self.nifi_flow.connections)
        )
        inputJobs = []
        jobProperties = None

        if component.nifi_type is NifiType.PROCESSOR:
            # Only surface a curated subset of scheduling-related config.
            jobProperties = {
                k: str(v)
                for k, v in component.config.items()  # type: ignore
                if k
                in [
                    "schedulingPeriod",
                    "schedulingStrategy",
                    "executionNode",
                    "concurrentlySchedulableTaskCount",
                ]
            }
            jobProperties["properties"] = json.dumps(
                component.config.get("properties")  # type: ignore
            )
            if component.last_event_time is not None:
                jobProperties["last_event_time"] = component.last_event_time

            for dataset in component.inlets.values():
                yield from self.construct_dataset_workunits(
                    dataset.platform,
                    dataset.dataset_name,
                    dataset.dataset_urn,
                    datasetProperties=dataset.dataset_properties,
                )

            for dataset in component.outlets.values():
                yield from self.construct_dataset_workunits(
                    dataset.platform,
                    dataset.dataset_name,
                    dataset.dataset_urn,
                    datasetProperties=dataset.dataset_properties,
                )

        for edge in incoming:
            incoming_from = edge[0]
            if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
                # Upstream is a local remotely-accessible port: model it as
                # an external dataset inlet rather than a job edge.
                dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                dataset_urn = builder.make_dataset_urn(
                    NIFI, dataset_name, self.config.env
                )
                component.inlets[dataset_urn] = ExternalDataset(
                    NIFI,
                    dataset_name,
                    dict(nifi_uri=self.config.site_url),
                    dataset_urn,
                )
            else:
                inputJobs.append(
                    builder.make_data_job_urn_with_flow(flow_urn, incoming_from)
                )

        for edge in outgoing:
            outgoing_to = edge[1]
            if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
                dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                dataset_urn = builder.make_dataset_urn(
                    NIFI, dataset_name, self.config.env
                )
                component.outlets[dataset_urn] = ExternalDataset(
                    NIFI,
                    dataset_name,
                    dict(nifi_uri=self.config.site_url),
                    dataset_urn,
                )

        if component.nifi_type is NifiType.REMOTE_INPUT_PORT:
            # TODO - if target_uris is not set, but http proxy is used in RPG
            site_urls = component.target_uris.split(",")  # type: ignore
            for site_url in site_urls:
                if site_url not in self.config.site_url_to_site_name:
                    self.report_warning(
                        site_url,
                        f"Site with url {site_url} is being used in flow but\
                        corresponding site name is not configured via site_url_to_site_name.\
                        This may result in broken lineage.",
                    )
                else:
                    site_name = self.config.site_url_to_site_name[site_url]
                    dataset_name = f"{site_name}.{component.name}"
                    dataset_urn = builder.make_dataset_urn(
                        NIFI, dataset_name, self.config.env
                    )
                    component.outlets[dataset_urn] = ExternalDataset(
                        NIFI, dataset_name, dict(nifi_uri=site_url), dataset_urn
                    )
                    break

        if component.nifi_type is NifiType.REMOTE_OUTPUT_PORT:
            site_urls = component.target_uris.split(",")  # type: ignore
            for site_url in site_urls:
                if site_url not in self.config.site_url_to_site_name:
                    # NOTE(review): this passes self.config.site_url as the
                    # warning key, while the REMOTE_INPUT_PORT branch above
                    # passes the loop variable site_url — confirm whether
                    # this asymmetry is intentional.
                    self.report_warning(
                        self.config.site_url,
                        f"Site with url {site_url} is being used in flow but\
                        corresponding site name is not configured via site_url_to_site_name.\
                        This may result in broken lineage.",
                    )
                else:
                    site_name = self.config.site_url_to_site_name[site_url]
                    dataset_name = f"{site_name}.{component.name}"
                    dataset_urn = builder.make_dataset_urn(
                        NIFI,
                        dataset_name,
                        self.config.env,
                    )
                    component.inlets[dataset_urn] = ExternalDataset(
                        NIFI, dataset_name, dict(nifi_uri=site_url), dataset_urn
                    )
                    break

        yield from self.construct_job_workunits(
            job_urn,
            job_name,
            external_url=self.make_external_url(
                component.parent_group_id, component.id, component.parent_rpg_id
            ),
            job_type=NIFI.upper() + "_" + component.nifi_type.value,
            description=component.comments,
            job_properties=jobProperties,
            inlets=list(component.inlets.keys()),
            outlets=list(component.outlets.keys()),
            inputJobs=inputJobs,
            status=component.status,
        )

    for port in self.nifi_flow.remotely_accessible_ports.values():
        dataset_name = f"{self.config.site_name}.{port.name}"
        dataset_platform = NIFI
        yield from self.construct_dataset_workunits(
            dataset_platform,
            dataset_name,
            external_url=self.make_external_url(port.parent_group_id, port.id),
        )
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    """Emit DataHub MCEs (flow, job, lineage, dataset materialization) for an
    Airflow task, handling legacy Airflow 1.10.x inlet/outlet configs.
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.lineage import prepare_lineage
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    from datahub.integrations.airflow.hooks import AIRFLOW_1

    # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
    # convert to the newer version. This code path will only be triggered
    # when 2.x receives a 1.10.x inlet/outlet config.
    needs_repeat_preparation = False
    if (
        not AIRFLOW_1
        and isinstance(operator._inlets, list)
        and len(operator._inlets) == 1
        and isinstance(operator._inlets[0], dict)
    ):
        from airflow.lineage import AUTO

        operator._inlets = [
            # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
            *operator._inlets[0].get(
                "datasets", []
            ),  # assumes these are attr-annotated
            *operator._inlets[0].get("task_ids", []),
            *([AUTO] if operator._inlets[0].get("auto", False) else []),
        ]
        needs_repeat_preparation = True
    if (
        not AIRFLOW_1
        and isinstance(operator._outlets, list)
        and len(operator._outlets) == 1
        and isinstance(operator._outlets[0], dict)
    ):
        operator._outlets = [*operator._outlets[0].get("datasets", [])]
        needs_repeat_preparation = True
    if needs_repeat_preparation:
        # Rerun the lineage preparation routine, now that the old format has been translated to the new one.
        prepare_lineage(lambda self, ctx: None)(operator, context)

    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    # Property bags: Airflow's serialized form, backfilled with any
    # serialized fields the serializer omitted.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    # Execution timestamp ("ts") in epoch milliseconds.
    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)

    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )
    # operator.log.info(f"{ownership=}")

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )
    # operator.log.info(f"{tags=}")

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    # One lineage MCE per outlet, fed by all inlets.
    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    # Ensure upstream datasets exist (removed=False) so lineage edges
    # don't point at missing entities.
    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Optional[Dict[str, Any]]:
    """Resolve one node of a Glue job's dataflow graph to a DataHub URN.

    DataSource/DataSink nodes map to dataset URNs (Glue table or S3 path);
    all other nodes are transformations and map to DataJob URNs under
    *flow_urn*.  S3-backed datasets also get a snapshot MCE appended to
    *new_dataset_mces* and their id to *new_dataset_ids*.

    Returns the node augmented with its "urn" plus empty lineage fields
    (filled in later while traversing edges), or None when the node is
    skipped (missing script path, or an unsupported connector while
    ``ignore_unsupported_connectors`` is set).
    """
    node_type = node["NodeType"]

    # for nodes representing datasets, we construct a dataset URN accordingly
    if node_type in ["DataSource", "DataSink"]:
        node_args = {
            x["Name"]: yaml.safe_load(x["Value"]) for x in node["Args"]
        }

        # if data object is Glue table
        if "database" in node_args and "table_name" in node_args:
            full_table_name = f"{node_args['database']}.{node_args['table_name']}"

            # we know that the table will already be covered when ingesting Glue tables
            node_urn = make_dataset_urn_with_platform_instance(
                platform=self.platform,
                name=full_table_name,
                env=self.env,
                platform_instance=self.source_config.platform_instance,
            )

        # if data object is S3 bucket
        elif node_args.get("connection_type") == "s3":
            s3_uri = self.get_s3_uri(node_args)

            if s3_uri is None:
                # FIX: the key is "NodeType" (capital T); the previous
                # "Nodetype" lookup raised KeyError instead of warning.
                self.report.report_warning(
                    f"{node['NodeType']}-{node['Id']}",
                    f"Could not find script path for job {node['NodeType']}-{node['Id']} in flow {flow_urn}. Skipping",
                )
                return None

            # append S3 format if different ones exist
            if len(s3_formats[s3_uri]) > 1:
                node_urn = make_s3_urn(
                    f"{s3_uri}.{node_args.get('format')}",
                    self.env,
                )
            else:
                node_urn = make_s3_urn(s3_uri, self.env)

            dataset_snapshot = DatasetSnapshot(
                urn=node_urn,
                aspects=[],
            )

            dataset_snapshot.aspects.append(Status(removed=False))
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={
                        k: str(v)
                        for k, v in node_args.items()
                    },
                    tags=[],
                ))

            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=dataset_snapshot))
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")
        else:
            if self.source_config.ignore_unsupported_connectors:
                # FIX: logger.info(msg, *args) %-interpolates args into msg;
                # the old call passed flow_urn as the message with the real
                # message as an arg, which triggered a logging format error.
                logger.info(
                    f"Unrecognized Glue data object type in flow {flow_urn}: {node_args}. Skipping."
                )
                return None
            else:
                raise ValueError(
                    f"Unrecognized Glue data object type: {node_args}")

    # otherwise, a node represents a transformation
    else:
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}')

    return {
        **node,
        "urn": node_urn,
        # to be filled in after traversing edges
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    """Emit DataFlow/DataJob metadata for an Airflow task to DataHub.

    Property bags are derived from Airflow's serialized DAG/operator forms
    and filtered to a whitelist; ownership and tags aspects are gated by
    *config*.  Referenced datasets are force-materialized so lineage edges
    don't dangle.
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    # Property bags: Airflow's serialized form, backfilled with any
    # serialized fields the serializer omitted.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    # Whitelist of task properties worth surfacing in DataHub.
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    # Whitelist of DAG properties worth surfacing in DataHub.
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        # Execution timestamp ("ts") in epoch milliseconds.
        timestamp = int(
            dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # Ensure every referenced dataset entity exists (removed=False) so
    # lineage edges don't point at missing entities.
    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    """Emit DataHub MCEs (flow, job, lineage, dataset materialization) for an
    Airflow task via the configured emitter hook.
    """
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    # Property bags: Airflow's serialized form, backfilled with any
    # serialized fields the serializer omitted.
    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    # Execution timestamp ("ts") in epoch milliseconds.
    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)

    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(dag.owner),
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )
    # operator.log.info(f"{ownership=}")

    tags = models.GlobalTagsClass(
        tags=[
            models.TagAssociationClass(tag=f"airflow_{tag}")
            for tag in (dag.tags or [])
        ]
    )
    # operator.log.info(f"{tags=}")

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                ownership,
                tags,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
                tags,
            ],
        )
    )

    # One lineage MCE per outlet, fed by all inlets.
    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    # Ensure upstream datasets exist (removed=False) so lineage edges
    # don't point at missing entities.
    force_upstream_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=inlet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for inlet in _entities_to_urn_list(inlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
        *force_upstream_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)