Example 1
def make_sagemaker_job_urn(job_type: str, job_name: str, arn: str, env: str) -> str:

    flow_urn = make_sagemaker_flow_urn(job_type, job_name, env)

    # SageMaker has no global grouping property for jobs,
    # so we create a flow for every single job
    return mce_builder.make_data_job_urn_with_flow(flow_urn=flow_urn, job_id=arn)
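As a reference for the identifiers involved, here is a minimal sketch (not taken from the source; the flow id and ARN values are made up, and make_sagemaker_flow_urn is assumed to wrap make_data_flow_urn):

import datahub.emitter.mce_builder as mce_builder

# A per-job flow URN, e.g. derived from the job type and name (illustrative values).
flow_urn = mce_builder.make_data_flow_urn("sagemaker", "training:my-job", "PROD")
# -> "urn:li:dataFlow:(sagemaker,training:my-job,PROD)"

# The job URN nests the flow URN and uses the job ARN as the job id.
job_urn = mce_builder.make_data_job_urn_with_flow(
    flow_urn=flow_urn,
    job_id="arn:aws:sagemaker:us-west-2:123456789012:training-job/my-job",
)
# -> "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,training:my-job,PROD),arn:aws:sagemaker:...)"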
Example 2
    def construct_job_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        connector_name = connector.name
        flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                              self.config.env)

        job_property_bag: Optional[Dict[str, str]] = None

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                job_urn = builder.make_data_job_urn_with_flow(
                    flow_urn, source_dataset)

                inlets = [
                    builder.make_dataset_urn(source_platform, source_dataset)
                ]
                outlets = [
                    builder.make_dataset_urn(target_platform, target_dataset)
                ]

                mce = models.MetadataChangeEventClass(
                    proposedSnapshot=models.DataJobSnapshotClass(
                        urn=job_urn,
                        aspects=[
                            models.DataJobInfoClass(
                                name=f"{connector_name}:{source_dataset}",
                                type="COMMAND",
                                description=None,
                                customProperties=job_property_bag,
                                # externalUrl=job_url,
                            ),
                            models.DataJobInputOutputClass(
                                inputDatasets=inlets or [],
                                outputDatasets=outlets or [],
                            ),
                            # ownership,
                            # tags,
                        ],
                    ))

                wu = MetadataWorkUnit(id=source_dataset, mce=mce)
                self.report.report_workunit(wu)
                yield wu
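The ConnectorManifest and lineage objects are not shown above; the following is a minimal sketch of only the fields this snippet reads. The dataclasses are illustrative stand-ins, not the real Kafka Connect source classes:

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class KafkaConnectLineage:
    # Fields inferred from the attribute accesses in construct_job_workunits.
    source_platform: str
    target_platform: str
    target_dataset: str
    source_dataset: Optional[str] = None


@dataclass
class ConnectorManifest:
    name: str
    lineages: List[KafkaConnectLineage] = field(default_factory=list)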
Example 3
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]
        # task_instance: "TaskInstance" = context["task_instance"]

        # TODO: verify if task and operator are the same?
        # TODO: use dag serialization to just save the whole thing.
        # TODO: save context.get("conf")
        # TODO: save DAG tags
        # TODO: save context.get("dag_run")
        # TODO: save all the data from task_instance
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        timestamp = int(
            dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")),
        )

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md}",
                    ),
                    ownership,
                ],
            ))

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,  # TODO: add datajob description
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                ],
            ))

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []),
                                     outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
        ]
        operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                          "\n".join(json.dumps(mce.to_obj()) for mce in mces))
        hook.emit_mces(mces)
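_entities_to_urn_list is referenced but not shown in the snippet; a plausible minimal implementation, assuming each inlet/outlet entity exposes its DataHub URN through a urn attribute:

from typing import List


def _entities_to_urn_list(iolets: List) -> List[str]:
    # Map Airflow lineage entities (e.g. the plugin's Dataset wrapper) to URN strings.
    return [let.urn for let in iolets]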
Example 4
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag

        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster)

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[
                upstream_subdag_task_id]

            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id)

            # if subdag task is a leaf task, then link it as an upstream task
            if len(upstream_subdag_task._downstream_task_ids) == 0:

                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the current task.
    # (if it is in a subdag at all)
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (dag.is_subdag and dag.parent_dag is not None
            and len(task._upstream_task_ids) == 0):

        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values()
            if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn("airflow",
                                                    dag.parent_dag.dag_id,
                                                    config.cluster)

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]

            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id)

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = ([
        builder.make_data_job_urn_with_flow(flow_urn, task_id)
        for task_id in task.upstream_task_ids
        if dag.task_dict[task_id].subdag is None
    ] + upstream_subdag_task_urns + upstream_subdag_triggers)

    job_doc = ((operator.doc or operator.doc_md or operator.doc_json
                or operator.doc_yaml or operator.doc_rst)
               if not AIRFLOW_1 else None)

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
Example 5
    def process_dataflow_node(
        self,
        node: Dict[str, Any],
        flow_urn: str,
        new_dataset_ids: List[str],
        new_dataset_mces: List[MetadataChangeEvent],
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
    ) -> Dict[str, Any]:

        node_type = node["NodeType"]

        # for nodes representing datasets, we construct a dataset URN accordingly
        if node_type in ["DataSource", "DataSink"]:

            node_args = {x["Name"]: json.loads(x["Value"]) for x in node["Args"]}

            # if data object is Glue table
            if "database" in node_args and "table_name" in node_args:

                full_table_name = f"{node_args['database']}.{node_args['table_name']}"

                # we know that the table will already be covered when ingesting Glue tables
                node_urn = f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"

            # if data object is S3 bucket
            elif node_args.get("connection_type") == "s3":

                s3_uri = node_args["connection_options"]["path"]

                # append S3 format if different ones exist
                if len(s3_formats[s3_uri]) > 1:
                    node_urn = make_s3_urn(
                        s3_uri,
                        self.env,
                        suffix=node_args.get("format"),
                    )

                else:
                    node_urn = make_s3_urn(s3_uri, self.env)

                dataset_snapshot = DatasetSnapshot(
                    urn=node_urn,
                    aspects=[],
                )

                dataset_snapshot.aspects.append(Status(removed=False))
                dataset_snapshot.aspects.append(
                    DatasetPropertiesClass(
                        customProperties={k: str(v) for k, v in node_args.items()},
                        tags=[],
                    )
                )

                new_dataset_mces.append(
                    MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                )
                new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

            else:

                raise ValueError(f"Unrecognized Glue data object type: {node_args}")

        # otherwise, a node represents a transformation
        else:
            node_urn = mce_builder.make_data_job_urn_with_flow(
                flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
            )

        return {
            **node,
            "urn": node_urn,
            # to be filled in after traversing edges
            "inputDatajobs": [],
            "inputDatasets": [],
            "outputDatasets": [],
        }
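The s3_formats argument maps each S3 path to the set of formats seen across the flow, which is what makes the len(...) > 1 disambiguation above possible. A hypothetical sketch of how it could be populated in a first pass over the raw Glue node dicts (collect_s3_formats is an invented helper name, not part of the source):

import json
from collections import defaultdict
from typing import Any, DefaultDict, Dict, List, Optional, Set


def collect_s3_formats(
    dag_nodes: List[Dict[str, Any]]
) -> DefaultDict[str, Set[Optional[str]]]:
    # One pass over the node dicts, recording every format seen per S3 path.
    s3_formats: DefaultDict[str, Set[Optional[str]]] = defaultdict(set)
    for raw_node in dag_nodes:
        args = {x["Name"]: json.loads(x["Value"]) for x in raw_node["Args"]}
        if args.get("connection_type") == "s3":
            s3_formats[args["connection_options"]["path"]].add(args.get("format"))
    return s3_formats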
Example 6
    def construct_job_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        connector_name = connector.name
        flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                              self.config.env)

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                source_platform_instance = (
                    self.config.platform_instance_map.get(source_platform)
                    if self.config.platform_instance_map else None)
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform
                target_platform_instance = (
                    self.config.platform_instance_map.get(target_platform)
                    if self.config.platform_instance_map else None)
                job_property_bag = lineage.job_property_bag

                job_id = (source_dataset if source_dataset else
                          f"unknown_source.{target_dataset}")
                job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)

                inlets = ([
                    builder.make_dataset_urn_with_platform_instance(
                        source_platform,
                        source_dataset,
                        platform_instance=source_platform_instance,
                        env=self.config.env,
                    )
                ] if source_dataset else [])
                outlets = [
                    builder.make_dataset_urn_with_platform_instance(
                        target_platform,
                        target_dataset,
                        platform_instance=target_platform_instance,
                        env=self.config.env,
                    )
                ]

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataJob",
                    entityUrn=job_urn,
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataJobInfo",
                    aspect=models.DataJobInfoClass(
                        name=f"{connector_name}:{job_id}",
                        type="COMMAND",
                        description=None,
                        customProperties=job_property_bag
                        # externalUrl=job_url,
                    ),
                )

                wu = MetadataWorkUnit(
                    id=
                    f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                    mcp=mcp,
                )
                self.report.report_workunit(wu)
                yield wu

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataJob",
                    entityUrn=job_urn,
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataJobInputOutput",
                    aspect=models.DataJobInputOutputClass(
                        inputDatasets=inlets,
                        outputDatasets=outlets,
                    ),
                )

                wu = MetadataWorkUnit(
                    id=
                    f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                    mcp=mcp,
                )
                self.report.report_workunit(wu)
                yield wu
Example 7
    def construct_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901

        rootpg = self.nifi_flow.root_process_group
        flow_name = rootpg.name  # self.config.site_name
        flow_urn = builder.make_data_flow_urn(NIFI, rootpg.id, self.config.env)
        flow_properties = dict()
        if self.nifi_flow.clustered is not None:
            flow_properties["clustered"] = str(self.nifi_flow.clustered)
        if self.nifi_flow.version is not None:
            flow_properties["version"] = str(self.nifi_flow.version)
        yield from self.construct_flow_workunits(
            flow_urn, flow_name, self.make_external_url(rootpg.id), flow_properties
        )

        for component in self.nifi_flow.components.values():
            job_name = component.name
            job_urn = builder.make_data_job_urn_with_flow(flow_urn, component.id)

            incoming = list(
                filter(lambda x: x[1] == component.id, self.nifi_flow.connections)
            )
            outgoing = list(
                filter(lambda x: x[0] == component.id, self.nifi_flow.connections)
            )
            inputJobs = []
            jobProperties = None

            if component.nifi_type is NifiType.PROCESSOR:
                jobProperties = {
                    k: str(v)
                    for k, v in component.config.items()  # type: ignore
                    if k
                    in [
                        "schedulingPeriod",
                        "schedulingStrategy",
                        "executionNode",
                        "concurrentlySchedulableTaskCount",
                    ]
                }
                jobProperties["properties"] = json.dumps(
                    component.config.get("properties")  # type: ignore
                )
                if component.last_event_time is not None:
                    jobProperties["last_event_time"] = component.last_event_time

                for dataset in component.inlets.values():
                    yield from self.construct_dataset_workunits(
                        dataset.platform,
                        dataset.dataset_name,
                        dataset.dataset_urn,
                        datasetProperties=dataset.dataset_properties,
                    )

                for dataset in component.outlets.values():
                    yield from self.construct_dataset_workunits(
                        dataset.platform,
                        dataset.dataset_name,
                        dataset.dataset_urn,
                        datasetProperties=dataset.dataset_properties,
                    )

            for edge in incoming:
                incoming_from = edge[0]
                if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
                    dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                    dataset_urn = builder.make_dataset_urn(
                        NIFI, dataset_name, self.config.env
                    )
                    component.inlets[dataset_urn] = ExternalDataset(
                        NIFI,
                        dataset_name,
                        dict(nifi_uri=self.config.site_url),
                        dataset_urn,
                    )
                else:
                    inputJobs.append(
                        builder.make_data_job_urn_with_flow(flow_urn, incoming_from)
                    )

            for edge in outgoing:
                outgoing_to = edge[1]
                if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
                    dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                    dataset_urn = builder.make_dataset_urn(
                        NIFI, dataset_name, self.config.env
                    )
                    component.outlets[dataset_urn] = ExternalDataset(
                        NIFI,
                        dataset_name,
                        dict(nifi_uri=self.config.site_url),
                        dataset_urn,
                    )

            if component.nifi_type is NifiType.REMOTE_INPUT_PORT:
                # TODO - if target_uris is not set, but http proxy is used in RPG
                site_urls = component.target_uris.split(",")  # type: ignore
                for site_url in site_urls:
                    if site_url not in self.config.site_url_to_site_name:
                        self.report_warning(
                            site_url,
                            f"Site with url {site_url} is being used in flow but\
                            corresponding site name is not configured via site_url_to_site_name.\
                            This may result in broken lineage.",
                        )
                    else:
                        site_name = self.config.site_url_to_site_name[site_url]
                        dataset_name = f"{site_name}.{component.name}"
                        dataset_urn = builder.make_dataset_urn(
                            NIFI, dataset_name, self.config.env
                        )
                        component.outlets[dataset_urn] = ExternalDataset(
                            NIFI, dataset_name, dict(nifi_uri=site_url), dataset_urn
                        )
                        break

            if component.nifi_type is NifiType.REMOTE_OUTPUT_PORT:
                site_urls = component.target_uris.split(",")  # type: ignore
                for site_url in site_urls:
                    if site_url not in self.config.site_url_to_site_name:
                        self.report_warning(
                            self.config.site_url,
                            f"Site with url {site_url} is being used in flow but\
                            corresponding site name is not configured via site_url_to_site_name.\
                            This may result in broken lineage.",
                        )
                    else:
                        site_name = self.config.site_url_to_site_name[site_url]

                        dataset_name = f"{site_name}.{component.name}"
                        dataset_urn = builder.make_dataset_urn(
                            NIFI, dataset_name, self.config.env
                        )
                        component.inlets[dataset_urn] = ExternalDataset(
                            NIFI, dataset_name, dict(nifi_uri=site_url), dataset_urn
                        )
                        break

            yield from self.construct_job_workunits(
                job_urn,
                job_name,
                external_url=self.make_external_url(
                    component.parent_group_id, component.id, component.parent_rpg_id
                ),
                job_type=NIFI.upper() + "_" + component.nifi_type.value,
                description=component.comments,
                job_properties=jobProperties,
                inlets=list(component.inlets.keys()),
                outlets=list(component.outlets.keys()),
                inputJobs=inputJobs,
                status=component.status,
            )

        for port in self.nifi_flow.remotely_accessible_ports.values():
            dataset_name = f"{self.config.site_name}.{port.name}"
            dataset_platform = NIFI
            yield from self.construct_dataset_workunits(
                dataset_platform,
                dataset_name,
                external_url=self.make_external_url(port.parent_group_id, port.id),
            )
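ExternalDataset acts as a small container for the datasets attached to component inlets and outlets; a minimal sketch with field names inferred from the positional arguments and the construct_dataset_workunits calls above (illustrative, not the actual NiFi source class):

from dataclasses import dataclass
from typing import Dict


@dataclass
class ExternalDataset:
    platform: str
    dataset_name: str
    dataset_properties: Dict[str, str]
    dataset_urn: str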
Example 8
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        # This is necessary to avoid issues with circular imports.
        from airflow.lineage import prepare_lineage
        from airflow.serialization.serialized_objects import (
            SerializedBaseOperator,
            SerializedDAG,
        )

        from datahub.integrations.airflow.hooks import AIRFLOW_1

        # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
        # convert to the newer version. This code path will only be triggered
        # when 2.x receives a 1.10.x inlet/outlet config.
        needs_repeat_preparation = False
        if (
            not AIRFLOW_1
            and isinstance(operator._inlets, list)
            and len(operator._inlets) == 1
            and isinstance(operator._inlets[0], dict)
        ):
            from airflow.lineage import AUTO

            operator._inlets = [
                # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
                *operator._inlets[0].get(
                    "datasets", []
                ),  # assumes these are attr-annotated
                *operator._inlets[0].get("task_ids", []),
                *([AUTO] if operator._inlets[0].get("auto", False) else []),
            ]
            needs_repeat_preparation = True
        if (
            not AIRFLOW_1
            and isinstance(operator._outlets, list)
            and len(operator._outlets) == 1
            and isinstance(operator._outlets[0], dict)
        ):
            operator._outlets = [*operator._outlets[0].get("datasets", [])]
            needs_repeat_preparation = True
        if needs_repeat_preparation:
            # Rerun the lineage preparation routine, now that the old format has been translated to the new one.
            prepare_lineage(lambda self, ctx: None)(operator, context)

        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]

        # TODO: capture context
        # context dag_run
        # task_instance: "TaskInstance" = context["task_instance"]
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        base_url = conf.get("webserver", "base_url")
        flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
        job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
        # operator.log.info(f"{flow_url=}")
        # operator.log.info(f"{job_url=}")
        # operator.log.info(f"{dag.get_serialized_fields()=}")
        # operator.log.info(f"{task.get_serialized_fields()=}")
        # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

        flow_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedDAG.serialize_dag(dag).items()
        }
        for key in dag.get_serialized_fields():
            if key not in flow_property_bag:
                flow_property_bag[key] = repr(getattr(dag, key))
        job_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
        }
        for key in task.get_serialized_fields():
            if key not in job_property_bag:
                job_property_bag[key] = repr(getattr(task, key))
        # operator.log.info(f"{flow_property_bag=}")
        # operator.log.info(f"{job_property_bag=}")

        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")

        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=f"airflow_{tag}")
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md or ''}",
                        customProperties=flow_property_bag,
                        externalUrl=flow_url,
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,
                        customProperties=job_property_bag,
                        externalUrl=job_url,
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        force_upstream_materialization = [
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=inlet,
                    aspects=[
                        models.StatusClass(removed=False),
                    ],
                )
            )
            for inlet in _entities_to_urn_list(inlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
            *force_upstream_materialization,
        ]
        operator.log.info(
            "DataHub lineage backend - emitting metadata:\n"
            + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
        )
        hook.emit_mces(mces)
Example 9
    def process_dataflow_node(
        self,
        node: Dict[str, Any],
        flow_urn: str,
        new_dataset_ids: List[str],
        new_dataset_mces: List[MetadataChangeEvent],
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
    ) -> Optional[Dict[str, Any]]:

        node_type = node["NodeType"]

        # for nodes representing datasets, we construct a dataset URN accordingly
        if node_type in ["DataSource", "DataSink"]:

            node_args = {
                x["Name"]: yaml.safe_load(x["Value"])
                for x in node["Args"]
            }

            # if data object is Glue table
            if "database" in node_args and "table_name" in node_args:

                full_table_name = f"{node_args['database']}.{node_args['table_name']}"

                # we know that the table will already be covered when ingesting Glue tables
                node_urn = make_dataset_urn_with_platform_instance(
                    platform=self.platform,
                    name=full_table_name,
                    env=self.env,
                    platform_instance=self.source_config.platform_instance,
                )

            # if data object is S3 bucket
            elif node_args.get("connection_type") == "s3":

                s3_uri = self.get_s3_uri(node_args)

                if s3_uri is None:
                    self.report.report_warning(
                        f"{node['Nodetype']}-{node['Id']}",
                        f"Could not find script path for job {node['Nodetype']}-{node['Id']} in flow {flow_urn}. Skipping",
                    )
                    return None

                # append S3 format if different ones exist
                if len(s3_formats[s3_uri]) > 1:
                    node_urn = make_s3_urn(
                        f"{s3_uri}.{node_args.get('format')}",
                        self.env,
                    )

                else:
                    node_urn = make_s3_urn(s3_uri, self.env)

                dataset_snapshot = DatasetSnapshot(
                    urn=node_urn,
                    aspects=[],
                )

                dataset_snapshot.aspects.append(Status(removed=False))
                dataset_snapshot.aspects.append(
                    DatasetPropertiesClass(
                        customProperties={
                            k: str(v)
                            for k, v in node_args.items()
                        },
                        tags=[],
                    ))

                new_dataset_mces.append(
                    MetadataChangeEvent(proposedSnapshot=dataset_snapshot))
                new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

            else:

                if self.source_config.ignore_unsupported_connectors:

                    logger.info(
                        f"Flow {flow_urn}: unrecognized Glue data object type: {node_args}. Skipping."
                    )
                    return None
                else:

                    raise ValueError(
                        f"Unrecognized Glue data object type: {node_args}")

        # otherwise, a node represents a transformation
        else:
            node_urn = mce_builder.make_data_job_urn_with_flow(
                flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}')

        return {
            **node,
            "urn": node_urn,
            # to be filled in after traversing edges
            "inputDatajobs": [],
            "inputDatasets": [],
            "outputDatasets": [],
        }
Example 10
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        timestamp = int(
            dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
Example 11
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        # This is necessary to avoid issues with circular imports.
        from airflow.serialization.serialized_objects import (
            SerializedBaseOperator,
            SerializedDAG,
        )

        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]

        # TODO: capture context
        # context dag_run
        # task_instance: "TaskInstance" = context["task_instance"]
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        base_url = conf.get("webserver", "base_url")
        flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
        job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
        # operator.log.info(f"{flow_url=}")
        # operator.log.info(f"{job_url=}")
        # operator.log.info(f"{dag.get_serialized_fields()=}")
        # operator.log.info(f"{task.get_serialized_fields()=}")
        # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

        flow_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedDAG.serialize_dag(dag).items()
        }
        for key in dag.get_serialized_fields():
            if key not in flow_property_bag:
                flow_property_bag[key] = repr(getattr(dag, key))
        job_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
        }
        for key in task.get_serialized_fields():
            if key not in job_property_bag:
                job_property_bag[key] = repr(getattr(task, key))
        # operator.log.info(f"{flow_property_bag=}")
        # operator.log.info(f"{job_property_bag=}")

        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")

        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=f"airflow_{tag}")
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md or ''}",
                        customProperties=flow_property_bag,
                        externalUrl=flow_url,
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,
                        customProperties=job_property_bag,
                        externalUrl=job_url,
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        force_upstream_materialization = [
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=inlet,
                    aspects=[
                        models.StatusClass(removed=False),
                    ],
                )
            )
            for inlet in _entities_to_urn_list(inlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
            *force_upstream_materialization,
        ]
        operator.log.info(
            "DataHub lineage backend - emitting metadata:\n"
            + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
        )
        hook.emit_mces(mces)