def unmap(self) -> "BaseOperator":
        """Get the "normal" Operator after applying the current mapping."""
        if isinstance(self.operator_class, type):
            return self.operator_class(**self._get_unmap_kwargs(),
                                       _airflow_from_mapped=True)

        # After a mapped operator is serialized, there's no real way to actually
        # unmap it since we've lost access to the underlying operator class.
        # This tries its best to simply "forward" all the attributes on this
        # mapped operator to a new SerializedBaseOperator instance.
        from airflow.serialization.serialized_objects import SerializedBaseOperator

        op = SerializedBaseOperator(task_id=self.task_id,
                                    _airflow_from_mapped=True)
        SerializedBaseOperator.populate_operator(op, self.operator_class)
        return op
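For context, a minimal sketch of how a mapped operator comes into existence in the first place; unmap() above is the inverse step, producing a concrete operator once the mapped arguments are known. This assumes Airflow 2.3+ with the stock BashOperator, and the dag_id/task_id values are illustrative only.

from datetime import datetime

from airflow import DAG
from airflow.models.mappedoperator import MappedOperator
from airflow.operators.bash import BashOperator

with DAG(dag_id="mapping_demo", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    # partial() pins the common arguments; expand() supplies the mapped ones.
    echo = BashOperator.partial(task_id="echo").expand(bash_command=["echo 1", "echo 2"])
    # The result is a MappedOperator, not a BashOperator instance.
    assert isinstance(echo, MappedOperator)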
Example #2
    def test_operator_subclass_changing_base_defaults(self):
        assert BaseOperator(task_id='dummy').do_xcom_push is True, \
            "Precondition check! If this fails the test won't make sense"

        class MyOperator(BaseOperator):
            def __init__(self, do_xcom_push=False, **kwargs):
                super().__init__(**kwargs)
                self.do_xcom_push = do_xcom_push

        op = MyOperator(task_id='dummy')
        assert op.do_xcom_push is False

        blob = SerializedBaseOperator.serialize_operator(op)
        serialized_op = SerializedBaseOperator.deserialize_operator(blob)

        assert serialized_op.do_xcom_push is False
    def test_serialized_external_task_marker(self):
        dag = DAG('test_serialized_external_task_marker',
                  start_date=DEFAULT_DATE)
        task = ExternalTaskMarker(task_id="parent_task",
                                  external_dag_id="external_task_marker_child",
                                  external_task_id="child_task1",
                                  dag=dag)

        serialized_op = SerializedBaseOperator.serialize_operator(task)
        deserialized_op = SerializedBaseOperator.deserialize_operator(
            serialized_op)
        self.assertEqual(deserialized_op.task_type, 'ExternalTaskMarker')
        self.assertEqual(getattr(deserialized_op, 'external_dag_id'),
                         'external_task_marker_child')
        self.assertEqual(getattr(deserialized_op, 'external_task_id'),
                         'child_task1')
        def check_task_group(node):
            try:
                children = node.children.values()
            except AttributeError:
                # Round-trip serialization and check the result
                expected_serialized = SerializedBaseOperator.serialize_operator(
                    dag.get_task(node.task_id))
                expected_deserialized = SerializedBaseOperator.deserialize_operator(
                    expected_serialized)
                expected_dict = SerializedBaseOperator.serialize_operator(
                    expected_deserialized)
                assert node
                assert SerializedBaseOperator.serialize_operator(
                    node) == expected_dict
                return

            for child in children:
                check_task_group(child)
Example #5
    def test_serialize_sensor(self, mode, expect_custom_deps):
        from airflow.sensors.base import BaseSensorOperator

        class DummySensor(BaseSensorOperator):
            def poke(self, context):
                return False

        op = DummySensor(task_id='dummy', mode=mode, poke_interval=23)

        blob = SerializedBaseOperator.serialize_operator(op)

        if expect_custom_deps:
            assert "deps" in blob
        else:
            assert "deps" not in blob

        serialized_op = SerializedBaseOperator.deserialize_operator(blob)

        assert op.deps == serialized_op.deps
Example #6
    def unmap(
        self, resolve: Union[None, Mapping[str, Any], Tuple[Context, Session]]
    ) -> "BaseOperator":
        """Get the "normal" Operator after applying the current mapping.

        If ``operator_class`` is not a class (i.e. this DAG has been
        deserialized), this returns a SerializedBaseOperator that aims to
        "look like" the actual unmapping result.

        :param resolve: Only used if ``operator_class`` is a real class. If this
            is a two-tuple ``(context, session)``, that information is used to
            resolve the mapped arguments into init arguments. If this is a
            mapping, no resolving happens; the mapping is taken to already
            contain the init arguments resolved from the mapped kwargs.

        :meta private:
        """
        if isinstance(self.operator_class, type):
            if isinstance(resolve, collections.abc.Mapping):
                kwargs = resolve
            else:
                kwargs = self._expand_mapped_kwargs(resolve)
            kwargs = self._get_unmap_kwargs(
                kwargs, strict=self._disallow_kwargs_override)
            op = self.operator_class(**kwargs, _airflow_from_mapped=True)
            # We need to overwrite task_id here because BaseOperator further
            # mangles the task_id based on the task hierarchy (namely, group_id
            # is prepended, and '__N' appended to deduplicate). This is hacky,
            # but better than duplicating the whole mangling logic.
            op.task_id = self.task_id
            return op

        # After a mapped operator is serialized, there's no real way to actually
        # unmap it since we've lost access to the underlying operator class.
        # This tries its best to simply "forward" all the attributes on this
        # mapped operator to a new SerializedBaseOperator instance.
        from airflow.serialization.serialized_objects import SerializedBaseOperator

        op = SerializedBaseOperator(task_id=self.task_id,
                                    _airflow_from_mapped=True)
        SerializedBaseOperator.populate_operator(op, self.operator_class)
        return op
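A hedged illustration of the two accepted forms of ``resolve`` described in the docstring above; ``mapped_op``, ``context`` and ``session`` are placeholder names, not taken from a real call site.

# 1) A mapping of already-resolved init kwargs (no further resolution happens):
#        mapped_op.unmap({"bash_command": "echo 1"})
# 2) A (context, session) two-tuple, letting unmap() expand the mapped kwargs itself:
#        mapped_op.unmap((context, session))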
Example #7
    def unmap(self) -> "BaseOperator":
        """Get the "normal" Operator after applying the current mapping."""
        if isinstance(self.operator_class, type):
            # We can't simply specify task_id here because BaseOperator further
            # mangles the task_id based on the task hierarchy (namely, group_id
            # is prepended, and '__N' appended to deduplicate). Instead of
            # recreating the whole logic here, we just overwrite task_id later.
            op = self.operator_class(**self._get_unmap_kwargs(), _airflow_from_mapped=True)
            op.task_id = self.task_id
            return op

        # After a mapped operator is serialized, there's no real way to actually
        # unmap it since we've lost access to the underlying operator class.
        # This tries its best to simply "forward" all the attributes on this
        # mapped operator to a new SerializedBaseOperator instance.
        from airflow.serialization.serialized_objects import SerializedBaseOperator

        op = SerializedBaseOperator(task_id=self.task_id, _airflow_from_mapped=True)
        SerializedBaseOperator.populate_operator(op, self.operator_class)
        return op
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag

        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster)

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[
                upstream_subdag_task_id]

            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id)

            # if subdag task is a leaf task, then link it as an upstream task
            if len(upstream_subdag_task._downstream_task_ids) == 0:

                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the current task.
    # (if it is in a subdag at all)
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (dag.is_subdag and dag.parent_dag is not None
            and len(task._upstream_task_ids) == 0):

        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values()
            if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn("airflow",
                                                    dag.parent_dag.dag_id,
                                                    config.cluster)

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]

            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id)

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = ([
        builder.make_data_job_urn_with_flow(flow_urn, task_id)
        for task_id in task.upstream_task_ids
        if dag.task_dict[task_id].subdag is None
    ] + upstream_subdag_task_urns + upstream_subdag_triggers)

    job_doc = ((operator.doc or operator.doc_md or operator.doc_json
                or operator.doc_yaml or operator.doc_rst)
               if not AIRFLOW_1 else None)

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
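For orientation, a hedged sketch of how a lineage backend like this is typically enabled via airflow.cfg; the module path and kwargs below follow the DataHub documentation for the Airflow 2.x plugin, but verify them against the installed datahub package before relying on them.

# airflow.cfg (illustrative):
#
#   [lineage]
#   backend = datahub_provider.lineage.datahub.DatahubLineageBackend
#   datahub_kwargs = {"datahub_conn_id": "datahub_rest_default",
#                     "cluster": "prod",
#                     "capture_ownership_info": true,
#                     "capture_tags_info": true}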
Example #9
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        # This is necessary to avoid issues with circular imports.
        from airflow.lineage import prepare_lineage
        from airflow.serialization.serialized_objects import (
            SerializedBaseOperator,
            SerializedDAG,
        )

        from datahub.integrations.airflow.hooks import AIRFLOW_1

        # Detect Airflow 1.10.x inlet/outlet configurations in Airflow 2.x, and
        # convert to the newer version. This code path will only be triggered
        # when 2.x receives a 1.10.x inlet/outlet config.
        needs_repeat_preparation = False
        if (
            not AIRFLOW_1
            and isinstance(operator._inlets, list)
            and len(operator._inlets) == 1
            and isinstance(operator._inlets[0], dict)
        ):
            from airflow.lineage import AUTO

            operator._inlets = [
                # See https://airflow.apache.org/docs/apache-airflow/1.10.15/lineage.html.
                *operator._inlets[0].get(
                    "datasets", []
                ),  # assumes these are attr-annotated
                *operator._inlets[0].get("task_ids", []),
                *([AUTO] if operator._inlets[0].get("auto", False) else []),
            ]
            needs_repeat_preparation = True
        if (
            not AIRFLOW_1
            and isinstance(operator._outlets, list)
            and len(operator._outlets) == 1
            and isinstance(operator._outlets[0], dict)
        ):
            operator._outlets = [*operator._outlets[0].get("datasets", [])]
            needs_repeat_preparation = True
        if needs_repeat_preparation:
            # Rerun the lineage preparation routine, now that the old format has been translated to the new one.
            prepare_lineage(lambda self, ctx: None)(operator, context)
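        # For reference (shape inferred from the keys read above): the 1.10.x-style
        # config handled by these branches is a single dict roughly like
        # {"datasets": [...], "task_ids": [...], "auto": False}, which is flattened
        # into the plain inlet/outlet lists Airflow 2.x expects before lineage
        # preparation is re-run.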

        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]

        # TODO: capture context
        # context dag_run
        # task_instance: "TaskInstance" = context["task_instance"]
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        base_url = conf.get("webserver", "base_url")
        flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
        job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
        # operator.log.info(f"{flow_url=}")
        # operator.log.info(f"{job_url=}")
        # operator.log.info(f"{dag.get_serialized_fields()=}")
        # operator.log.info(f"{task.get_serialized_fields()=}")
        # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

        flow_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedDAG.serialize_dag(dag).items()
        }
        for key in dag.get_serialized_fields():
            if key not in flow_property_bag:
                flow_property_bag[key] = repr(getattr(dag, key))
        job_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
        }
        for key in task.get_serialized_fields():
            if key not in job_property_bag:
                job_property_bag[key] = repr(getattr(task, key))
        # operator.log.info(f"{flow_property_bag=}")
        # operator.log.info(f"{job_property_bag=}")

        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")

        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=f"airflow_{tag}")
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md or ''}",
                        customProperties=flow_property_bag,
                        externalUrl=flow_url,
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,
                        customProperties=job_property_bag,
                        externalUrl=job_url,
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        force_upstream_materialization = [
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=inlet,
                    aspects=[
                        models.StatusClass(removed=False),
                    ],
                )
            )
            for inlet in _entities_to_urn_list(inlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
            *force_upstream_materialization,
        ]
        operator.log.info(
            "DataHub lineage backend - emitting metadata:\n"
            + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
        )
        hook.emit_mces(mces)
Example #10
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        timestamp = int(
            dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
Example #11
    def generate_datajob(
        cluster: str,
        task: "BaseOperator",
        dag: "DAG",
        set_dependendecies: bool = True,
        capture_owner: bool = True,
        capture_tags: bool = True,
    ) -> DataJob:
        """

        :param cluster: str
        :param task: TaskIntance
        :param dag: DAG
        :param set_dependendecies: bool - whether to extract dependencies from airflow task
        :param capture_owner: bool - whether to extract owner from airflow task
        :param capture_tags: bool - whether to set tags automatically from airflow task
        :return: DataJob - returns the generated DataJob object
        """
        from airflow.serialization.serialized_objects import SerializedBaseOperator

        dataflow_urn = DataFlowUrn.create_from_ids(orchestrator="airflow",
                                                   env=cluster,
                                                   flow_id=dag.dag_id)
        datajob = DataJob(id=task.task_id, flow_urn=dataflow_urn)
        datajob.description = ((task.doc or task.doc_md or task.doc_json
                                or task.doc_yaml or task.doc_rst)
                               if not AIRFLOW_1 else None)

        job_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value
                 ) in SerializedBaseOperator.serialize_operator(task).items()
        }
        for key in task.get_serialized_fields():
            if key not in job_property_bag:
                job_property_bag[key] = repr(getattr(task, key))

        allowed_task_keys = [
            "_downstream_task_ids",
            "_inlets",
            "_outlets",
            "_task_type",
            "_task_module",
            "depends_on_past",
            "email",
            "label",
            "execution_timeout",
            "sla",
            "sql",
            "task_id",
            "trigger_rule",
            "wait_for_downstream",
        ]
        job_property_bag = {
            k: v
            for (k, v) in job_property_bag.items() if k in allowed_task_keys
        }

        datajob.properties = job_property_bag
        base_url = conf.get("webserver", "base_url")
        datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}"

        if capture_owner and dag.owner:
            datajob.owners.add(dag.owner)

        if capture_tags and dag.tags:
            datajob.tags.update(dag.tags)

        if set_dependendecies:
            datajob.upstream_urns.extend(
                AirflowGenerator._get_dependencies(task=task,
                                                   dag=dag,
                                                   flow_urn=datajob.flow_urn))

        return datajob
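A hedged usage sketch; AirflowGenerator is assumed to be the enclosing class (inferred from the self-reference to AirflowGenerator._get_dependencies above), and task, dag and the "prod" cluster name are placeholders supplied by the caller.

datajob = AirflowGenerator.generate_datajob(cluster="prod", task=task, dag=dag)
# datajob.properties holds the filtered job_property_bag built above;
# datajob.upstream_urns is populated because set_dependendecies defaults to True.
print(datajob.properties)
print(datajob.upstream_urns)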
Example #12
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        # This is necessary to avoid issues with circular imports.
        from airflow.serialization.serialized_objects import (
            SerializedBaseOperator,
            SerializedDAG,
        )

        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]

        # TODO: capture context
        # context dag_run
        # task_instance: "TaskInstance" = context["task_instance"]
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        base_url = conf.get("webserver", "base_url")
        flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
        job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
        # operator.log.info(f"{flow_url=}")
        # operator.log.info(f"{job_url=}")
        # operator.log.info(f"{dag.get_serialized_fields()=}")
        # operator.log.info(f"{task.get_serialized_fields()=}")
        # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

        flow_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedDAG.serialize_dag(dag).items()
        }
        for key in dag.get_serialized_fields():
            if key not in flow_property_bag:
                flow_property_bag[key] = repr(getattr(dag, key))
        job_property_bag: Dict[str, str] = {
            key: repr(value)
            for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
        }
        for key in task.get_serialized_fields():
            if key not in job_property_bag:
                job_property_bag[key] = repr(getattr(task, key))
        # operator.log.info(f"{flow_property_bag=}")
        # operator.log.info(f"{job_property_bag=}")

        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")

        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=f"airflow_{tag}")
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md or ''}",
                        customProperties=flow_property_bag,
                        externalUrl=flow_url,
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,
                        customProperties=job_property_bag,
                        externalUrl=job_url,
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                    tags,
                ],
            )
        )

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        force_upstream_materialization = [
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=inlet,
                    aspects=[
                        models.StatusClass(removed=False),
                    ],
                )
            )
            for inlet in _entities_to_urn_list(inlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
            *force_upstream_materialization,
        ]
        operator.log.info(
            "DataHub lineage backend - emitting metadata:\n"
            + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
        )
        hook.emit_mces(mces)