Example #1
0
def test_lineage_backend(mock_emit):
    """Check that running a task through the lineage hooks triggers one emit.

    A single-task DAG is built with one inlet and one outlet dataset, the
    Airflow ``prepare_lineage`` / ``apply_lineage`` wrappers are applied to a
    stub callable, and ``mock_emit`` is then expected to have been called
    exactly once with four items that each pass ``validate()``.
    """
    # Airflow 2.x does not have lineage backend support merged back in yet,
    # so these imports stay local to the test instead of living at module level.
    from airflow.lineage import apply_lineage, prepare_lineage

    from datahub.integrations.airflow.entities import Dataset

    start_date = days_ago(2)

    # Point Airflow's lineage configuration at the DataHub backend and at the
    # connection that is patched in below.
    env_overrides = {
        "AIRFLOW__LINEAGE__BACKEND":
        "datahub.integrations.airflow.DatahubAirflowLineageBackend",
        "AIRFLOW__LINEAGE__DATAHUB_CONN_ID":
        datahub_rest_connection_config.conn_id,
    }

    with mock.patch.dict(os.environ, env_overrides):
        with patch_airflow_connection(datahub_rest_connection_config):
            # Stand-in for the operator's execute method; the lineage
            # wrappers need it to carry a ``__name__``.
            wrapped = mock.Mock()
            wrapped.__name__ = "foo"

            dag = DAG(
                dag_id="test_lineage_is_sent_to_backend",
                start_date=start_date,
            )
            with dag:
                task = DummyOperator(task_id="task1")

            consumed = Dataset("snowflake", "mydb.schema.tableConsumed")
            produced = Dataset("snowflake", "mydb.schema.tableProduced")
            task.inlets.append(consumed)
            task.outlets.append(produced)

            task_instance = TI(task=task, execution_date=start_date)
            context = dict(
                dag=dag,
                task=task,
                ti=task_instance,
                task_instance=task_instance,
                execution_date=start_date,
                ts="2021-04-08T00:54:25.771575+00:00",
            )

            # Run the pre-execute and post-execute lineage hooks in order.
            prepare_lineage(wrapped)(task, context)
            apply_lineage(wrapped)(task, context)

            mock_emit.assert_called_once()
            emitted = mock_emit.call_args[0][0]
            assert len(emitted) == 4
            assert all(mce.validate() for mce in emitted)
Example #2
0
# Arguments applied to every task in the DAG unless a task overrides them.
default_args = dict(
    owner="airflow",
    depends_on_past=False,
    email=["*****@*****.**"],
    email_on_failure=False,
    execution_timeout=timedelta(minutes=5),
)

# Demo DAG: a single Bash task that declares its lineage through the
# dictionary-style ``inlets`` / ``outlets`` arguments.
with DAG(
    dag_id="datahub_lineage_backend_demo",
    default_args=default_args,
    description="An example DAG demonstrating the usage of DataHub's Airflow lineage backend.",
    schedule_interval=timedelta(days=1),
    start_date=days_ago(2),
    catchup=False,
) as dag:
    task1 = BashOperator(
        task_id="run_data_task",
        dag=dag,
        bash_command="echo 'This is where you might run your data tooling.'",
        # Datasets the task reads from.
        inlets=dict(
            datasets=[
                Dataset("snowflake", "mydb.schema.tableA"),
                Dataset("snowflake", "mydb.schema.tableB"),
            ],
        ),
        # Dataset the task writes to.
        outlets=dict(
            datasets=[
                Dataset("snowflake", "mydb.schema.tableC"),
            ],
        ),
    )
Example #3
0
    # in the Airflow UI, where it will be even more clear if something
    # is wrong.
    hook.get_connection_form_widgets()
    hook.get_ui_field_behaviour()


@pytest.mark.parametrize(
    "inlets,outlets",
    [
        (
            # Airflow 1.10.x uses a dictionary structure for inlets and outlets.
            # We want the lineage backend to support this structure for backwards
            # compatibility reasons, so this test is not conditional.
            {
                "datasets":
                [Dataset("snowflake", "mydb.schema.tableConsumed")]
            },
            {
                "datasets":
                [Dataset("snowflake", "mydb.schema.tableProduced")]
            },
        ),
        pytest.param(
            # Airflow 2.x also supports a flattened list for inlets and outlets.
            # We want to test this capability.
            [Dataset("snowflake", "mydb.schema.tableConsumed")],
            [Dataset("snowflake", "mydb.schema.tableProduced")],
            marks=pytest.mark.skipif(
                airflow.version.version.startswith("1"),
                reason="list-style lineage is only supported in Airflow 2.x",
            ),
Example #4
0
def test_hook_airflow_ui(hook):
    """Smoke-test the hook methods whose results show up in the Airflow UI.

    No return values are inspected; the test passes as long as both calls
    complete without raising. Any problem with what they produce will be
    even more obvious in the Airflow UI itself.
    """
    for ui_method in ("get_connection_form_widgets", "get_ui_field_behaviour"):
        getattr(hook, ui_method)()


@pytest.mark.parametrize(
    ["inlets", "outlets"],
    [
        (
            # Airflow 1.10.x uses a dictionary structure for inlets and outlets.
            # We want the lineage backend to support this structure for backwards
            # compatibility reasons, so this test is not conditional.
            {"datasets": [Dataset("snowflake", "mydb.schema.tableConsumed")]},
            {"datasets": [Dataset("snowflake", "mydb.schema.tableProduced")]},
        ),
        pytest.param(
            # Airflow 2.x also supports a flattened list for inlets and outlets.
            # We want to test this capability.
            [Dataset("snowflake", "mydb.schema.tableConsumed")],
            [Dataset("snowflake", "mydb.schema.tableProduced")],
            marks=pytest.mark.skipif(
                airflow.version.version.startswith("1"),
                reason="list-style lineage is only supported in Airflow 2.x",
            ),
        ),
    ],
    ids=[
        "airflow-1-10-x-decl",