Example #1
def test_spark_dag(mock_subproc_popen):
    # Hack to get around having a Connection
    os.environ["AIRFLOW_CONN_SPARK"] = "something"

    dag = DAG(
        dag_id="spark_dag",
        default_args=default_args,
        schedule_interval=None,
    )
    # pylint: disable=unused-variable
    clean_data = SparkSubmitOperator(
        task_id="run_spark",
        application="some_path.py",
        conn_id="SPARK",
        dag=dag,
    )

    pipeline = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={
            AIRFLOW_EXECUTION_DATE_STR:
            get_current_datetime_in_utc().isoformat()
        },
    )
    execute_pipeline(pipeline)

    assert mock_subproc_popen.call_args_list[0][0] == ([
        "spark-submit", "--master", "", "--name", "airflow-spark",
        "some_path.py"
    ], )
Example #2
def test_long_name():
    dag_name = "dag-with.dot-dash-lo00ong" * 10
    dag = DAG(
        dag_id=dag_name,
        default_args=default_args,
        schedule_interval=None,
    )
    long_name = "task-with.dot-dash2-loong" * 10  # 250 characters, Airflow's max allowed length
    dummy_operator = DummyOperator(
        task_id=long_name,
        dag=dag,
    )

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={
            AIRFLOW_EXECUTION_DATE_STR:
            get_current_datetime_in_utc().isoformat()
        },
    )
    result = execute_pipeline(pipeline_def)

    assert result.success
    assert (
        result.pipeline_def.name ==
        "airflow_dag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ong"
    )

    assert len(result.pipeline_def.solids) == 1
    assert (
        result.pipeline_def.solids[0].name ==
        "airflow_task_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loong"
    )
Example #3
def test_multi_leaf_dag(snapshot):
    dag = DAG(
        dag_id="multi_leaf_dag",
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator_1 = DummyOperator(
        task_id="dummy_operator_1",
        dag=dag,
    )
    dummy_operator_2 = DummyOperator(
        task_id="dummy_operator_2",
        dag=dag,
    )
    dummy_operator_3 = DummyOperator(
        task_id="dummy_operator_3",
        dag=dag,
    )
    dummy_operator_4 = DummyOperator(
        task_id="dummy_operator_4",
        dag=dag,
    )
    dummy_operator_1 >> dummy_operator_2
    dummy_operator_1 >> dummy_operator_3
    dummy_operator_1 >> dummy_operator_4

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(
                    dag=dag)).dep_structure_snapshot))
Example #4
def test_diamond_task_dag(snapshot):
    dag = DAG(
        dag_id='diamond_task_dag',
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator_1 = DummyOperator(
        task_id='dummy_operator_1',
        dag=dag,
    )
    dummy_operator_2 = DummyOperator(
        task_id='dummy_operator_2',
        dag=dag,
    )
    dummy_operator_3 = DummyOperator(
        task_id='dummy_operator_3',
        dag=dag,
    )
    dummy_operator_4 = DummyOperator(
        task_id='dummy_operator_4',
        dag=dag,
    )
    dummy_operator_1 >> dummy_operator_2
    dummy_operator_1 >> dummy_operator_3
    dummy_operator_2 >> dummy_operator_4
    dummy_operator_3 >> dummy_operator_4

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(
                    dag)).dep_structure_snapshot))
Example #5
def test_one_task_dag():
    dag = DAG(dag_id='dag', default_args=default_args, schedule_interval=None,)
    dummy_operator = DummyOperator(task_id='dummy_operator', dag=dag,)

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=dag, tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
Example #6
def make_dagster_job_from_airflow_dag(dag,
                                      tags=None,
                                      use_airflow_template_context=False,
                                      unique_id=None):
    """Construct a Dagster job corresponding to a given Airflow DAG.

    Tasks in the resulting job will execute the ``execute()`` method on the corresponding
    Airflow Operator. Dagster, any dependencies required by your Airflow Operators, and the
    module containing your DAG definition must be available in the Python environment within
    which your Dagster ops execute.

    To set Airflow's ``execution_date`` for use with Airflow Operators' ``execute()``
    methods, do one of the following:

    1. (Best for ad hoc runs) Execute the job directly. This will set ``execution_date`` to
        the time (in UTC) of the run.

    2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override
        behavior from (1).

        .. code-block:: python

            my_dagster_job = make_dagster_job_from_airflow_dag(
                    dag=dag,
                    tags={'airflow_execution_date': utc_execution_date_str}
            )
            my_dagster_job.execute_in_process()

    3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,
        such as in the Dagit UI. This will override behavior from (1) and (2).

    We apply ``normalized_name()`` to the DAG id and task ids when generating the job name
    and op names, to ensure that the names conform to Dagster's naming conventions.
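
    For example, a sketch of the resulting names (illustrative only; ``default_args`` is
    assumed to be defined as in the test snippets elsewhere in this listing):

    .. code-block:: python

        # Dots and dashes in the DAG id are replaced with underscores,
        # and an "airflow_" prefix is added (cf. test_normalize_name below).
        dag = DAG(dag_id="dag-with.dot-dash", default_args=default_args, schedule_interval=None)
        job = make_dagster_job_from_airflow_dag(dag=dag)
        assert job.name == "airflow_dag_with_dot_dash"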

    Args:
        dag (DAG): The Airflow DAG to compile into a Dagster job
        tags (Dict[str, str]): Job tags. Optionally include
            ``tags={'airflow_execution_date': utc_date_string}`` to specify the
            ``execution_date`` used within the execution of Airflow Operators.
        use_airflow_template_context (bool): If True, will call get_template_context() on the
            Airflow TaskInstance model which requires and modifies the DagRun table.
            (default: False)
        unique_id (int): If not None, this id will be appended to generated op names. Used by
            framework authors to enforce unique op names within a repo.

    Returns:
        JobDefinition: The generated Dagster job

    """
    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag, tags, use_airflow_template_context, unique_id)
    # pass in tags manually because pipeline_def.graph doesn't have it threaded
    return pipeline_def.graph.to_job(tags={**pipeline_def.tags})
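
A minimal usage sketch for option (1) in the docstring above: build a job from a one-task DAG and execute it directly, so that execution_date defaults to the current UTC time. This is illustrative only; default_args is assumed to be defined as in the surrounding test snippets.

dag = DAG(dag_id="adhoc_dag", default_args=default_args, schedule_interval=None)
dummy_operator = DummyOperator(task_id="dummy_operator", dag=dag)

adhoc_job = make_dagster_job_from_airflow_dag(dag=dag)
result = adhoc_job.execute_in_process()  # execution_date defaults to now (UTC)
assert result.success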
Example #7
def test_pipeline_tags():
    dag = get_dag()

    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager

    # When mode is default and tags are set, run with tags
    result = execute_pipeline(
        pipeline=make_dagster_pipeline_from_airflow_dag(
            dag, {AIRFLOW_EXECUTION_DATE_STR: EXECUTION_DATE_MINUS_WEEK_FMT}),
        instance=instance,
    )
    check_compute_logs(manager, result, EXECUTION_DATE_MINUS_WEEK_FMT)
Example #8
def test_normalize_name():
    dag = DAG(dag_id="dag-with.dot-dash", default_args=default_args, schedule_interval=None,)
    dummy_operator = DummyOperator(task_id="task-with.dot-dash", dag=dag,)

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=dag, tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    result = execute_pipeline(pipeline_def)

    assert result.success
    assert result.pipeline_def.name == "airflow_dag_with_dot_dash"
    assert len(result.pipeline_def.solids) == 1
    assert result.pipeline_def.solids[0].name == "airflow_task_with_dot_dash"
Example #9
def test_one_task_dag(snapshot):
    dag = DAG(
        dag_id='one_task_dag',
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator = DummyOperator(
        task_id='dummy_operator',
        dag=dag,
    )

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(
                    dag)).dep_structure_snapshot))
Example #10
def test_pipeline_auto_tag():
    dag = get_dag()

    with instance_for_test() as instance:
        manager = instance.compute_log_manager

        pre_execute_time = get_current_datetime_in_utc()

        # When tags are not set, run with current time
        result = execute_pipeline(
            pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),
            instance=instance,
        )

        post_execute_time = get_current_datetime_in_utc()

        compute_io_path = manager.get_local_path(result.run_id,
                                                 "airflow_templated.compute",
                                                 ComputeIOType.STDOUT)
        assert os.path.exists(compute_io_path)
        with open(compute_io_path, "r") as stdout_file:
            file_contents = normalize_file_content(stdout_file.read())

        search_str = "INFO - Running command: \n    echo '"
        date_start = file_contents.find(search_str) + len(search_str)
        date_end = date_start + 10  # number of characters in YYYY-MM-DD
        date = file_contents[date_start:date_end]

        check_compute_logs(manager, result, date)

        pre_execute_time_fmt = pre_execute_time.strftime("%Y-%m-%d")
        post_execute_time_fmt = post_execute_time.strftime("%Y-%m-%d")

        assert date in [pre_execute_time_fmt, post_execute_time_fmt]
Example #11
def test_template_task_dag():
    dag = DAG(
        dag_id="dag",
        default_args=default_args,
        schedule_interval=None,
    )

    t1 = BashOperator(
        task_id="print_hello",
        bash_command="echo hello dagsir",
        dag=dag,
    )

    t2 = BashOperator(
        task_id="sleep",
        bash_command="sleep 2",
        dag=dag,
    )

    templated_command = """
    {% for i in range(5) %}
        echo '{{ ds }}'
        echo '{{ macros.ds_add(ds, 7)}}'
        echo '{{ params.my_param }}'
    {% endfor %}
    """

    t3 = BashOperator(
        task_id="templated",
        depends_on_past=False,
        bash_command=templated_command,
        params={"my_param": "Parameter I passed in"},
        dag=dag,
    )

    # pylint: disable=pointless-statement
    t1 >> [t2, t3]

    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager

    execution_date = get_current_datetime_in_utc()
    execution_date_add_one_week = execution_date + datetime.timedelta(days=7)
    execution_date_iso = execution_date.strftime("%Y-%m-%d")
    execution_date_add_one_week_iso = execution_date_add_one_week.strftime(
        "%Y-%m-%d")

    result = execute_pipeline(
        make_dagster_pipeline_from_airflow_dag(
            dag=dag, tags={AIRFLOW_EXECUTION_DATE_STR: execution_date_iso}),
        instance=instance,
    )

    compute_steps = [
        event.step_key for event in result.step_event_list
        if event.event_type == DagsterEventType.STEP_START
    ]

    assert compute_steps == [
        "airflow_print_hello.compute",
        "airflow_sleep.compute",
        "airflow_templated.compute",
    ]

    for step_key in compute_steps:
        compute_io_path = manager.get_local_path(result.run_id, step_key,
                                                 ComputeIOType.STDOUT)
        assert os.path.exists(compute_io_path)
        with open(compute_io_path, "r") as stdout_file:
            file_contents = normalize_file_content(stdout_file.read())

        if step_key == "airflow_print_hello.compute":
            assert file_contents.count(
                "INFO - Running command: echo hello dagsir\n") == 1
            assert file_contents.count(
                "INFO - Command exited with return code 0") == 1

        elif step_key == "airflow_sleep.compute":
            assert file_contents.count(
                "INFO - Running command: sleep 2\n") == 1
            assert file_contents.count("INFO - Output:\n") == 1
            assert file_contents.count(
                "INFO - Command exited with return code 0") == 1

        elif step_key == "airflow_templated.compute":
            assert (file_contents.count(
                "INFO - Running command: \n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n    \n".format(
                    execution_date_iso=execution_date_iso,
                    execution_date_add_one_week_iso=
                    execution_date_add_one_week_iso,
                )) == 1)
            assert (file_contents.count("INFO - {execution_date_iso}\n".format(
                execution_date_iso=execution_date_iso)) == 5)
            assert (file_contents.count(
                "INFO - {execution_date_add_one_week_iso}\n".format(
                    execution_date_add_one_week_iso=
                    execution_date_add_one_week_iso)) == 5)
            assert file_contents.count("INFO - Parameter I passed in\n") == 5
            assert file_contents.count(
                "INFO - Command exited with return code 0") == 1
Example #12
# start_repo_marker_0
from airflow_ingest.airflow_complex_dag import complex_dag
from airflow_ingest.airflow_simple_dag import simple_dag
from dagster_airflow.dagster_pipeline_factory import make_dagster_pipeline_from_airflow_dag

from dagster import repository

airflow_simple_dag = make_dagster_pipeline_from_airflow_dag(simple_dag)
airflow_complex_dag = make_dagster_pipeline_from_airflow_dag(complex_dag)


@repository
def airflow_ingest_example():
    return [airflow_complex_dag, airflow_simple_dag]


# end_repo_marker_0
Example #13
def test_complex_dag(snapshot):
    dag = DAG(dag_id="complex_dag",
              default_args=default_args,
              schedule_interval=None)

    # Create
    create_entry_group = DummyOperator(
        task_id="create_entry_group",
        dag=dag,
    )
    create_entry_group_result = DummyOperator(
        task_id="create_entry_group_result",
        dag=dag,
    )
    create_entry_group_result2 = DummyOperator(
        task_id="create_entry_group_result2",
        dag=dag,
    )
    create_entry_gcs = DummyOperator(
        task_id="create_entry_gcs",
        dag=dag,
    )
    create_entry_gcs_result = DummyOperator(
        task_id="create_entry_gcs_result",
        dag=dag,
    )
    create_entry_gcs_result2 = DummyOperator(
        task_id="create_entry_gcs_result2",
        dag=dag,
    )
    create_tag = DummyOperator(
        task_id="create_tag",
        dag=dag,
    )
    create_tag_result = DummyOperator(
        task_id="create_tag_result",
        dag=dag,
    )
    create_tag_result2 = DummyOperator(
        task_id="create_tag_result2",
        dag=dag,
    )
    create_tag_template = DummyOperator(
        task_id="create_tag_template",
        dag=dag,
    )
    create_tag_template_result = DummyOperator(
        task_id="create_tag_template_result",
        dag=dag,
    )
    create_tag_template_result2 = DummyOperator(
        task_id="create_tag_template_result2",
        dag=dag,
    )
    create_tag_template_field = DummyOperator(
        task_id="create_tag_template_field",
        dag=dag,
    )
    create_tag_template_field_result = DummyOperator(
        task_id="create_tag_template_field_result",
        dag=dag,
    )
    create_tag_template_field_result2 = DummyOperator(
        task_id="create_tag_template_field_result",
        dag=dag,
    )

    # Delete
    delete_entry = DummyOperator(
        task_id="delete_entry",
        dag=dag,
    )
    create_entry_gcs >> delete_entry
    delete_entry_group = DummyOperator(
        task_id="delete_entry_group",
        dag=dag,
    )
    create_entry_group >> delete_entry_group
    delete_tag = DummyOperator(
        task_id="delete_tag",
        dag=dag,
    )
    create_tag >> delete_tag
    delete_tag_template_field = DummyOperator(
        task_id="delete_tag_template_field",
        dag=dag,
    )
    delete_tag_template = DummyOperator(
        task_id="delete_tag_template",
        dag=dag,
    )

    # Get
    get_entry_group = DummyOperator(
        task_id="get_entry_group",
        dag=dag,
    )
    get_entry_group_result = DummyOperator(
        task_id="get_entry_group_result",
        dag=dag,
    )
    get_entry = DummyOperator(
        task_id="get_entry",
        dag=dag,
    )
    get_entry_result = DummyOperator(
        task_id="get_entry_result",
        dag=dag,
    )
    get_tag_template = DummyOperator(
        task_id="get_tag_template",
        dag=dag,
    )
    get_tag_template_result = DummyOperator(
        task_id="get_tag_template_result",
        dag=dag,
    )

    # List
    list_tags = DummyOperator(
        task_id="list_tags",
        dag=dag,
    )
    list_tags_result = DummyOperator(
        task_id="list_tags_result",
        dag=dag,
    )

    # Lookup
    lookup_entry = DummyOperator(
        task_id="lookup_entry",
        dag=dag,
    )
    lookup_entry_result = DummyOperator(
        task_id="lookup_entry_result",
        dag=dag,
    )

    # Rename
    rename_tag_template_field = DummyOperator(
        task_id="rename_tag_template_field",
        dag=dag,
    )

    # Search
    search_catalog = DummyOperator(
        task_id="search_catalog",
        dag=dag,
    )
    search_catalog_result = DummyOperator(
        task_id="search_catalog_result",
        dag=dag,
    )

    # Update
    update_entry = DummyOperator(
        task_id="update_entry",
        dag=dag,
    )
    update_tag = DummyOperator(
        task_id="update_tag",
        dag=dag,
    )
    update_tag_template = DummyOperator(
        task_id="update_tag_template",
        dag=dag,
    )
    update_tag_template_field = DummyOperator(
        task_id="update_tag_template_field",
        dag=dag,
    )

    # Create
    create_tasks = [
        create_entry_group,
        create_entry_gcs,
        create_tag_template,
        create_tag_template_field,
        create_tag,
    ]
    chain(*create_tasks)

    create_entry_group >> delete_entry_group
    create_entry_group >> create_entry_group_result
    create_entry_group >> create_entry_group_result2

    create_entry_gcs >> delete_entry
    create_entry_gcs >> create_entry_gcs_result
    create_entry_gcs >> create_entry_gcs_result2

    create_tag_template >> delete_tag_template_field
    create_tag_template >> create_tag_template_result
    create_tag_template >> create_tag_template_result2

    create_tag_template_field >> delete_tag_template_field
    create_tag_template_field >> create_tag_template_field_result
    create_tag_template_field >> create_tag_template_field_result2

    create_tag >> delete_tag
    create_tag >> create_tag_result
    create_tag >> create_tag_result2

    # Delete
    delete_tasks = [
        delete_tag,
        delete_tag_template_field,
        delete_tag_template,
        delete_entry_group,
        delete_entry,
    ]
    chain(*delete_tasks)

    # Get
    create_tag_template >> get_tag_template >> delete_tag_template
    get_tag_template >> get_tag_template_result

    create_entry_gcs >> get_entry >> delete_entry
    get_entry >> get_entry_result

    create_entry_group >> get_entry_group >> delete_entry_group
    get_entry_group >> get_entry_group_result

    # List
    create_tag >> list_tags >> delete_tag
    list_tags >> list_tags_result

    # Lookup
    create_entry_gcs >> lookup_entry >> delete_entry
    lookup_entry >> lookup_entry_result

    # Rename
    create_tag_template_field >> rename_tag_template_field >> delete_tag_template_field

    # Search
    chain(create_tasks, search_catalog, delete_tasks)
    search_catalog >> search_catalog_result

    # Update
    create_entry_gcs >> update_entry >> delete_entry
    create_tag >> update_tag >> delete_tag
    create_tag_template >> update_tag_template >> delete_tag_template
    create_tag_template_field >> update_tag_template_field >> rename_tag_template_field

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(
                    dag)).dep_structure_snapshot))