Esempio n. 1
0
def make_training():
    """Build and return the training DAG.

    The DAG contains a "get" task that returns the training data, the
    feature tasks added by ``add_features`` and a "fit" task that runs after
    the "join" task. Products are written under the ``output`` directory.
    """
    # Python does not like multiprocessing with functions defined in the main
    # module (it works when they are defined in a different one), so tasks
    # are built in the current process
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    out_dir = Path('output')

    # "get" task: returns the training data
    PythonCallable(get,
                   File(out_dir / 'get.csv'),
                   dag,
                   unserializer=unserializer,
                   serializer=serializer)

    # feature engineering tasks
    add_features(dag)

    # "fit" task: model training, executed after the features are joined
    model_task = PythonCallable(fit, File(out_dir / 'model.pickle'), dag)
    dag['join'] >> model_task

    return dag
Esempio n. 2
0
def add_features(dag):
    """
    Add the feature engineering tasks to the given DAG and return it. The DAG
    must already contain a task named "get" that returns the input data.
    """
    out_dir = Path('output')
    source = dag['get']

    # every feature task shares the same (un)serialization functions
    io_kwargs = {'serializer': serializer, 'unserializer': unserializer}

    # create the tasks
    first = PythonCallable(a_feature, File(out_dir / 'a_feature.csv'), dag,
                           **io_kwargs)
    second = PythonCallable(another, File(out_dir / 'another.csv'), dag,
                            **io_kwargs)
    joined = PythonCallable(join, File(out_dir / 'join.csv'), dag,
                            **io_kwargs)

    # wire them: both features depend on "get"; "join" depends on all three
    source >> first
    source >> second
    (source + first + second) >> joined

    return dag
Esempio n. 3
0
def test_sucessful_execution(executor, tmp_directory):
    """Build a five-task DAG twice: first run executes, second run skips."""
    dag = DAG(executor=executor)
    root = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    left = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    right = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    last = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    # independent task, not connected to the others
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    root >> left
    root >> right
    (left + right) >> last

    dag.build()

    # every product must have been created
    assert Path('ok.txt').exists()
    assert Path('a_file.txt').exists()
    assert Path('another_file.txt').exists()
    assert Path('yet_another_file.txt').exists()
    assert Path('file.txt').exists()

    statuses = {task.exec_status for task in dag.values()}
    assert statuses == {TaskStatus.Executed}
    outdated = {task.product._is_outdated() for task in dag.values()}
    assert outdated == {False}

    # second build: nothing runs because everything is up-to-date
    dag.build()

    statuses = {task.exec_status for task in dag.values()}
    assert statuses == {TaskStatus.Skipped}
Esempio n. 4
0
def test_copy(copy):
    """Check whether upstream outputs are shared or copied by InMemoryDAG."""
    def _assign_upstream(upstream):
        # keep a reference to what was received so it can be inspected later
        _assign_upstream.obj = upstream
        return 42

    source_dag = DAG()

    root_params = {'input_data': {'x': [0, 0, 0]}}
    root = PythonCallable(_root,
                          File('root.parquet'),
                          source_dag,
                          name='root',
                          serializer=serializer,
                          params=root_params)

    task = PythonCallable(_assign_upstream,
                          File('task.parquet'),
                          source_dag,
                          name='task',
                          serializer=serializer,
                          unserializer=unserializer)

    root >> task

    in_memory = InMemoryDAG(source_dag)
    out = in_memory.build({'root': {'x': [1]}}, copy=copy)

    # with copy disabled, _assign_upstream must have received the very same
    # object that the root task returned; when copying, a different one
    same_object = _assign_upstream.obj['root'] is out['root']
    assert same_object is (not copy)
Esempio n. 5
0
def test_hot_reload(backup_test_pkg, tmp_directory):
    """With hot_reload on, source rewritten before build must be the one run."""
    configurator = DAGConfigurator()
    configurator.params.hot_reload = True
    dag = configurator.create()

    root = PythonCallable(functions.touch_root, File('file1.txt'), dag)
    downstream = PythonCallable(functions.touch_upstream, File('file2.txt'),
                                dag)
    root >> downstream

    # overwrite the module that defines the task functions after the tasks
    # were declared
    source_new = """
from pathlib import Path

def touch_root(product):
    Path(str(product)).write_text("hi")

def touch_upstream(product, upstream):
    Path(str(product)).write_text("hello")
    """
    Path(backup_test_pkg, 'functions.py').write_text(source_new)

    dag.build()

    # products must contain the text written by the new function bodies
    assert Path('file1.txt').read_text() == 'hi'
    assert Path('file2.txt').read_text() == 'hello'
Esempio n. 6
0
def test_cycle_exception():
    """Building a DAG whose two tasks depend on each other raises DAGCycle."""
    dag = DAG()
    first = PythonCallable(touch_root, File(Path("a.txt")), dag, "ta")
    second = PythonCallable(touch, File(Path("b.txt")), dag, "tb")

    # first -> second -> first closes the loop
    first >> second >> first

    with pytest.raises(DAGCycle):
        dag.build()
Esempio n. 7
0
def test_runs_on_finish(executor, tmp_directory):
    """Each on_finish hook is called once; the on_failure hook is not."""
    # reset the call counters stored on the hook functions
    for a_hook in (hook, hook_2, hook_3, hook_4):
        a_hook.count = 0

    dag = DAG(executor=executor)

    first = PythonCallable(fn, File('file1.txt'), dag, 't')
    first.on_finish = hook
    first.on_failure = hook_4

    second = PythonCallable(touch_w_upstream, File('file2'), dag, 't2')
    second.on_finish = hook_2

    # third task is not connected to the other two
    third = PythonCallable(fn, File('file3'), dag, 't3')
    third.on_finish = hook_3

    first >> second

    dag.build()

    assert hook.count == 1
    assert hook_2.count == 1
    assert hook_3.count == 1
    assert hook_4.count == 0
Esempio n. 8
0
def dag():
    """Return a two-task DAG (root -> task) with a local File client."""
    # tasks are built in the current process, not in a subprocess
    pipeline = DAG(executor=Serial(build_in_subprocess=False))
    pipeline.clients[File] = LocalStorageClient('remote',
                                                path_to_project_root='.')

    upstream_task = PythonCallable(_touch,
                                   File('root'),
                                   dag=pipeline,
                                   name='root')
    downstream_task = PythonCallable(_touch_upstream,
                                     File('file'),
                                     dag=pipeline,
                                     name='task')
    upstream_task >> downstream_task

    return pipeline
Esempio n. 9
0
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """An independent task still produces its file when another chain fails."""
    dag = DAG(executor=executor)

    # chain: failing root -> failing task -> touch task
    failing_head = PythonCallable(failing_root,
                                  File('t_fail'),
                                  dag,
                                  name='t_fail')
    failing_middle = PythonCallable(failing,
                                    File('t_fail_downstream'),
                                    dag,
                                    name='t_fail_downstream')
    chain_tail = PythonCallable(touch,
                                File('t_touch_aborted'),
                                dag,
                                name='t_touch_aborted')
    failing_head >> failing_middle >> chain_tail

    # task that does not depend on the chain above
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    # a build error is tolerated here; only the resulting files are checked
    try:
        dag.build(force=True)
    except DAGBuildError:
        pass

    assert not Path('t_fail').exists()
    assert not Path('t_fail_downstream').exists()
    assert Path('t_ok').exists()
Esempio n. 10
0
def test_params_are_copied_upon_initialization():
    """Two tasks created from the same params dict must not share params."""
    dag = DAG()
    shared = {'a': 1}

    first = PythonCallable(touch, File('file'), dag, name='t1', params=shared)
    second = PythonCallable(touch, File('file'), dag, name='t2', params=shared)

    assert first.params is not second.params
Esempio n. 11
0
def _make_dag_with_upstream():
    """Return a DAG with tasks 'root' -> 3 plus an unconnected task 2."""
    # build in the current process so that the mock object is called
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root = PythonCallable(_touch, File('1.txt'), dag=dag, name='root')
    # task without dependencies
    PythonCallable(_touch, File('2.txt'), dag=dag, name=2)
    child = PythonCallable(_touch_upstream, File('3.txt'), dag=dag, name=3)

    root >> child
    return dag
Esempio n. 12
0
def test_building_a_single_task_when_rendered_upstream():
    """A task with upstream can be built directly once the DAG is rendered."""
    dag = DAG()
    parent = PythonCallable(touch, File('1.txt'), dag, name=1)
    child = PythonCallable(touch_w_upstream, File('2.txt'), dag, name=2)
    parent >> child

    # render first, then build only the downstream task
    dag.render()
    child.build()
Esempio n. 13
0
def test_build_partially_with_wildcard(tmp_directory):
    """build_partially('a-*') creates a-1.txt and a-2.txt but not b.txt."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # three independent tasks; each product file is named after its task
    for task_name in ('a-1', 'a-2', 'b'):
        PythonCallable(touch_root,
                       File(task_name + '.txt'),
                       dag,
                       name=task_name)

    dag.build_partially('a-*')

    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    assert not Path('b.txt').exists()
Esempio n. 14
0
def test_parallel_execution(tmp_directory):
    """Build a small DAG created with executor='parallel'."""
    dag = DAG('dag', executor='parallel')

    # two roots feeding "b", which in turn feeds "c"
    root_one = PythonCallable(touch_root, File('a1.txt'), dag, 'a1')
    root_two = PythonCallable(touch_root, File('a2.txt'), dag, 'a2')
    middle = PythonCallable(touch, File('b.txt'), dag, 'b')
    final = PythonCallable(touch, File('c.txt'), dag, 'c')

    (root_one + root_two) >> middle >> final

    dag.build()
Esempio n. 15
0
def test_warnings_are_shown(tmp_directory):
    """Warnings raised by tasks during build surface as a single warning.

    Two chained tasks each emit a warning while running; after ``dag.build()``
    exactly one warning must have been recorded and its message must contain
    the text of both.
    """
    # build in the current process, not in a subprocess
    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    t2 = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    t1 >> t2

    # pytest.warns(None) is deprecated in pytest 7 and rejected by pytest 8;
    # capturing the base Warning class records the same warnings
    with pytest.warns(Warning) as record:
        dag.build()

    assert len(record) == 1
    message = str(record[0].message)
    assert 'This is a warning' in message
    assert 'This is another warning' in message
Esempio n. 16
0
def _make_dag_with_two_upstream():
    """Return a DAG where 'task' depends on both 'root' and 'another'."""
    # tasks are built in the current process, not in a subprocess
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    first_parent = PythonCallable(_touch, File('root'), dag=dag, name='root')
    second_parent = PythonCallable(_touch,
                                   File('another'),
                                   dag=dag,
                                   name='another')
    child = PythonCallable(_touch_upstream,
                           File('file.txt'),
                           dag=dag,
                           name='task')

    (first_parent + second_parent) >> child
    return dag
Esempio n. 17
0
def test_tracebacks_are_shown_for_all_on_build_failing_tasks(executor):
    """The build error report must mention every task that failed."""
    dag = DAG(executor=executor)
    # two independent tasks that both fail
    PythonCallable(failing_root, File('a_file.txt'), dag, name='t1')
    PythonCallable(failing_root, File('another_file.txt'), dag, name='t2')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    # excinfo.getrepr() returns full text of chained exceptions
    report = str(excinfo.getrepr())
    assert "PythonCallable: t1 -> File('a_file.txt')" in report
    assert "PythonCallable: t2 -> File('another_file.txt')" in report
Esempio n. 18
0
def test_keeps_folder_layout(tmp_directory):
    """Files in the 'backup' client folder mirror the products' layout."""
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[File] = LocalStorageClient('backup', path_to_project_root='.')

    # one product at the top level, one inside a sub-directory
    Path('dir').mkdir()
    PythonCallable(_touch, File('file'), dag, name='task')
    PythonCallable(_touch, File('dir/nested'), dag, name='nested')

    dag.build()

    backup = Path('backup')
    assert (backup / 'dir' / 'nested').is_file()
    assert (backup / 'dir' / '.nested.metadata').is_file()
    assert (backup / 'file').is_file()
    assert (backup / '.file.metadata').is_file()
Esempio n. 19
0
def test_task_status_and_output_when_on_finish_crashes(tmp_directory):
    """A crashing on_finish hook errors the task and aborts its downstream."""
    dag = DAG()

    crashing = PythonCallable(fn, File('file'), dag)
    crashing.on_finish = hook_crashing
    dependent = PythonCallable(touch_w_upstream, File('file2'), dag)
    crashing >> dependent

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    assert crashing.exec_status == TaskStatus.Errored
    assert dependent.exec_status == TaskStatus.Aborted

    # the error report must identify the task whose hook crashed
    report = str(excinfo.getrepr())
    assert "PythonCallable: fn -> File('file')" in report
Esempio n. 20
0
def make_larger_dag_with_client():
    """Return a three-task chain (root -> task -> another) with a client."""
    # tasks are built in the current process, not in a subprocess
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    head = PythonCallable(touch_root, File('out/root'), dag=dag, name='root')
    middle = PythonCallable(touch, File('out/file'), dag=dag, name='task')
    tail = PythonCallable(touch,
                          File('out/another'),
                          dag=dag,
                          name='another')

    head >> middle >> tail
    return dag
Esempio n. 21
0
def test_building_a_single_task_when_has_unrendered_upstream():
    """Building a task with upstream before rendering raises TaskBuildError."""
    dag = DAG()
    parent = PythonCallable(touch, File('1.txt'), dag, name=1)
    child = PythonCallable(touch_w_upstream, File('2.txt'), dag, name=2)
    parent >> child

    # no dag.render() call on purpose
    with pytest.raises(TaskBuildError) as excinfo:
        child.build()

    expected = ('Cannot directly build task "2" as it has upstream dependencies'
                ', call dag.render() first')
    assert str(excinfo.value) == expected
Esempio n. 22
0
def test_unserializes_upstream_metaproduct(tmp_directory):
    """Build a DAG whose upstream task declares a dict of products."""
    dag = DAG(executor=Serial(build_in_subprocess=False))
    # unserializer set at the DAG level
    dag.unserializer = metaproduct_unserializer

    products = {'one': File('one'), 'another': File('another')}
    first = PythonCallable(touch_meta, products, dag=dag, name='first')
    last = PythonCallable(touch_with_first_as_upstream, File('last'), dag=dag)
    first >> last

    dag.build()
Esempio n. 23
0
def test_duplicated_files_one_absolute():
    """Same file given as relative and absolute path is a duplicate product."""
    dag = DAG()
    relative = File('a')
    absolute = File(Path('a').resolve())
    PythonCallable(touch_root, relative, dag, name='task')
    PythonCallable(touch_root, absolute, dag, name='another')

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    expected = ("Tasks must generate unique Products. "
                "The following Products appear in more than one task "
                "{File('a'): ['task', 'another']}")

    assert str(excinfo.value) == expected
Esempio n. 24
0
def dag():
    """Return a DAG with two no-op tasks: 'first' -> 'second'."""
    def fn1(product):
        pass

    def fn2(upstream, product):
        pass

    pipeline = DAG()
    first = PythonCallable(fn1, File('file1.txt'), pipeline, name='first')
    second = PythonCallable(fn2, File('file2.txt'), pipeline, name='second')
    first >> second

    return pipeline
Esempio n. 25
0
def test_dag_task_status_life_cycle(executor, tmp_directory):
    """
    Verify DAG and task status across calls to DAG.render and DAG.build.
    DAG and Task status are updated automatically and propagated downstream
    when rendering and building, but this test is parametrized over executors
    because the object that gets updated might not be the one declared here
    (that is the case when a task runs in a different process); it is
    therefore the executor's responsibility to notify tasks on success/fail
    scenarios so that downstream tasks are updated correctly
    """
    dag = DAG(executor=executor)
    ok_one = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    broken = PythonCallable(failing_root, File('a_file.txt'), dag, name='t2')
    after_broken = PythonCallable(touch,
                                  File('another_file.txt'),
                                  dag,
                                  name='t3')
    last_in_chain = PythonCallable(touch,
                                   File('yet_another_file.txt'),
                                   dag,
                                   name='t4')
    ok_two = PythonCallable(touch_root, File('file.txt'), dag, name='t5')
    broken >> after_broken >> last_in_chain

    # before rendering
    assert dag._exec_status == DAGStatus.WaitingRender
    initial = {task.exec_status for task in dag.values()}
    assert {TaskStatus.WaitingRender} == initial

    # after the first render
    dag.render()

    assert dag._exec_status == DAGStatus.WaitingExecution
    assert ok_one.exec_status == TaskStatus.WaitingExecution
    assert broken.exec_status == TaskStatus.WaitingExecution
    assert after_broken.exec_status == TaskStatus.WaitingUpstream
    assert last_in_chain.exec_status == TaskStatus.WaitingUpstream
    assert ok_two.exec_status == TaskStatus.WaitingExecution

    # after building; a build error is tolerated, statuses are checked below
    try:
        dag.build()
    except DAGBuildError:
        pass

    assert dag._exec_status == DAGStatus.Errored
    assert ok_one.exec_status == TaskStatus.Executed
    assert broken.exec_status == TaskStatus.Errored
    assert after_broken.exec_status == TaskStatus.Aborted
    assert last_in_chain.exec_status == TaskStatus.Aborted
    assert ok_two.exec_status == TaskStatus.Executed

    # after rendering again
    dag.render()

    assert dag._exec_status == DAGStatus.WaitingExecution
    assert ok_one.exec_status == TaskStatus.Skipped
    assert broken.exec_status == TaskStatus.WaitingExecution
    assert after_broken.exec_status == TaskStatus.WaitingUpstream
    assert last_in_chain.exec_status == TaskStatus.WaitingUpstream
    assert ok_two.exec_status == TaskStatus.Skipped
Esempio n. 26
0
def make_dag_with_client_and_metaproduct():
    """Return a DAG whose 'root' task declares a dict of two File products."""
    # tasks are built in the current process, not in a subprocess
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root_products = {
        'root': File('out/root'),
        'another': File('out/another'),
    }
    parent = PythonCallable(touch_root_with_metaproduct,
                            root_products,
                            dag=dag,
                            name='root')
    child = PythonCallable(touch, File('file'), dag=dag, name='task')

    parent >> child
    return dag
Esempio n. 27
0
def test_warnings_are_shown(tmp_directory):
    """Warnings raised by on_render hooks surface as a single warning.

    Two chained tasks get an ``on_render`` hook each; after ``dag.render()``
    exactly one warning must have been recorded and its message must contain
    the text of both.
    """
    dag = DAG()
    t1 = PythonCallable(touch_root, File('file.txt'), dag)
    t2 = PythonCallable(touch, File('file2.txt'), dag)
    t1.on_render = on_render_1
    t2.on_render = on_render_2
    t1 >> t2

    # pytest.warns(None) is deprecated in pytest 7 and rejected by pytest 8;
    # capturing the base Warning class records the same warnings
    with pytest.warns(Warning) as record:
        dag.render()

    assert len(record) == 1
    message = str(record[0].message)
    assert 'This is a warning' in message
    assert 'This is another warning' in message
Esempio n. 28
0
def test_build_partially_with_wildcard_skip_upstream(tmp_directory):
    """With skip_upstream=True, the upstream of a matched task is not built."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # "a-1" has an upstream task that does not match the wildcard
    upstream = PythonCallable(touch_root, File('root.txt'), dag, name='root')
    matched = PythonCallable(touch, File('a-1.txt'), dag, name='a-1')
    upstream >> matched

    # independent tasks: one matches the wildcard, the other does not
    PythonCallable(touch_root, File('a-2.txt'), dag, name='a-2')
    PythonCallable(touch_root, File('b.txt'), dag, name='b')

    dag.build_partially('a-*', skip_upstream=True)

    assert not Path('root.txt').exists()
    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    assert not Path('b.txt').exists()
Esempio n. 29
0
def test_dag_render_step_by_step_w_skipped(tmp_directory):
    """Build tasks one at a time, re-rendering and checking statuses each time.

    The DAG is a diamond: t1 feeds t21 and t22, which both feed t3.
    """
    dag = DAG()

    t1 = PythonCallable(touch_root, File('t1.txt'), dag, name='t1')
    t21 = PythonCallable(touch, File('t21.txt'), dag, name='t21')
    t22 = PythonCallable(touch, File('t22.txt'), dag, name='t22')
    t3 = PythonCallable(touch, File('t3.txt'), dag, name='t3')

    t1 >> t21
    t1 >> t22
    (t21 + t22) >> t3

    def _statuses():
        # current status of (t1, t21, t22, t3), in that order
        return (t1.exec_status, t21.exec_status, t22.exec_status,
                t3.exec_status)

    skipped = TaskStatus.Skipped
    waiting_execution = TaskStatus.WaitingExecution
    waiting_upstream = TaskStatus.WaitingUpstream

    assert {task.exec_status
            for task in dag.values()} == {TaskStatus.WaitingRender}

    # build the root, then render again
    dag.render()
    t1.build()
    dag.render()

    assert _statuses() == (skipped, waiting_execution, waiting_execution,
                           waiting_upstream)

    # build the first branch
    t21.build()
    dag.render()

    assert _statuses() == (skipped, skipped, waiting_execution,
                           waiting_upstream)

    # build the second branch
    t22.build()
    dag.render()

    assert _statuses() == (skipped, skipped, skipped, waiting_execution)

    # build the last task
    t3.build()
    dag.render()

    assert _statuses() == (skipped, skipped, skipped, skipped)
Esempio n. 30
0
def dag():
    """Return a DAG with one task without dependencies feeding another."""
    pipeline = DAG()

    independent = PythonCallable(touch,
                                 File('1.txt'),
                                 dag=pipeline,
                                 name='without_dependencies')
    # downstream task, also receives a parameter
    dependent = PythonCallable(touch_with_upstream,
                               File('2.txt'),
                               dag=pipeline,
                               name='with_dependencies',
                               params={'param': 42})
    independent >> dependent

    return pipeline