Example 1
def test_task_change_in_status():
    dag = DAG('dag')

    ta = ShellScript('echo "a" > {{product}}', File('a.txt'), dag, 'ta')
    tb = ShellScript('cat {{upstream["ta"]}} > {{product}}', File('b.txt'),
                     dag, 'tb')
    tc = ShellScript('cat {{upstream["tb"]}} > {{product}}', File('c.txt'),
                     dag, 'tc')

    assert all([t._status == TaskStatus.WaitingRender for t in [ta, tb, tc]])

    ta >> tb >> tc

    dag.render()

    assert (ta._status == TaskStatus.WaitingExecution
            and tb._status == TaskStatus.WaitingUpstream
            and tc._status == TaskStatus.WaitingUpstream)

    ta.build()

    assert (ta._status == TaskStatus.Executed
            and tb._status == TaskStatus.WaitingExecution
            and tc._status == TaskStatus.WaitingUpstream)

    tb.build()

    assert (ta._status == TaskStatus.Executed
            and tb._status == TaskStatus.Executed
            and tc._status == TaskStatus.WaitingExecution)

    tc.build()

    assert all([t._status == TaskStatus.Executed for t in [ta, tb, tc]])
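
These snippets are excerpts from Ploomber's test suite, so the imports are omitted. As a rough sketch, the examples in this section rely on imports along these lines (module paths follow Ploomber's public layout, but treat the exact paths as an assumption since they may vary across versions):

# sketch of the imports the surrounding examples assume; paths may vary
# across Ploomber versions
from ploomber import DAG
from ploomber.products import File
from ploomber.tasks import PythonCallable, ShellScript, NotebookRunner, SQLDump
from ploomber.constants import TaskStatus, DAGStatus
from ploomber.executors import Serial, Parallel
from ploomber.clients import SQLAlchemyClient
from ploomber.exceptions import DAGRenderError, DAGBuildError, TaskRenderError
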
Example 2
def process_readme_md(folders, parent_dir='.', force=False):
    """
    Process README.md files from given folders, executes them inline
    """
    dag = DAG()

    files = [Path(parent_dir, folder, 'README.md') for folder in folders]

    for f in files:
        make_task(dag, f)

    dag.render()

    # clear the output of tasks that will be executed; otherwise, commands
    # such as "ploomber build" will not run anything
    for t in dag.values():
        out = Path(str(t.product)).parent / 'output'
        if t.exec_status == TaskStatus.WaitingExecution and Path(out).exists():
            print(f'Deleting {out}')
            shutil.rmtree(out)

    print(dag.build(force=force))

    for task_name in dag:
        task = dag[task_name]

        if task.exec_status == TaskStatus.Executed:
            post_process_nb(str(task.product))
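
For context, a hypothetical invocation of the helper above (the folder names and parent directory are illustrative, not taken from the original code):

# hypothetical call; folder names are illustrative only
process_readme_md(['example-a', 'example-b'], parent_dir='projects', force=True)
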
Example 3
def test_sqldump_does_not_required_product_tag(tmp_directory):
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))
    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    # create the task and render it
    dag = DAG()

    # pass templated SQL so it's treated as a placeholder; this forces
    # the render step
    SQLDump('SELECT * FROM numbers LIMIT {{limit}}',
            File(out),
            dag,
            name='dump.csv',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO,
            params={'limit': 10})

    dag.render()
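
The test above stops at dag.render(), which is enough to exercise the {{limit}} placeholder. As an optional follow-up sketch (not part of the original test), one could also execute the dump and check that the rendered LIMIT was applied:

# optional continuation of the snippet above (assumption, not in the test):
# build the DAG and verify the dump honors LIMIT 10
dag.build()
assert len(pd.read_csv(out)) == 10
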
Example 4
def test_upstream_and_me_are_added():
    dag = DAG()
    product = File('file.txt')
    t = PythonCallable(fn, product, dag, 'callable', params=dict(a=1))
    dag.render()

    assert t.params['product'] is product
Example 5
def test_ignores_declared_product_and_upstream(tmp_directory, code):
    path = Path('sample.py')

    path.write_text(code)

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag)
    dag.render()
Example 6
def test_building_a_single_task_when_rendered_upstream():
    dag = DAG()
    t1 = PythonCallable(touch, File('1.txt'), dag, name=1)
    t2 = PythonCallable(touch_w_upstream, File('2.txt'), dag, name=2)

    t1 >> t2

    dag.render()
    t2.build()
Example 7
def test_upstream_and_me_are_added():
    dag = DAG()
    t = PythonCallable(fn, File('file.txt'), dag, 'callable',
                       params=dict(a=1))
    dag.render()

    p = t.params.copy()
    p['product'] = str(p['product'])
    assert p == dict(a=1, product='file.txt')
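
Several examples pass small helper callables (fn, touch, touch_root) defined elsewhere in the test module. A hedged sketch of what they presumably look like, inferred from the assertions above: Ploomber injects product (and upstream, when the task has dependencies) into the callable's arguments, so the helpers must declare those parameters.

from pathlib import Path

# hypothetical helpers matching the signatures the tests imply
def touch_root(product):
    # root task: receives only the injected product
    Path(str(product)).touch()

def touch(product, upstream):
    # downstream task: also receives the upstream products
    Path(str(product)).touch()

def fn(product, a):
    # used with params=dict(a=1); declaring `a` explicitly is why
    # params=dict(non_param=1) fails at render time in a later example
    Path(str(product)).touch()
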
Example 8
def test_on_render_exceptions_are_logged(executor, caplog):
    dag = DAG(executor=executor)
    t = PythonCallable(fn, File('file.txt'), dag, name='t')
    t.on_render = hook_crashing

    with caplog.at_level(logging.ERROR):
        with pytest.raises(DAGRenderError):
            dag.render()

    assert 'Exception when running on_render for task "t"' in caplog.text
Example 9
def test_on_render_hook_is_executed(tmp_directory):
    hook.count = 0

    dag = DAG()
    PythonCallable(touch_root, File('file.txt'), dag, name='t')
    dag.on_render = hook

    dag.render()

    assert hook.count == 1
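
The hook helper is defined elsewhere; a hypothetical definition consistent with the count bookkeeping above (Ploomber passes hooks only the arguments they declare, so a zero-argument callable works):

# hypothetical definition of `hook`
def hook():
    hook.count += 1
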
Example 10
def test_develop_error_if_r_notebook(tmp_sample_tasks):
    dag = DAG()

    t = NotebookRunner(Path('sample.R'), product=File('out.ipynb'), dag=dag)

    dag.render()

    with pytest.raises(NotImplementedError):
        t.develop()

    with pytest.raises(NotImplementedError):
        t.debug()
Example 11
def test_render_error_on_missing_upstream(tmp_directory, code):
    path = Path('sample.py')
    path.write_text(code)

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    expected = ("undefined name 'upstream'. Did you forget"
                " to declare upstream dependencies?")
    assert expected in str(excinfo.value)
Example 12
def test_catches_signature_errors_at_render_time():
    dag = DAG()
    t = PythonCallable(fn,
                       File('file.txt'),
                       dag,
                       'callable',
                       params=dict(non_param=1))

    with pytest.raises(TaskRenderError):
        t.render()

    with pytest.raises(DAGRenderError):
        dag.render()
Example 13
def test_duplicated_files_one_absolute():
    dag = DAG()
    PythonCallable(touch_root, File('a'), dag, name='task')
    PythonCallable(touch_root, File(Path('a').resolve()), dag, name='another')

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    expected = ("Tasks must generate unique Products. "
                "The following Products appear in more than one task "
                "{File('a'): ['task', 'another']}")

    assert expected == str(excinfo.value)
Example 14
def test_dag_task_status_life_cycle(executor, tmp_directory):
    """
    Check dag and task status along calls to DAG.render and DAG.build.
    Although DAG and Task status are automatically updated and propagated
    downstream upon calls to render and build, we have to parametrize this
    over executors since the object that gets updated might not be the same
    one that we declared here (this happens when a task runs in a different
    process), hence, it is the executor's responsibility to notify tasks
    on sucess/fail scenarios so downstream tasks are updated correctly
    """
    dag = DAG(executor=executor)
    t1 = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    t2 = PythonCallable(failing_root, File('a_file.txt'), dag, name='t2')
    t3 = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    t4 = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    t5 = PythonCallable(touch_root, File('file.txt'), dag, name='t5')
    t2 >> t3 >> t4

    assert dag._exec_status == DAGStatus.WaitingRender
    assert {TaskStatus.WaitingRender
            } == set([t.exec_status for t in dag.values()])

    dag.render()

    assert dag._exec_status == DAGStatus.WaitingExecution
    assert t1.exec_status == TaskStatus.WaitingExecution
    assert t2.exec_status == TaskStatus.WaitingExecution
    assert t3.exec_status == TaskStatus.WaitingUpstream
    assert t4.exec_status == TaskStatus.WaitingUpstream
    assert t5.exec_status == TaskStatus.WaitingExecution

    try:
        dag.build()
    except DAGBuildError:
        pass

    assert dag._exec_status == DAGStatus.Errored
    assert t1.exec_status == TaskStatus.Executed
    assert t2.exec_status == TaskStatus.Errored
    assert t3.exec_status == TaskStatus.Aborted
    assert t4.exec_status == TaskStatus.Aborted
    assert t5.exec_status == TaskStatus.Executed

    dag.render()

    assert dag._exec_status == DAGStatus.WaitingExecution
    assert t1.exec_status == TaskStatus.Skipped
    assert t2.exec_status == TaskStatus.WaitingExecution
    assert t3.exec_status == TaskStatus.WaitingUpstream
    assert t4.exec_status == TaskStatus.WaitingUpstream
    assert t5.exec_status == TaskStatus.Skipped
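
The executor argument comes from a pytest fixture that is not shown. A plausible parametrization, assuming Ploomber's Serial and Parallel executors (the real test suite may parametrize differently):

import pytest
from ploomber.executors import Serial, Parallel

# hypothetical fixture backing the `executor` argument used above
@pytest.fixture(params=[
    Serial(build_in_subprocess=False),
    Serial(build_in_subprocess=True),
    Parallel(),
])
def executor(request):
    return request.param
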
Example 15
def test_warnings_are_shown(tmp_directory):
    dag = DAG()
    t1 = PythonCallable(touch_root, File('file.txt'), dag)
    t2 = PythonCallable(touch, File('file2.txt'), dag)
    t1.on_render = on_render_1
    t2.on_render = on_render_2
    t1 >> t2

    with pytest.warns(None) as record:
        dag.render()

    assert len(record) == 1
    assert 'This is a warning' in str(record[0].message)
    assert 'This is another warning' in str(record[0].message)
Example 16
def test_change_static_analysis(tmp_sample_tasks):
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # static_analysis is True by default, so rendering would fail
    t = NotebookRunner(Path('sample.ipynb'),
                       File('out.ipynb'),
                       dag,
                       params=dict(a=1, b=2))

    # disable it
    t.static_analysis = False

    # this should work
    dag.render()
Example 17
def test_dag_render_step_by_step_w_skipped(tmp_directory):
    dag = DAG()

    t1 = PythonCallable(touch_root, File('t1.txt'), dag, name='t1')
    t21 = PythonCallable(touch, File('t21.txt'), dag, name='t21')
    t22 = PythonCallable(touch, File('t22.txt'), dag, name='t22')
    t3 = PythonCallable(touch, File('t3.txt'), dag, name='t3')

    t1 >> t21
    t1 >> t22

    (t21 + t22) >> t3

    assert (set(t.exec_status
                for t in dag.values()) == {TaskStatus.WaitingRender})

    dag.render()
    t1.build()

    dag.render()

    assert t1.exec_status == TaskStatus.Skipped
    assert t21.exec_status == TaskStatus.WaitingExecution
    assert t22.exec_status == TaskStatus.WaitingExecution
    assert t3.exec_status == TaskStatus.WaitingUpstream

    t21.build()
    dag.render()

    assert t1.exec_status == TaskStatus.Skipped
    assert t21.exec_status == TaskStatus.Skipped
    assert t22.exec_status == TaskStatus.WaitingExecution
    assert t3.exec_status == TaskStatus.WaitingUpstream

    t22.build()
    dag.render()

    assert t1.exec_status == TaskStatus.Skipped
    assert t21.exec_status == TaskStatus.Skipped
    assert t22.exec_status == TaskStatus.Skipped
    assert t3.exec_status == TaskStatus.WaitingExecution

    t3.build()
    dag.render()

    assert t1.exec_status == TaskStatus.Skipped
    assert t21.exec_status == TaskStatus.Skipped
    assert t22.exec_status == TaskStatus.Skipped
    assert t3.exec_status == TaskStatus.Skipped
Example 18
def test_dag_r(tmp_directory):
    path = Path('sample.R')

    path.write_text("""
# + tags=["parameters"]
a <- NULL
b <- 1
c <- c(1, 2, 3)
""")

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag, params=dict(z=1))

    # parameter extraction is not implemented but should not raise an error
    dag.render()
Example 19
def test_render_error_on_syntax_error(tmp_directory):
    path = Path('sample.py')

    path.write_text("""
# + tags=["parameters"]
if
""")

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert 'invalid syntax\n\nif\n\n  ^\n' in str(excinfo.value)
Example 20
def test_dag_without_client(monkeypatch, tmp_directory):
    mock = Mock(wraps=dag_module.fetch_remote_metadata_in_parallel)
    monkeypatch.setattr(dag_module, 'fetch_remote_metadata_in_parallel', mock)
    mock_remote = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        mock_remote)

    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch_root, File('one'), dag=dag)

    dag.render()

    # should call it
    mock.assert_called_once_with(dag)
    # but should not call remotes
    mock_remote.assert_not_called()
Example 21
def test_forced_render(monkeypatch):
    """
    Forced render should not call Product._is_oudated. For products whose
    metadata is stored remotely this is an expensive operation
    """
    dag = DAG()
    t1 = PythonCallable(touch_root, File('1.txt'), dag, name=1)
    t2 = PythonCallable(touch, File('2.txt'), dag, name=2)
    t1 >> t2

    def patched(self, outdated_by_code):
        raise ValueError

    monkeypatch.setattr(File, '_is_outdated', patched)

    dag.render(force=True)
Example 22
def test_skip_kernel_install_check(tmp_directory):
    dag = DAG()

    code = """
# + tags=["parameters"]
1 + 1
    """

    NotebookRunner(code,
                   product=File(Path(tmp_directory, 'out.ipynb')),
                   dag=dag,
                   kernelspec_name='unknown_kernel',
                   ext_in='py',
                   name='nb',
                   check_if_kernel_installed=False)
    dag.render()
Example 23
def test_forced_render_does_not_call_is_outdated(monkeypatch):
    """
    For products whose metadata is stored remotely, checking status is an
    expensive operation. Make dure forced render does not call
    Product._is_oudated
    """
    dag = DAG()
    t1 = PythonCallable(touch_root, File('1.txt'), dag, name=1)
    t2 = PythonCallable(touch, File('2.txt'), dag, name=2)
    t1 >> t2

    def _is_outdated(self, outdated_by_code):
        raise ValueError(f'Called _is_outdated on {self}')

    monkeypatch.setattr(File, '_is_outdated', _is_outdated)

    dag.render(force=True)
Example 24
def test_render_error_on_undefined_name_error(tmp_directory):
    path = Path('sample.py')

    path.write_text("""
# + tags=["parameters"]

# +
df.head()
""")

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert "undefined name 'df'" in str(excinfo.value)
Example 25
def test_task_grouping():
    dag = DAG()
    t1 = PythonCallable(touch_root, File('1.txt'), dag, name='first')
    t2 = PythonCallable(touch_root, File('2.txt'), dag, name='second')
    t3 = PythonCallable(touch, File('3.txt'), dag, name='third')
    t3.set_upstream(t1, group_name='group')
    t3.set_upstream(t2, group_name='group')
    dag.render()

    assert set(t3.upstream) == {'first', 'second'}

    assert set(t3._upstream_product_grouped) == {'group'}
    assert set(t3._upstream_product_grouped['group']) == {'first', 'second'}

    assert set(t3.params['upstream']) == {'group'}

    assert t3.params['upstream']['group']['first'] is t1.product
    assert t3.params['upstream']['group']['second'] is t2.product
Example 26
def test_tracebacks_are_shown_for_all_on_render_failing_tasks():
    dag = DAG()
    mock_client = Mock()
    SQLDump('SELECT * FROM {{one_table}}',
            File('one_table'),
            dag,
            name='t1',
            client=mock_client)
    SQLDump('SELECT * FROM {{another_table}}',
            File('another_table'),
            dag,
            name='t2',
            client=mock_client)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert "SQLDump: t2 -> File('another_table')" in str(excinfo.value)
    assert "SQLDump: t1 -> File('one_table')" in str(excinfo.value)
Example 27
def test_passing_upstream_and_product_in_shellscript(tmp_directory):
    dag = DAG()

    fa = Path('a.txt')
    fb = Path('b.txt')
    fc = Path('c.txt')

    ta = ShellScript(('echo a > {{product}}'), File(fa), dag, 'ta')
    tb = ShellScript(('cat {{upstream["ta"]}} > {{product}}'
                      ' && echo b >> {{product}}'), File(fb), dag, 'tb')
    tc = ShellScript(('cat {{upstream["tb"]}} > {{product}}'
                      ' && echo c >> {{product}}'), File(fc), dag, 'tc')

    ta >> tb >> tc

    dag.render()

    assert str(ta.source) == 'echo a > a.txt'
    assert str(tb.source) == 'cat a.txt > b.txt && echo b >> b.txt'
    assert str(tc.source) == 'cat b.txt > c.txt && echo c >> c.txt'
Example 28
def test_from_params():
    dag = DAG()
    group = TaskGroup.from_params(PythonCallable,
                                  File,
                                  'dir/file.txt', {'source': touch},
                                  dag,
                                  name='task_group',
                                  params_array=[{
                                      'param': 1
                                  }, {
                                      'param': 2
                                  }])

    dag.render()

    assert len(group) == 2
    assert dag['task_group0'].source.primitive is touch
    assert dag['task_group1'].source.primitive is touch
    assert str(dag['task_group0'].product) == str(Path('dir', 'file-0.txt'))
    assert str(dag['task_group1'].product) == str(Path('dir', 'file-1.txt'))
Example 29
def test_from_params_resolves_paths():
    dag = DAG()
    TaskGroup.from_params(PythonCallable,
                          File,
                          'dir/file.txt', {'source': touch},
                          dag,
                          name='task_group',
                          params_array=[{
                              'param': 1
                          }, {
                              'param': 2
                          }],
                          resolve_relative_to='')

    dag.render()

    assert Path(dag['task_group0'].product) == Path('dir',
                                                    'file-0.txt').resolve()
    assert Path(dag['task_group1'].product) == Path('dir',
                                                    'file-1.txt').resolve()
Example 30
def test_recover_from_failed_render():
    dag = DAG()
    t1 = PythonCallable(touch_root, File('file.txt'), dag)
    t2 = PythonCallable(touch, File('file2.txt'), dag)
    t1.on_render = on_render_failed
    t2.on_render = on_render_2
    t1 >> t2

    with pytest.raises(DAGRenderError):
        dag.render()

    assert t1.exec_status == TaskStatus.ErroredRender
    assert t2.exec_status == TaskStatus.AbortedRender

    t1.on_render = on_render_1

    dag.render()

    assert t1.exec_status == TaskStatus.WaitingExecution
    assert t2.exec_status == TaskStatus.WaitingUpstream
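
The on_render_* helpers referenced here and in the warnings example above are defined elsewhere in the test module. A hedged sketch consistent with the assertions (the warning messages and the ErroredRender status):

import warnings

# hypothetical hook definitions matching the assertions in these tests
def on_render_1():
    warnings.warn('This is a warning')

def on_render_2():
    warnings.warn('This is another warning')

def on_render_failed():
    # any exception raised in on_render marks the task as ErroredRender
    raise Exception('crashing on_render hook')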