def test_task_change_in_status():
    """
    Task status advances step by step while the ta -> tb -> tc chain is
    rendered and each task is built in order
    """
    pipeline = DAG('dag')

    ta = ShellScript('echo "a" > {{product}}', File('a.txt'), pipeline, 'ta')
    tb = ShellScript('cat {{upstream["ta"]}} > {{product}}', File('b.txt'),
                     pipeline, 'tb')
    tc = ShellScript('cat {{upstream["tb"]}} > {{product}}', File('c.txt'),
                     pipeline, 'tc')
    chain = (ta, tb, tc)

    # nothing has been rendered yet
    assert all(task._status == TaskStatus.WaitingRender for task in chain)

    ta >> tb >> tc
    pipeline.render()

    # only the root can run right after rendering
    assert ta._status == TaskStatus.WaitingExecution
    assert tb._status == TaskStatus.WaitingUpstream
    assert tc._status == TaskStatus.WaitingUpstream

    ta.build()

    assert ta._status == TaskStatus.Executed
    assert tb._status == TaskStatus.WaitingExecution
    assert tc._status == TaskStatus.WaitingUpstream

    tb.build()

    assert ta._status == TaskStatus.Executed
    assert tb._status == TaskStatus.Executed
    assert tc._status == TaskStatus.WaitingExecution

    tc.build()

    assert all(task._status == TaskStatus.Executed for task in chain)
def process_readme_md(folders, parent_dir='.', force=False):
    """
    Build one task per ``<parent_dir>/<folder>/README.md``, run the
    resulting DAG and post-process the product of every executed task

    Parameters
    ----------
    folders
        Folder names (relative to ``parent_dir``) that contain a README.md
    parent_dir
        Directory the folders live in
    force
        Forwarded to ``DAG.build``
    """
    dag = DAG()

    for folder in folders:
        make_task(dag, Path(parent_dir, folder, 'README.md'))

    dag.render()

    # tasks about to run must start from a clean "output" directory,
    # otherwise commands such as "ploomber build" will not run anything
    for task in dag.values():
        if task.exec_status != TaskStatus.WaitingExecution:
            continue

        output_dir = Path(str(task.product)).parent / 'output'

        if output_dir.exists():
            print(f'Deleting {output_dir}')
            shutil.rmtree(output_dir)

    print(dag.build(force=force))

    # only the notebooks that were actually executed need post-processing
    for name in dag:
        built = dag[name]

        if built.exec_status != TaskStatus.Executed:
            continue

        post_process_nb(str(built.product))
def test_sqldump_does_not_required_product_tag(tmp_directory):
    """
    A SQLDump whose SQL source has no {{product}} placeholder must render
    without errors
    """
    tmp = Path(tmp_directory)
    db_path = tmp / "database.db"

    # create a db
    conn = connect(str(db_path))
    client = SQLAlchemyClient('sqlite:///{}'.format(db_path))

    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db; the raw connection is only
    # needed for loading the data, so release it right after (an open
    # handle can keep the temporary database file locked)
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    try:
        df.to_sql('numbers', conn)
    finally:
        conn.close()

    # create the task and run it
    dag = DAG()

    # pass template SQL code so it's treated as a placeholder, this will force
    # the render step
    SQLDump('SELECT * FROM numbers LIMIT {{limit}}',
            File(out),
            dag,
            name='dump.csv',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO,
            params={'limit': 10})

    dag.render()
def test_upstream_and_me_are_added():
    """
    After rendering, params['product'] is the very same object that was
    passed to the task as its product
    """
    dag = DAG()
    expected_product = File('file.txt')
    task = PythonCallable(fn,
                          expected_product,
                          dag,
                          'callable',
                          params={'a': 1})

    dag.render()

    assert task.params['product'] is expected_product
def test_ignores_declared_product_and_upstream(tmp_directory, code):
    """
    Render a notebook task whose source is the ``code`` argument written
    to sample.py; rendering must not raise
    """
    script = Path('sample.py')
    script.write_text(code)

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    dag.render()
def test_building_a_single_task_when_rendered_upstream():
    """
    A downstream task can be built on its own once the whole DAG has been
    rendered
    """
    dag = DAG()
    upstream_task = PythonCallable(touch, File('1.txt'), dag, name=1)
    downstream_task = PythonCallable(touch_w_upstream,
                                     File('2.txt'),
                                     dag,
                                     name=2)
    upstream_task >> downstream_task

    dag.render()

    # build only the downstream task, the upstream one is rendered but
    # never built explicitly here
    downstream_task.build()
def test_upstream_and_me_are_added():
    """
    After rendering, task params hold the user-supplied values plus the
    product
    """
    # NOTE(review): another test in this file uses the same name; if both
    # live in the same module the later definition shadows the earlier
    # one - confirm
    dag = DAG()
    task = PythonCallable(fn,
                          File('file.txt'),
                          dag,
                          'callable',
                          params={'a': 1})

    dag.render()

    # compare against plain values: turn the product into its string form
    rendered = task.params.copy()
    rendered['product'] = str(rendered['product'])

    assert rendered == {'a': 1, 'product': 'file.txt'}
def test_on_render_exceptions_are_logged(executor, caplog):
    """
    A failing on_render hook makes DAG.render raise DAGRenderError and
    leaves an error entry in the log
    """
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_render = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGRenderError):
        dag.render()

    expected = 'Exception when running on_render for task "t"'
    assert expected in caplog.text
def test_on_render_hook_is_executed(tmp_directory):
    """
    The DAG-level on_render hook runs exactly once per call to DAG.render
    """
    # reset the counter kept on the hook function
    hook.count = 0

    dag = DAG()
    PythonCallable(touch_root, File('file.txt'), dag, name='t')
    dag.on_render = hook

    dag.render()

    assert hook.count == 1
def test_develop_error_if_r_notebook(tmp_sample_tasks):
    """
    develop() and debug() raise NotImplementedError for a task whose
    source is an R script
    """
    dag = DAG()
    r_task = NotebookRunner(Path('sample.R'),
                            product=File('out.ipynb'),
                            dag=dag)

    dag.render()

    for method_name in ('develop', 'debug'):
        with pytest.raises(NotImplementedError):
            getattr(r_task, method_name)()
def test_render_error_on_missing_upstream(tmp_directory, code):
    """
    Rendering the notebook in ``code`` fails with a message hinting that
    upstream dependencies were not declared
    """
    script = Path('sample.py')
    script.write_text(code)

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    message = str(excinfo.value)
    expected = ("undefined name 'upstream'. Did you forget"
                " to declare upstream dependencies?")
    assert expected in message
def test_catches_signature_errors_at_render_time():
    """
    Passing the params used here is reported at render time, both when
    rendering the task alone and when rendering the whole DAG
    """
    dag = DAG()
    task = PythonCallable(fn,
                          File('file.txt'),
                          dag,
                          'callable',
                          params={'non_param': 1})

    # task-level render
    with pytest.raises(TaskRenderError):
        task.render()

    # dag-level render
    with pytest.raises(DAGRenderError):
        dag.render()
def test_duplicated_files_one_absolute():
    """
    Two tasks pointing to the same file (one via a relative path, the
    other via its resolved absolute path) are reported as duplicates
    """
    dag = DAG()
    relative = File('a')
    absolute = File(Path('a').resolve())

    PythonCallable(touch_root, relative, dag, name='task')
    PythonCallable(touch_root, absolute, dag, name='another')

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    expected = ("Tasks must generate unique Products. "
                "The following Products appear in more than one task "
                "{File('a'): ['task', 'another']}")

    assert expected == str(excinfo.value)
def test_dag_task_status_life_cycle(executor, tmp_directory):
    """
    Follow DAG and task status through render -> build -> render.

    Status changes are propagated downstream automatically on render and
    build, but the task object that gets updated may not be the one
    declared here (this happens when a task runs in a different process).
    It is then the executor's responsibility to notify tasks on
    success/fail scenarios so downstream tasks are updated correctly, which
    is why this test is parametrized over executors.
    """
    dag = DAG(executor=executor)
    t1 = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    t2 = PythonCallable(failing_root, File('a_file.txt'), dag, name='t2')
    t3 = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    t4 = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    t5 = PythonCallable(touch_root, File('file.txt'), dag, name='t5')
    t2 >> t3 >> t4

    declared = (t1, t2, t3, t4, t5)

    def task_statuses():
        # statuses in declaration order: t1, t2, t3, t4, t5
        return [task.exec_status for task in declared]

    # before the first render
    assert dag._exec_status == DAGStatus.WaitingRender
    assert {task.exec_status
            for task in dag.values()} == {TaskStatus.WaitingRender}

    dag.render()

    assert dag._exec_status == DAGStatus.WaitingExecution
    assert task_statuses() == [
        TaskStatus.WaitingExecution,
        TaskStatus.WaitingExecution,
        TaskStatus.WaitingUpstream,
        TaskStatus.WaitingUpstream,
        TaskStatus.WaitingExecution,
    ]

    # a build error is tolerated here, the resulting statuses are checked
    # right below
    try:
        dag.build()
    except DAGBuildError:
        pass

    assert dag._exec_status == DAGStatus.Errored
    assert task_statuses() == [
        TaskStatus.Executed,
        TaskStatus.Errored,
        TaskStatus.Aborted,
        TaskStatus.Aborted,
        TaskStatus.Executed,
    ]

    # rendering again resets the statuses
    dag.render()

    assert dag._exec_status == DAGStatus.WaitingExecution
    assert task_statuses() == [
        TaskStatus.Skipped,
        TaskStatus.WaitingExecution,
        TaskStatus.WaitingUpstream,
        TaskStatus.WaitingUpstream,
        TaskStatus.Skipped,
    ]
def test_warnings_are_shown(tmp_directory):
    """
    Rendering a DAG whose tasks' on_render hooks are on_render_1 and
    on_render_2 must emit exactly one warning whose message contains both
    'This is a warning' and 'This is another warning'
    """
    # local import: pytest.warns(None) is deprecated since pytest 7 and
    # rejected by pytest 8, so warnings are recorded with the standard
    # library instead
    import warnings

    dag = DAG()
    t1 = PythonCallable(touch_root, File('file.txt'), dag)
    t2 = PythonCallable(touch, File('file2.txt'), dag)
    t1.on_render = on_render_1
    t2.on_render = on_render_2
    t1 >> t2

    with warnings.catch_warnings(record=True) as record:
        # make sure no warning is dropped by an active filter
        warnings.simplefilter('always')
        dag.render()

    assert len(record) == 1
    assert 'This is a warning' in str(record[0].message)
    assert 'This is another warning' in str(record[0].message)
def test_change_static_analysis(tmp_sample_tasks):
    """
    Turning off static_analysis on a NotebookRunner lets the DAG render
    with the params passed here
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    # static_analysis is True by default, this should fail
    notebook_task = NotebookRunner(Path('sample.ipynb'),
                                   File('out.ipynb'),
                                   dag,
                                   params={'a': 1, 'b': 2})

    # disable it
    notebook_task.static_analysis = False

    # this should work
    dag.render()
def test_dag_render_step_by_step_w_skipped(tmp_directory):
    """
    Build tasks one at a time, re-rendering the DAG after each build, and
    check the status of every task after each step
    """
    dag = DAG()

    t1 = PythonCallable(touch_root, File('t1.txt'), dag, name='t1')
    t21 = PythonCallable(touch, File('t21.txt'), dag, name='t21')
    t22 = PythonCallable(touch, File('t22.txt'), dag, name='t22')
    t3 = PythonCallable(touch, File('t3.txt'), dag, name='t3')

    # diamond: t1 -> (t21, t22) -> t3
    t1 >> t21
    t1 >> t22
    (t21 + t22) >> t3

    assert {task.exec_status
            for task in dag.values()} == {TaskStatus.WaitingRender}

    skipped = TaskStatus.Skipped
    runnable = TaskStatus.WaitingExecution
    blocked = TaskStatus.WaitingUpstream

    # (task to build, expected status of t1, t21, t22, t3 after re-render)
    steps = [
        (t1, [skipped, runnable, runnable, blocked]),
        (t21, [skipped, skipped, runnable, blocked]),
        (t22, [skipped, skipped, skipped, runnable]),
        (t3, [skipped, skipped, skipped, skipped]),
    ]

    dag.render()

    for to_build, expected in steps:
        to_build.build()
        dag.render()

        current = [task.exec_status for task in (t1, t21, t22, t3)]
        assert current == expected
def test_dag_r(tmp_directory):
    """
    A DAG with an R script must render without errors even though
    parameter extraction is not implemented for R
    """
    path = Path('sample.R')

    # the cell marker and each statement go on their own line; on a single
    # line the whole script would be one "#" comment
    path.write_text("""
# + tags=["parameters"]
a <- NULL
b <- 1
c <- c(1, 2, 3)
""")

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag, params=dict(z=1))

    # parameter extraction is not implemented but should not raise an error
    dag.render()
def test_render_error_on_syntax_error(tmp_directory):
    """
    A notebook with a syntax error makes DAG.render raise DAGRenderError
    and the message shows the offending line
    """
    path = Path('sample.py')

    # "if" must sit on its own line, after the cell marker: on a single
    # line it would be part of a comment and no syntax error would exist
    path.write_text("""
# + tags=["parameters"]
if
""")

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert 'invalid syntax\n\nif\n\n ^\n' in str(excinfo.value)
def test_dag_without_client(monkeypatch, tmp_directory):
    """
    Rendering a DAG calls fetch_remote_metadata_in_parallel once, but
    _RemoteFile._fetch_remote_metadata is never called
    """
    # spy on the parallel fetcher (wraps the real implementation)
    fetch_spy = Mock(wraps=dag_module.fetch_remote_metadata_in_parallel)
    monkeypatch.setattr(dag_module, 'fetch_remote_metadata_in_parallel',
                        fetch_spy)

    # replace the per-file remote fetch so any call can be detected
    remote_fetch = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        remote_fetch)

    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch_root, File('one'), dag=dag)

    dag.render()

    # should call it
    fetch_spy.assert_called_once_with(dag)
    # but should not call remotes
    remote_fetch.assert_not_called()
def test_forced_render(monkeypatch):
    """
    A forced render must not call Product._is_outdated, which is an
    expensive operation for products whose metadata is stored remotely
    """
    dag = DAG()
    first = PythonCallable(touch_root, File('1.txt'), dag, name=1)
    second = PythonCallable(touch, File('2.txt'), dag, name=2)
    first >> second

    def fail_if_called(self, outdated_by_code):
        # any call to File._is_outdated makes the test fail
        raise ValueError

    monkeypatch.setattr(File, '_is_outdated', fail_if_called)

    dag.render(force=True)
def test_skip_kernel_install_check(tmp_directory):
    """
    With check_if_kernel_installed=False, a NotebookRunner that names an
    unknown kernel must still render
    """
    dag = DAG()

    # the cell marker and the code go on separate lines; on a single line
    # the whole source would be one comment
    code = """
# + tags=["parameters"]
1 + 1
"""

    NotebookRunner(code,
                   product=File(Path(tmp_directory, 'out.ipynb')),
                   dag=dag,
                   kernelspec_name='unknown_kernel',
                   ext_in='py',
                   name='nb',
                   check_if_kernel_installed=False)

    dag.render()
def test_forced_render_does_not_call_is_outdated(monkeypatch):
    """
    For products whose metadata is stored remotely, checking status is an
    expensive operation. Make sure a forced render does not call
    Product._is_outdated
    """
    dag = DAG()
    root = PythonCallable(touch_root, File('1.txt'), dag, name=1)
    child = PythonCallable(touch, File('2.txt'), dag, name=2)
    root >> child

    def raise_on_call(self, outdated_by_code):
        # any call to File._is_outdated makes the test fail
        raise ValueError(f'Called _is_outdated on {self}')

    monkeypatch.setattr(File, '_is_outdated', raise_on_call)

    dag.render(force=True)
def test_render_error_on_undefined_name_error(tmp_directory):
    """
    A notebook that uses a name that is never defined makes DAG.render
    raise DAGRenderError mentioning the undefined name
    """
    path = Path('sample.py')

    # df.head() must be actual code in its own cell: on a single line it
    # would be part of a comment and no undefined name would exist
    path.write_text("""
# + tags=["parameters"]

# +
df.head()
""")

    dag = DAG()
    NotebookRunner(path, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert "undefined name 'df'" in str(excinfo.value)
def test_task_grouping():
    """
    Upstream tasks registered under the same group_name show up nested
    under that group in the downstream task's upstream params
    """
    dag = DAG()
    first = PythonCallable(touch_root, File('1.txt'), dag, name='first')
    second = PythonCallable(touch_root, File('2.txt'), dag, name='second')
    third = PythonCallable(touch, File('3.txt'), dag, name='third')

    for upstream_task in (first, second):
        third.set_upstream(upstream_task, group_name='group')

    dag.render()

    # plain upstream view is keyed by task name
    assert set(third.upstream) == {'first', 'second'}

    # grouped view is keyed by group name, then by task name
    grouped = third._upstream_product_grouped
    assert set(grouped) == {'group'}
    assert set(grouped['group']) == {'first', 'second'}

    # rendered params expose the grouped products
    upstream_param = third.params['upstream']
    assert set(upstream_param) == {'group'}
    assert upstream_param['group']['first'] is first.product
    assert upstream_param['group']['second'] is second.product
def test_tracebacks_are_shown_for_all_on_render_failing_tasks():
    """
    When several tasks fail to render, the DAGRenderError message
    mentions every one of them
    """
    dag = DAG()
    client = Mock()

    # both sources use a placeholder for which no value is passed
    SQLDump('SELECT * FROM {{one_table}}',
            File('one_table'),
            dag,
            name='t1',
            client=client)
    SQLDump('SELECT * FROM {{another_table}}',
            File('another_table'),
            dag,
            name='t2',
            client=client)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    message = str(excinfo.value)
    assert "SQLDump: t2 -> File('another_table')" in message
    assert "SQLDump: t1 -> File('one_table')" in message
def test_passing_upstream_and_product_in_shellscript(tmp_directory):
    """
    {{product}} and {{upstream[...]}} placeholders in ShellScript sources
    are replaced by the corresponding file paths when the DAG is rendered
    """
    dag = DAG()

    path_a, path_b, path_c = Path('a.txt'), Path('b.txt'), Path('c.txt')

    source_a = 'echo a > {{product}}'
    source_b = ('cat {{upstream["ta"]}} > {{product}}'
                ' && echo b >> {{product}}')
    source_c = ('cat {{upstream["tb"]}} > {{product}}'
                ' && echo c >> {{product}}')

    ta = ShellScript(source_a, File(path_a), dag, 'ta')
    tb = ShellScript(source_b, File(path_b), dag, 'tb')
    tc = ShellScript(source_c, File(path_c), dag, 'tc')

    ta >> tb >> tc

    dag.render()

    assert str(ta.source) == 'echo a > a.txt'
    assert str(tb.source) == 'cat a.txt > b.txt && echo b >> b.txt'
    assert str(tc.source) == 'cat b.txt > c.txt && echo c >> c.txt'
def test_from_params():
    """
    TaskGroup.from_params creates one task per entry in params_array,
    suffixing task names and product file names with the entry index
    """
    dag = DAG()
    params_array = [{'param': 1}, {'param': 2}]

    group = TaskGroup.from_params(PythonCallable,
                                  File,
                                  'dir/file.txt',
                                  {'source': touch},
                                  dag,
                                  name='task_group',
                                  params_array=params_array)

    dag.render()

    assert len(group) == 2

    # every task shares the same source
    assert dag['task_group0'].source.primitive is touch
    assert dag['task_group1'].source.primitive is touch

    # products are numbered
    expected_first = str(Path('dir', 'file-0.txt'))
    expected_second = str(Path('dir', 'file-1.txt'))
    assert str(dag['task_group0'].product) == expected_first
    assert str(dag['task_group1'].product) == expected_second
def test_from_params_resolves_paths():
    """
    With resolve_relative_to='' the products created by
    TaskGroup.from_params are absolute (resolved) paths
    """
    dag = DAG()
    params_array = [{'param': 1}, {'param': 2}]

    TaskGroup.from_params(PythonCallable,
                          File,
                          'dir/file.txt',
                          {'source': touch},
                          dag,
                          name='task_group',
                          params_array=params_array,
                          resolve_relative_to='')

    dag.render()

    first_product = Path(dag['task_group0'].product)
    assert first_product == Path('dir', 'file-0.txt').resolve()

    second_product = Path(dag['task_group1'].product)
    assert second_product == Path('dir', 'file-1.txt').resolve()
def test_recover_from_failed_render():
    """
    After a failed render, replacing the failing on_render hook and
    rendering again brings the tasks back to their regular statuses
    """
    dag = DAG()
    root = PythonCallable(touch_root, File('file.txt'), dag)
    child = PythonCallable(touch, File('file2.txt'), dag)
    root.on_render = on_render_failed
    child.on_render = on_render_2
    root >> child

    # first attempt: the root hook fails, the downstream task is aborted
    with pytest.raises(DAGRenderError):
        dag.render()

    assert root.exec_status == TaskStatus.ErroredRender
    assert child.exec_status == TaskStatus.AbortedRender

    # second attempt, with a different hook on the root task
    root.on_render = on_render_1
    dag.render()

    assert root.exec_status == TaskStatus.WaitingExecution
    assert child.exec_status == TaskStatus.WaitingUpstream