def test_metadata_is_synced_when_executing_in_subprocess(tmp_directory):
    """Product metadata is populated here after a subprocess build.

    The DAG uses ``Serial(build_in_subprocess=True)``; once the build
    finishes, the task's product metadata must hold data in this process.
    """
    executor = Serial(build_in_subprocess=True)
    dag = DAG(executor=executor)
    task = PythonCallable(touch_root, File('file.txt'), dag)

    dag.build()

    metadata = task.product.metadata
    assert metadata._data is not None
def test_from_params_resolves_paths_in_metaproduct(tmp_directory):
    """Each task in the group gets its own resolved path per product key.

    Two tasks are generated from ``params_array``; their products are
    expected at ``one-<i>.txt`` and ``another-<i>.txt``.
    """
    def touch(product, param):
        Path(product['one']).touch()
        Path(product['another']).touch()

    dag = DAG(executor=Serial(build_in_subprocess=False))
    product_spec = {'one': 'one.txt', 'another': 'another.txt'}
    grid = [{'param': 1}, {'param': 2}]

    TaskGroup.from_params(PythonCallable,
                          File,
                          product_spec, {'source': touch},
                          dag,
                          name='task_group',
                          params_array=grid,
                          resolve_relative_to='')

    # on windows, paths do not resolve if the file doesn't exist, so the
    # pipeline is executed first to make sure they do
    dag.build()

    for index in (0, 1):
        task_name = 'task_group{}'.format(index)
        for key in ('one', 'another'):
            expected = Path('{}-{}.txt'.format(key, index)).resolve()
            assert Path(dag[task_name].product[key]).resolve() == expected
def make_training():
    """Build and return the training DAG

    The DAG has a "get" task that produces the training data, the feature
    tasks registered by ``add_features`` and a "fit" task that runs after
    the task named "join".
    """
    # build_in_subprocess=False: Python does not like multiprocessing with
    # functions defined in the main module (it works when they live in a
    # different module)
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    out_dir = Path('output')

    # "get" task: returns the training data
    PythonCallable(get,
                   File(out_dir / 'get.csv'),
                   dag,
                   serializer=serializer,
                   unserializer=unserializer)

    # feature tasks
    add_features(dag)

    # "fit" task: trains the model
    model_task = PythonCallable(fit, File(out_dir / 'model.pickle'), dag)

    # training happens once features have been joined
    dag['join'] >> model_task

    return dag
def test_lazy_load_dag_level_client(tmp_directory, tmp_imports,
                                    my_testing_module, client_spec):
    """A DAG-level ``File`` client works when the spec is lazily imported.

    ``my_testing_module`` must not be imported when the DAG is created,
    only when it is built; the output must then exist under ``backup``.
    """
    task_spec = {'source': 'my_testing_module.task', 'product': 'output.csv'}
    data = {'tasks': [task_spec], 'clients': {'File': client_spec}}

    dag = DAGSpec(data, lazy_import=True).to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # lazy_import=True: creating the dag must not import the module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # building the dag imports it
    assert 'my_testing_module' in sys.modules
    assert Path('backup', 'output.csv').exists()
def test_lazy_load_product_level_client(tmp_directory, tmp_imports,
                                        my_testing_module, client_spec):
    """Task/product-level clients work when the spec is lazily imported.

    ``my_testing_module`` must not be imported when the DAG is created,
    only when it is built.
    """
    Path('script.sql').write_text(
        """ CREATE TABLE {{product}} AS SELECT * FROM my_table """)

    # seed the database with the table the script selects from
    with sqlite3.connect('my.db') as connection:
        frame = pd.DataFrame({'x': range(5)})
        frame.to_sql('my_table', connection)

    task_spec = {
        'source': 'script.sql',
        'product': [None, 'name', 'table'],
        'client': client_spec,
        'product_client': client_spec,
        'product_class': 'GenericSQLRelation',
    }

    dag = DAGSpec({'tasks': [task_spec]}, lazy_import=True).to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # lazy_import=True: creating the dag must not import the module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # building the dag imports it
    assert 'my_testing_module' in sys.modules
def test_grid_and_upstream_wildcard_callables(spec_raw, tmp_directory,
                                              add_current_to_sys_path,
                                              no_sys_modules_cache):
    """A wildcard upstream ("upstream-*") groups the matching grid tasks.

    After building, the DAG holds the two upstream tasks plus "downstream",
    and the downstream task receives both upstream products under the
    wildcard key.
    """
    # NOTE(review): line breaks inside this literal were reconstructed from
    # a whitespace-collapsed source - confirm against the original file
    Path('sample_source_callables.py').write_text("""
from pathlib import Path

def unserializer(product):
    return Path(product).read_text()

def upstream(product, param):
    Path(product).touch()

def downstream(product, upstream):
    up = upstream['upstream-*']
    one = up['upstream-0']
    another = up['upstream-1']
    Path(product).touch()
""")

    dag = DAGSpec(spec_raw).to_dag().render()
    # in-process execution builds faster
    dag.executor = Serial(build_in_subprocess=False)

    # building also checks that unserializing works
    dag.build()

    upstream_names = {'upstream-1', 'upstream-0'}
    assert set(dag) == {'upstream-1', 'upstream-0', 'downstream'}
    grouped = dag['downstream'].params['upstream']['upstream-*']
    assert set(grouped) == upstream_names
def dag():
    """Return a two-task DAG ("root" >> "task") with a local File client.

    The ``File`` client is a ``LocalStorageClient`` pointing at 'remote'.
    """
    new_dag = DAG(executor=Serial(build_in_subprocess=False))
    new_dag.clients[File] = LocalStorageClient('remote',
                                               path_to_project_root='.')

    upstream = PythonCallable(_touch, File('root'), dag=new_dag, name='root')
    downstream = PythonCallable(_touch_upstream,
                                File('file'),
                                dag=new_dag,
                                name='task')
    upstream >> downstream

    return new_dag
def make_dag(env, params):
    """Return a DAG with a dump task followed by an upload task.

    ``SQLUpload`` and ``SQLiteRelation`` each get their own
    ``SQLAlchemyClient`` built from ``env.db_uri``. ``params`` is accepted
    but not used here.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # one client instance per class, same URI for both
    for client_key in (SQLUpload, SQLiteRelation):
        dag.clients[client_key] = SQLAlchemyClient(env.db_uri)

    dump_task = make_task_dump(dag)
    upload_task = make_task_upload(dag)
    dump_task >> upload_task

    return dag
def test_creates_parent_dirs(tmp_directory):
    """Building succeeds when the product lives in a missing nested folder.

    The product path ('some/nested/product.txt') points into directories
    that are not created by the test, so the build itself must handle them.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch, File('some/nested/product.txt'), dag=dag)

    # a failed build raises, which is what makes this test fail; nothing is
    # returned because pytest flags test functions that return a value
    dag.build()
def test_dag_on_render_with_params(tmp_directory, tmp_imports,
                                   write_dag_hooks_spec):
    """Rendering the DAG leaves 'on render' in the 'hook' file."""
    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    dag.render()

    hook_output = Path('hook').read_text()
    assert hook_output == 'on render'
def make():
    """Return a DAG with one task ("first") taking a plain object as param.

    ``some_param`` is a fresh ``object()`` instance on every call.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    task_params = {'some_param': object()}
    PythonCallable(touch_root_w_param,
                   File('1.txt'),
                   dag,
                   name='first',
                   params=task_params)

    return dag
def _make_dag_with_upstream():
    """Return a three-task DAG with a local File client.

    Tasks are 'root', 2 and 3; only 'root' >> 3 are connected.
    """
    # same-process execution, so that mock objects get called
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root_task = PythonCallable(_touch, File('1.txt'), dag=dag, name='root')
    PythonCallable(_touch, File('2.txt'), dag=dag, name=2)
    last_task = PythonCallable(_touch_upstream,
                               File('3.txt'),
                               dag=dag,
                               name=3)
    root_task >> last_task

    return dag
def make():
    """Return a DAG with a single task that declares a file resource.

    The task receives ``resources_`` with ``file`` set to 'resource.txt'.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    task_params = {'resources_': {'file': 'resource.txt'}}
    PythonCallable(task_with_resource,
                   File('output'),
                   dag,
                   params=task_params)

    return dag
def test_build_partially_with_wildcard(tmp_directory):
    """``build_partially('a-*')`` builds only the tasks matching 'a-*'."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # task name -> product file, in declaration order
    for task_name, filename in (('a-1', 'a-1.txt'), ('a-2', 'a-2.txt'),
                                ('b', 'b.txt')):
        PythonCallable(touch_root, File(filename), dag, name=task_name)

    dag.build_partially('a-*')

    # matched tasks produced their files...
    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    # ...the unmatched one did not
    assert not Path('b.txt').exists()
def test_pipeline(test_env, force):
    """Build the pipeline created from ``test_env`` with a debug executor."""
    # the test runs against a sample of the data
    dag = pipeline._make(test_env)

    # executor customized for testing: with the default settings the
    # debugger does not start on the line that raised the exception, with
    # these it does. Try raising an exception in any PythonCallable task and
    # run pytest --pdb to see it in action
    debug_executor = Serial(build_in_subprocess=False, catch_exceptions=False)
    dag.executor = debug_executor

    dag.build(force=force)
def test_creates_parent_dirs_meta_product(tmp_directory):
    """Building succeeds when metaproduct files live in missing folders.

    Both product paths point into nested directories that are not created
    by the test, so the build itself must handle them.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch_meta, {
        'one': File('some/nested/product.txt'),
        'another': File('some/another/product.txt')
    },
                   dag=dag)

    # a failed build raises, which is what makes this test fail; nothing is
    # returned because pytest flags test functions that return a value
    dag.build()
def test_keeps_folder_layout(tmp_directory):
    """Files under 'backup' mirror the layout of the local products.

    Checks both products and their ``.<name>.metadata`` files, at the top
    level and inside the nested 'dir' folder.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('backup', path_to_project_root='.')

    Path('dir').mkdir()
    PythonCallable(_touch, File('file'), dag, name='task')
    PythonCallable(_touch, File('dir/nested'), dag, name='nested')

    dag.build()

    backup = 'backup'
    assert Path(backup, 'dir', 'nested').is_file()
    assert Path(backup, 'dir', '.nested.metadata').is_file()
    assert Path(backup, 'file').is_file()
    assert Path(backup, '.file.metadata').is_file()
def _make_dag_with_two_upstream():
    """Return a DAG where "task" depends on both "root" and "another".

    A ``LocalStorageClient`` pointing at 'remote' is the ``File`` client.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    first = PythonCallable(_touch, File('root'), dag=dag, name='root')
    second = PythonCallable(_touch, File('another'), dag=dag, name='another')
    downstream = PythonCallable(_touch_upstream,
                                File('file.txt'),
                                dag=dag,
                                name='task')

    # both tasks are upstream of the last one
    (first + second) >> downstream

    return dag
def test_warnings_are_shown(tmp_directory):
    """Warnings raised by tasks are reported as a single warning.

    Two chained tasks each emit a warning; the build must surface exactly
    one warning whose message contains both texts.
    """
    # local import keeps this block self-contained
    import warnings

    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    t2 = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    t1 >> t2

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8, so warnings are recorded with the standard library instead
    with warnings.catch_warnings(record=True) as record:
        # "always" makes sure nothing is dropped by an active filter
        warnings.simplefilter('always')
        dag.build()

    assert len(record) == 1
    assert 'This is a warning' in str(record[0].message)
    assert 'This is another warning' in str(record[0].message)
def test_unserializes_upstream_metaproduct(tmp_directory):
    """A DAG-level unserializer handles an upstream task with a metaproduct.

    The test passes if the build completes without raising.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.unserializer = metaproduct_unserializer

    meta_product = {'one': File('one'), 'another': File('another')}
    producer = PythonCallable(touch_meta, meta_product, dag=dag, name='first')
    consumer = PythonCallable(touch_with_first_as_upstream,
                              File('last'),
                              dag=dag)
    producer >> consumer

    dag.build()
def make_larger_dag_with_client():
    """Return a three-task chain ("root" >> "task" >> "another").

    Products live under 'out/' and the ``File`` client is a
    ``LocalStorageClient`` pointing at 'remote'.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    first = PythonCallable(touch_root, File('out/root'), dag=dag, name='root')
    middle = PythonCallable(touch, File('out/file'), dag=dag, name='task')
    last = PythonCallable(touch,
                          File('out/another'),
                          dag=dag,
                          name='another')
    first >> middle >> last

    return dag
def test_build_partially_with_wildcard_skip_upstream(tmp_directory):
    """With ``skip_upstream=True`` only the 'a-*' tasks are built.

    'root' is upstream of 'a-1' but must not be executed; 'b' does not
    match the wildcard and must not be executed either.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))

    upstream = PythonCallable(touch_root, File('root.txt'), dag, name='root')
    matched = PythonCallable(touch, File('a-1.txt'), dag, name='a-1')
    upstream >> matched
    PythonCallable(touch_root, File('a-2.txt'), dag, name='a-2')
    PythonCallable(touch_root, File('b.txt'), dag, name='b')

    dag.build_partially('a-*', skip_upstream=True)

    # upstream was skipped
    assert not Path('root.txt').exists()
    # matching tasks were built
    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    # non-matching task was left alone
    assert not Path('b.txt').exists()
def make_dag_with_client_and_metaproduct():
    """Return a DAG whose "root" task has a metaproduct, followed by "task".

    The ``File`` client is a ``LocalStorageClient`` pointing at 'remote'.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root_products = {
        'root': File('out/root'),
        'another': File('out/another'),
    }
    upstream = PythonCallable(touch_root_with_metaproduct,
                              root_products,
                              dag=dag,
                              name='root')
    downstream = PythonCallable(touch, File('file'), dag=dag, name='task')
    upstream >> downstream

    return dag
def test_dag_on_failure_with_params(tmp_directory, tmp_imports,
                                    write_dag_hooks_spec):
    """A failing build raises ``DAGBuildError`` and writes 'on failure'.

    ``my_module.touch`` is overwritten with a function that always raises.
    """
    # NOTE(review): line breaks inside this literal were reconstructed from
    # a whitespace-collapsed source - confirm against the original file
    Path('my_module.py').write_text("""
def touch(product):
    raise Exception
""")

    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    with pytest.raises(DAGBuildError):
        dag.build()

    hook_output = Path('hook').read_text()
    assert hook_output == 'on failure'
def test_change_static_analysis(tmp_sample_tasks):
    """Rendering works once ``static_analysis`` is switched off on the task."""
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    # static_analysis is True by default, so this should fail
    notebook_task = NotebookRunner(Path('sample.ipynb'),
                                   File('out.ipynb'),
                                   dag,
                                   params={'a': 1, 'b': 2})

    # turn it off
    notebook_task.static_analysis = False

    # now rendering should work
    dag.render()
def test_warns_if_export_args_but_ipynb_output(tmp_sample_tasks):
    """A ``UserWarning`` is raised if export kwargs target an .ipynb output.

    At least one recorded warning must mention that 'out.ipynb' is a
    notebook file.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    export_kwargs = {'exclude_input': True}
    NotebookRunner(Path('sample.ipynb'),
                   File('out.ipynb'),
                   dag,
                   nbconvert_export_kwargs=export_kwargs)

    with pytest.warns(UserWarning) as records:
        dag.build()

    # NOTE: not sure why two records are sometimes displayed, maybe another
    # library is throwing the warning
    expected = "Output 'out.ipynb' is a notebook file"
    assert any(expected in warning.message.args[0] for warning in records)
def _make_dag_with_metaproduct(with_client=True):
    """Return a DAG where "root" feeds "task", which has a metaproduct.

    Parameters
    ----------
    with_client : bool
        If true, a ``LocalStorageClient`` pointing at 'remote' is set as
        the ``File`` client; otherwise no client is configured.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    if with_client:
        dag.clients[File] = LocalStorageClient('remote',
                                               path_to_project_root='.')

    upstream = PythonCallable(_touch, File('root'), dag=dag, name='root')
    task_products = {
        'one': File('file.txt'),
        'another': File('another.txt'),
    }
    downstream = PythonCallable(_touch_upstream,
                                task_products,
                                dag=dag,
                                name='task')
    upstream >> downstream

    return dag
def _make():
    """Return a two-task DAG configured to build in a subprocess.

    "task" has a metaproduct ('one', 'two') and is upstream of "another";
    the ``File`` client is a ``LocalStorageClient`` pointing at 'backup'.
    """
    executor = Serial(build_in_subprocess=True)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('backup', path_to_project_root='.')

    many_products = {'one': File('one'), 'two': File('two')}
    producer = PythonCallable(_touch_many, many_products, dag, name='task')
    consumer = PythonCallable(_touch_upstream,
                              File('three'),
                              dag,
                              name='another')
    producer >> consumer

    return dag
def test_dag_without_client(monkeypatch, tmp_directory):
    """Without a client, rendering never fetches remote metadata.

    ``fetch_remote_metadata_in_parallel`` is still called once with the
    DAG, but ``_RemoteFile._fetch_remote_metadata`` must not be called.
    """
    fetch_spy = Mock(wraps=dag_module.fetch_remote_metadata_in_parallel)
    monkeypatch.setattr(dag_module, 'fetch_remote_metadata_in_parallel',
                        fetch_spy)
    remote_spy = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        remote_spy)

    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    PythonCallable(touch_root, File('one'), dag=dag)

    dag.render()

    # the parallel fetch entry point is invoked...
    fetch_spy.assert_called_once_with(dag)
    # ...but no remote is contacted
    remote_spy.assert_not_called()
def test_attempts_to_download_on_each_build(tmp_directory, monkeypatch):
    """Every ``dag.build()`` call triggers one ``File.download`` call."""
    # same-process execution, otherwise there is no way to know whether the
    # mock object was called
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    product = File('file.txt')
    PythonCallable(_touch, product, dag=dag)

    download_spy = Mock(wraps=product.download)
    monkeypatch.setattr(File, 'download', download_spy)

    # download happens on each dag.render() call, which dag.build() makes...
    dag.build()
    assert product.download.call_count == 1

    # on the second build it must try again, since the remote files could
    # have been modified in the meantime
    dag.build()
    assert product.download.call_count == 2