def test_error_invalid_dag_level_client_dotted_path(tmp_sample_tasks,
                                                    add_current_to_sys_path,
                                                    no_sys_modules_cache, code,
                                                    expected_error):
    """An invalid dag-level client dotted path must raise a TypeError."""
    # write the (invalid) client module provided by the parametrized fixture
    Path('dag_level_client_dotted_path.py').write_text(code)

    raw_spec = {
        'meta': {
            'extract_product': False,
            'extract_upstream': True,
        },
        'tasks': [
            {
                'source': 'sample.sql',
                'product': ['name', 'table']
            },
        ],
        'clients': {
            'SQLScript': 'dag_level_client_dotted_path.get'
        }
    }
    spec = DAGSpec(raw_spec)

    with pytest.raises(TypeError) as excinfo:
        spec.to_dag()

    assert expected_error in str(excinfo.value)
def test_spec_with_functions(lazy_import, backup_spec_with_functions,
                             add_current_to_sys_path):
    """
    Check we can create pipeline where the task is a function defined in a
    local file
    """
    DAGSpec('pipeline.yaml', lazy_import=lazy_import).to_dag().build()
def test_import_tasks_from_with_non_empty_env(tmp_nbs):
    """Imported tasks must have env placeholders resolved in their params."""
    extra_tasks = [{
        'source': 'extra_task.py',
        'name': 'extra_task',
        'product': 'extra.ipynb',
        'params': {
            'some_param': '{{some_param}}'
        }
    }]
    Path('some_tasks.yaml').write_text(yaml.dump(extra_tasks))
    # minimal script with a parameters cell so it can be loaded as a task
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_data = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_data['meta']['import_tasks_from'] = 'some_tasks.yaml'

    spec = DAGSpec(spec_data, env={'some_param': 'some_value'})
    dag = spec.to_dag()
    dag.render()

    # the placeholder must be replaced with the env value
    assert dag['extra_task'].params['some_param'] == 'some_value'
    # the imported task source must appear (resolved) in the spec
    sources = [str(t['source']) for t in spec['tasks']]
    assert str(Path('extra_task.py').resolve()) in sources
def test_import_tasks_from_loads_relative_to_pipeline_spec(tmp_nbs):
    """import_tasks_from paths resolve relative to pipeline.yaml's location."""
    extra_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    Path('some_tasks.yaml').write_text(yaml.dump(extra_tasks))
    # minimal script with a parameters cell so it can be loaded as a task
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_data = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_data['meta']['import_tasks_from'] = 'some_tasks.yaml'
    Path('pipeline.yaml').write_text(yaml.dump(spec_data))

    # move to another dir to make sure we can still load the spec
    Path('subdir').mkdir()
    os.chdir('subdir')

    spec = DAGSpec('../pipeline.yaml')
    dag = spec.to_dag()
    dag.render()

    # import_tasks_from must be stored as an absolute path
    expected_yaml = str(Path('..', 'some_tasks.yaml').resolve())
    assert spec['meta']['import_tasks_from'] == expected_yaml

    sources = [str(t['source']) for t in spec['tasks']]
    assert str(Path('..', 'extra_task.py').resolve()) in sources
def test_grid_and_upstream_wildcard_callables(spec_raw, tmp_directory,
                                              add_current_to_sys_path,
                                              no_sys_modules_cache):
    """Grid tasks + wildcard upstream references must work with callables."""
    Path('sample_source_callables.py').write_text("""
from pathlib import Path

def unserializer(product):
    return Path(product).read_text()

def upstream(product, param):
    Path(product).touch()

def downstream(product, upstream):
    up = upstream['upstream-*']
    one = up['upstream-0']
    another = up['upstream-1']
    Path(product).touch()
""")

    dag = DAGSpec(spec_raw).to_dag().render()
    # to build faster
    dag.executor = Serial(build_in_subprocess=False)
    # make sure unserializing works correctly
    dag.build()

    assert set(dag) == {'upstream-1', 'upstream-0', 'downstream'}
    wildcard_params = dag['downstream'].params['upstream']['upstream-*']
    assert set(wildcard_params) == {'upstream-1', 'upstream-0'}
def test_python_callables_with_extract_upstream(tmp_directory):
    """extract_upstream must infer dependencies from callable signatures."""
    raw_spec = {
        'tasks': [
            {
                'source': 'test_pkg.callables.root',
                'product': 'root.csv'
            },
            {
                'source': 'test_pkg.callables.a',
                'product': 'a.csv'
            },
            {
                'source': 'test_pkg.callables.b',
                'product': 'b.csv'
            },
        ],
        'meta': {
            'extract_product': False,
            'extract_upstream': True
        }
    }
    dag = DAGSpec(raw_spec).to_dag()
    dag.build()

    assert set(dag) == {'a', 'b', 'root'}
    # root has no dependencies; a and b both depend on it
    assert not dag['root'].upstream
    assert set(dag['a'].upstream) == {'root'}
    assert set(dag['b'].upstream) == {'root'}
def test_import_tasks_from_paths_are_relative_to_the_yaml_spec(
        tmp_nbs, tmp_path):
    """Task sources resolve relative to the imported tasks yaml, not cwd."""
    tasks_yaml = tmp_path / 'some_tasks.yaml'

    # source is a relative path
    extra_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    tasks_yaml.write_text(yaml.dump(extra_tasks))

    # write the source code in the same folder as some_tasks.yaml
    Path(tmp_path, 'extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_data = yaml.safe_load(Path('pipeline.yaml').read_text())
    # set an absolute path
    spec_data['meta']['import_tasks_from'] = str(tasks_yaml.resolve())
    Path('pipeline.yaml').write_text(yaml.dump(spec_data))

    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.render()

    # paths must be interpreted as relative to tasks.yaml, not to the
    # current working directory
    sources = [str(t['source']) for t in spec['tasks']]
    assert str(Path(tmp_path, 'extra_task.py').resolve()) in sources
def test_sets_clients(tmp_sample_tasks, add_current_to_sys_path,
                      no_sys_modules_cache, dotted_path_spec):
    """A dag-level client given as a dotted path must be set on the dag."""
    Path('test_sets_clients.py').write_text("""
from unittest.mock import Mock

def get(a=None):
    return Mock()
""")

    raw_spec = {
        'meta': {
            'extract_product': False,
            'extract_upstream': True,
        },
        'tasks': [
            {
                'source': 'sample.sql',
                'product': ['name', 'table']
            },
        ],
        'clients': {
            'SQLScript': dotted_path_spec
        }
    }
    dag = DAGSpec(raw_spec).to_dag()

    assert isinstance(dag.clients[SQLScript], Mock)
def test_lazy_load_product_level_client(tmp_directory, tmp_imports,
                                        my_testing_module, client_spec):
    """With lazy_import, product clients load at build time, not dag creation."""
    Path('script.sql').write_text("""
CREATE TABLE {{product}} AS SELECT * FROM my_table
""")

    # seed a sqlite database for the script to read from
    with sqlite3.connect('my.db') as conn:
        pd.DataFrame({'x': range(5)}).to_sql('my_table', conn)

    raw_spec = {
        'tasks': [
            {
                'source': 'script.sql',
                'product': [None, 'name', 'table'],
                'client': client_spec,
                'product_client': client_spec,
                'product_class': 'GenericSQLRelation',
            },
        ]
    }
    dag = DAGSpec(raw_spec, lazy_import=True).to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # since lazy_load=True, creating the dag should not import
    # my_testing_module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # should be imported now
    assert 'my_testing_module' in sys.modules
def test_lazy_load_dag_level_client(tmp_directory, tmp_imports,
                                    my_testing_module, client_spec):
    """With lazy_import, dag-level clients load at build time, not creation."""
    raw_spec = {
        'tasks': [
            {
                'source': 'my_testing_module.task',
                'product': 'output.csv'
            },
        ],
        'clients': {
            'File': client_spec
        },
    }
    dag = DAGSpec(raw_spec, lazy_import=True).to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # since lazy_load=True, creating the dag should not import
    # my_testing_module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # should be imported now
    assert 'my_testing_module' in sys.modules
    # the File client must have uploaded the product
    assert Path('backup', 'output.csv').exists()
def test_lazy_load(tmp_directory, tmp_imports):
    """With lazy_import, unresolvable dotted paths must not break to_dag()."""
    Path('my_module.py').write_text("""
def fn():
    pass
""")

    # every hook/serializer points to a module that does not exist; with
    # lazy_import=True, to_dag() must still succeed because nothing is
    # imported eagerly
    missing = 'not_a_module.not_a_function'
    raw_spec = {
        'tasks': [
            {
                'source': 'my_module.fn',
                'product': 'report.ipynb',
                'on_finish': missing,
                'on_render': missing,
                'on_failure': missing,
                'serializer': missing,
                'unserializer': missing,
                'product_client': missing
            },
        ],
        'serializer': missing,
        'unserializer': missing,
    }
    spec = DAGSpec(raw_spec, lazy_import=True)

    assert spec.to_dag()
def test_loads_serializer_and_unserializer(backup_online,
                                           add_current_to_sys_path):
    """Dag-level serializer/unserializer must be set on every task."""
    raw_spec = {
        'tasks': [{
            'source': 'online_tasks.get',
            'product': 'output/get.parquet',
        }, {
            'source': 'online_tasks.square',
            'product': 'output/square.parquet',
        }],
        'meta': {
            'extract_product': False
        },
        'serializer': 'online_io.serialize',
        'unserializer': 'online_io.unserialize',
    }
    dag = DAGSpec(raw_spec).to_dag()

    from online_io import serialize, unserialize

    # both tasks must share the exact same (un)serializer objects
    for name in ('get', 'square'):
        assert dag[name]._serializer is serialize
        assert dag[name]._unserializer is unserialize
def test_error_if_location_returns_none(tmp_directory,
                                        add_current_to_sys_path,
                                        no_sys_modules_cache):
    """A 'location' dotted path that returns None must raise a TypeError."""
    Path('test_error_if_location_is_not_a_callable.py').write_text("""
def make_dag():
    return None
""")

    spec = DAGSpec(
        {'location': 'test_error_if_location_is_not_a_callable.make_dag'})

    with pytest.raises(TypeError) as excinfo:
        spec.to_dag()

    expected = ("Error calling dotted path 'test_error_if_location_is_"
                "not_a_callable.make_dag'. Expected a value but got None")
    assert str(excinfo.value) == expected
def test_import_tasks_from(tmp_nbs):
    """Tasks listed in import_tasks_from must be added to the spec."""
    extra_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    Path('some_tasks.yaml').write_text(yaml.dump(extra_tasks))
    # minimal script with a parameters cell so it can be loaded as a task
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_data = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_data['meta']['import_tasks_from'] = 'some_tasks.yaml'

    spec = DAGSpec(spec_data)
    spec.to_dag().render()

    sources = [str(t['source']) for t in spec['tasks']]
    assert str(Path('extra_task.py').resolve()) in sources
def test_error_if_location_is_not_a_callable(tmp_directory,
                                             add_current_to_sys_path,
                                             no_sys_modules_cache):
    """A 'location' dotted path resolving to a non-callable must raise."""
    # make_dag is an int, not a function
    Path('test_error_if_location_is_not_a_callable.py').write_text("""
make_dag = 1
""")

    spec = DAGSpec(
        {'location': 'test_error_if_location_is_not_a_callable.make_dag'})

    with pytest.raises(TypeError) as excinfo:
        spec.to_dag()

    expected = ("Error loading dotted path 'test_error_if_"
                "location_is_not_a_callable.make_dag'. Expected a "
                "callable object (i.e., some kind of function). Got 1 "
                "(an object of type: int)")
    assert str(excinfo.value) == expected
def load_entry_point(entry_point):
    """Load a DAG spec from an entry point (directory or spec file).

    Returns a (spec, dag, path) tuple where path is the directory that
    contains the pipeline. Raises NotImplementedError for unsupported
    entry point types.
    """
    ep_type = find_entry_point_type(entry_point)

    if ep_type == EntryPoint.Directory:
        spec = DAGSpec.from_directory(entry_point)
        path = Path(entry_point)
    elif ep_type == EntryPoint.File:
        spec = DAGSpec(entry_point)
        # for a spec file, the relevant directory is its parent
        path = Path(entry_point).parent
    else:
        raise NotImplementedError(
            f'loading entry point type {ep_type!r} is unsupported')

    return spec, spec.to_dag(), path
def test_grid_and_upstream_wildcard_scripts(spec, tmp_directory):
    """Grid tasks + wildcard upstream references must work with scripts."""
    Path('upstream.py').write_text("""
# + tags=['parameters']
upstream = None
""")

    Path('downstream.py').write_text("""
# + tags=['parameters']
upstream = ['upstream-*']
""")

    dag = DAGSpec(spec).to_dag().render()

    assert set(dag) == {'upstream-1', 'upstream-0', 'downstream'}
    wildcard_params = dag['downstream'].params['upstream']['upstream-*']
    assert set(wildcard_params) == {'upstream-1', 'upstream-0'}
def test_python_callables_spec(tmp_directory, add_current_to_sys_path):
    """A task sourced from a dotted path must become a PythonCallable."""
    Path('test_python_callables_spec.py').write_text("""
def task1(product):
    pass
""")

    raw_spec = {
        'tasks': [
            {
                'source': 'test_python_callables_spec.task1',
                'product': 'some_file.csv'
            },
        ],
        'meta': {
            'extract_product': False,
            'extract_upstream': False
        }
    }
    dag = DAGSpec(raw_spec).to_dag()

    assert isinstance(dag['task1'], PythonCallable)
def test_source_loader(monkeypatch, tmp_directory, no_sys_modules_cache):
    """meta.source_loader must load task sources from a package's templates."""
    monkeypatch.syspath_prepend(tmp_directory)

    raw_spec = {
        'meta': {
            'source_loader': {
                'path': 'templates',
                'module': 'test_pkg'
            },
            'extract_product': False,
            'extract_upstream': False,
        },
        'tasks': [{
            'source': 'create-table.sql',
            'product': ['some_table', 'table'],
            'client': 'db.get_client'
        }]
    }
    spec = DAGSpec(raw_spec)

    Path('db.py').write_text("""
from ploomber.clients import SQLAlchemyClient

def get_client():
    return SQLAlchemyClient('sqlite://')
""")

    # check source loader is working correctly with a template that has a macro
    loader = spec['meta']['source_loader']
    template = loader['create-table.sql']

    expected = ('\nDROP TABLE IF EXISTS some_table;\nCREATE TABLE '
                'some_table AS\nSELECT * FROM table')
    assert template.render({'product': 'some_table'}) == expected

    # test the task source is correctly resolved when converted to a dag
    dag = spec.to_dag()
    dag.render()

    assert str(dag['create-table'].source) == expected
def test_spec_from_yaml_resolves_paths_from_wildcard(tmp_directory, spec):
    """Products of wildcard-generated tasks must resolve to absolute paths."""
    Path('upstream.py').write_text("""
# + tags=['parameters']
upstream = None
""")

    Path('downstream.py').write_text("""
# + tags=['parameters']
upstream = ['upstream-*']
""")

    dag = DAGSpec(spec).to_dag().render()

    # on windows, paths do not resolve if the file doesn't exist
    Path('upstream-0.ipynb').touch()
    Path('upstream-1.ipynb').touch()

    for idx in (0, 1):
        product = str(Path(dag[f'upstream-{idx}'].product).resolve())
        assert product == str(Path(f'upstream-{idx}.ipynb').resolve())
def test_import_tasks_from_keeps_value_if_already_absolute(tmp_nbs, tmp_path):
    """An already-absolute import_tasks_from value must be kept untouched."""
    tasks_yaml = (tmp_path / 'some_tasks.yaml').resolve()
    path_to_script = (tmp_path / 'extra_task.py').resolve()

    # source is an absolute path
    extra_tasks = [{'source': str(path_to_script), 'product': 'extra.ipynb'}]
    tasks_yaml.write_text(yaml.dump(extra_tasks))
    path_to_script.write_text("""
# + tags=["parameters"]
# -
""")

    spec_data = yaml.safe_load(Path('pipeline.yaml').read_text())
    # set an absolute path
    spec_data['meta']['import_tasks_from'] = str(tasks_yaml)
    Path('pipeline.yaml').write_text(yaml.dump(spec_data))

    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.render()

    # value should be the same because it was absolute
    assert spec['meta']['import_tasks_from'] == str(tasks_yaml)
    assert str(path_to_script) in [str(t['source']) for t in spec['tasks']]
def test_to_dag_does_not_mutate_spec(tmp_nbs):
    """Converting a spec to a dag must leave the spec's data unchanged."""
    spec = DAGSpec('pipeline.yaml')
    snapshot = deepcopy(spec.data)

    spec.to_dag()

    assert spec.data == snapshot