def test_sucessful_execution(executor, tmp_directory):
    """Build a small diamond-shaped DAG; all products must exist, every task
    must report Executed, and a second build must skip everything.
    """
    dag = DAG(executor=executor)
    t1 = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    t2 = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    t3 = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    t4 = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    # diamond: t1 fans out to t2/t3, which join into t4; t5 is independent
    t1 >> t2
    t1 >> t3
    (t2 + t3) >> t4

    dag.build()

    for expected in ('ok.txt', 'a_file.txt', 'another_file.txt',
                     'yet_another_file.txt', 'file.txt'):
        assert Path(expected).exists()

    assert {t.exec_status for t in dag.values()} == {TaskStatus.Executed}
    assert {t.product._is_outdated() for t in dag.values()} == {False}

    # everything is up-to-date now, so a second build runs nothing
    dag.build()
    assert {t.exec_status for t in dag.values()} == {TaskStatus.Skipped}
def test_copy(copy):
    """Check InMemoryDAG.build honors the copy flag: the downstream task must
    receive the very same object its upstream returned when copy is disabled,
    and a distinct object when copying is enabled.
    """
    def _assign_upstream(upstream):
        # stash the received upstream mapping so the test can inspect it
        _assign_upstream.obj = upstream
        return 42

    dag_ = DAG()
    root = PythonCallable(_root,
                          File('root.parquet'),
                          dag_,
                          name='root',
                          serializer=serializer,
                          params={'input_data': {'x': [0, 0, 0]}})
    task = PythonCallable(_assign_upstream,
                          File('task.parquet'),
                          dag_,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)
    root >> task

    dag = InMemoryDAG(dag_)
    out = dag.build({'root': {'x': [1]}}, copy=copy)

    # identity holds exactly when copying is off
    assert (_assign_upstream.obj['root'] is out['root']) is (not copy)
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """A failing root must abort its whole downstream chain, but tasks
    independent of the failure must still run.
    """
    dag = DAG(executor=executor)
    t_fail = PythonCallable(failing_root, File('t_fail'), dag, name='t_fail')
    t_fail_downstream = PythonCallable(failing,
                                       File('t_fail_downstream'),
                                       dag,
                                       name='t_fail_downstream')
    t_touch_aborted = PythonCallable(touch,
                                     File('t_touch_aborted'),
                                     dag,
                                     name='t_touch_aborted')
    t_fail >> t_fail_downstream >> t_touch_aborted

    # unrelated to the failing chain, should still execute
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    try:
        dag.build(force=True)
    except DAGBuildError:
        # the failure is expected; we only care about the side effects
        pass

    assert not Path('t_fail').exists()
    assert not Path('t_fail_downstream').exists()
    assert Path('t_ok').exists()
def make():
    """Build a five-task SQLDump DAG against a mocked client; t2's product
    contains an unresolved placeholder ('{{unknown}}').
    """
    mock_client = Mock()
    dag = DAG()
    SQLDump('SELECT * FROM my_table',
            File('ok.txt'),
            dag,
            name='t1',
            client=mock_client)
    t2 = SQLDump('SELECT * FROM my_table',
                 File('{{unknown}}'),
                 dag,
                 name='t2',
                 client=mock_client)
    t3 = SQLDump('SELECT * FROM another',
                 File('another_file.txt'),
                 dag,
                 name='t3',
                 client=mock_client)
    t4 = SQLDump('SELECT * FROM something',
                 File('yet_another'),
                 dag,
                 name='t4',
                 client=mock_client)
    SQLDump('SELECT * FROM my_table_2',
            File('ok_2'),
            dag,
            name='t5',
            client=mock_client)

    t2 >> t3 >> t4

    return dag
def test_dag_functions_do_not_fetch_metadata(function_name, executor,
                                             tmp_directory,
                                             monkeypatch_plot):
    """
    these function should not look up metadata, since the products do not
    exist, the status can be determined without it
    """
    product = File('1.txt')
    dag = DAG(executor=executor)
    PythonCallable(touch_root, product, dag, name=1)

    m = Mock(wraps=product.fetch_metadata)
    # MagicMock replacement keeps the wrapped mock picklable
    m.__reduce__ = lambda self: (MagicMock, ())
    product.fetch_metadata = m

    getattr(dag, function_name)()

    # the product does not exist yet, so metadata must never be fetched
    product.fetch_metadata.assert_not_called()

    if function_name == 'build':
        # after a build, the metadata should have been populated anyway
        assert product.metadata._data['stored_source_code']
        assert product.metadata._data['timestamp']
def add_features(dag):
    """
    Given a DAG, adds feature engineering tasks. The DAG must have a task
    "get" that returns the input data.
    """
    get_task = dag['get']
    output = Path('output')

    # one task per feature, plus a join task that combines them
    a_feature_task = PythonCallable(a_feature,
                                    File(output / 'a_feature.csv'),
                                    dag,
                                    serializer=serializer,
                                    unserializer=unserializer)
    another_task = PythonCallable(another,
                                  File(output / 'another.csv'),
                                  dag,
                                  serializer=serializer,
                                  unserializer=unserializer)
    join_task = PythonCallable(join,
                               File(output / 'join.csv'),
                               dag,
                               serializer=serializer,
                               unserializer=unserializer)

    # both features derive from "get"; the join consumes all three
    get_task >> a_feature_task
    get_task >> another_task
    (get_task + a_feature_task + another_task) >> join_task

    return dag
def make_training():
    """Instantiates the training DAG
    """
    # setting build_in_subprocess=False because Python does not like when we
    # use multiprocessing in functions defined in the main module. Works if
    # we define them in a different one
    dag = DAG(executor=Serial(build_in_subprocess=False))
    output = Path('output')

    # "get" task: produces the training data
    PythonCallable(get,
                   File(output / 'get.csv'),
                   dag,
                   serializer=serializer,
                   unserializer=unserializer)

    # feature engineering tasks on top of "get"
    add_features(dag)

    # model training, fed by the joined features
    fit_t = PythonCallable(fit, File(output / 'model.pickle'), dag)
    dag['join'] >> fit_t

    return dag
def test_runs_on_finish(executor, tmp_directory):
    """on_finish hooks run once per successful task; on_failure never fires
    when nothing fails.
    """
    # reset counters shared across tests
    hook.count = 0
    hook_2.count = 0
    hook_3.count = 0
    hook_4.count = 0

    dag = DAG(executor=executor)
    t = PythonCallable(fn, File('file1.txt'), dag, 't')
    t.on_finish = hook
    t.on_failure = hook_4

    t2 = PythonCallable(touch_w_upstream, File('file2'), dag, 't2')
    t2.on_finish = hook_2

    t3 = PythonCallable(fn, File('file3'), dag, 't3')
    t3.on_finish = hook_3

    t >> t2

    dag.build()

    assert hook.count == 1
    assert hook_2.count == 1
    assert hook_3.count == 1
    # no task failed, so the failure hook must not have run
    assert hook_4.count == 0
def test_download_triggers_client_download(tmp_directory):
    """File.download must delegate to the client for both the product file
    and its metadata sidecar.
    """
    client = Mock()
    product = File('file.txt', client=client)

    product.download()

    client.download.assert_has_calls(
        [call('file.txt'), call('.file.txt.metadata')])
def test_upload_after_task_build(tmp_directory):
    """Building a task must trigger exactly one upload of its product."""
    dag = DAG()
    product = File('file.txt')
    # wrap the real upload so the call is observable but still executes
    product.upload = Mock(wraps=product.upload)
    task = PythonCallable(_touch, product, dag=dag)

    task.build()

    product.upload.assert_called_once()
def test_params_are_copied_upon_initialization():
    """Two tasks initialized with the same params dict must not share it."""
    dag = DAG()
    shared = {'a': 1}

    first = PythonCallable(touch, File('file'), dag, name='t1', params=shared)
    second = PythonCallable(touch, File('file'), dag, name='t2', params=shared)

    # each task must hold its own copy, not the caller's dict
    assert first.params is not second.params
def test_download(tmp_directory):
    """Download fetches the metadata sidecar first, then the product file,
    in exactly that order.
    """
    client = Mock()
    product = File('file.txt', client=client)

    product.download()

    assert client.download.call_args_list == [(('.file.txt.metadata', ), ),
                                              (('file.txt', ), )]
def test_building_a_single_task_when_rendered_upstream():
    """After rendering the DAG, a downstream task can be built on its own."""
    dag = DAG()
    upstream = PythonCallable(touch, File('1.txt'), dag, name=1)
    downstream = PythonCallable(touch_w_upstream, File('2.txt'), dag, name=2)
    upstream >> downstream

    dag.render()

    # building only the downstream task must not raise
    downstream.build()
def _make_dag_with_upstream():
    # run in the same process, to ensure the mock object is called
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root = PythonCallable(_touch, File('1.txt'), dag=dag, name='root')
    PythonCallable(_touch, File('2.txt'), dag=dag, name=2)
    child = PythonCallable(_touch_upstream, File('3.txt'), dag=dag, name=3)
    root >> child

    return dag
def make_dag_with_client():
    """Two-task DAG (root >> task) backed by a local storage client."""
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root = PythonCallable(touch_root, File('out/root'), dag=dag, name='root')
    task = PythonCallable(touch, File('out/file'), dag=dag, name='task')
    root >> task

    return dag
def test_unserialize_multi(tmp_directory):
    """The multi-format unserializer picks the right loader per extension."""
    Path('a.txt').write_text('something')
    Path('b.json').write_text(json.dumps(dict(a=1, b=2)))

    loaded_txt = unserializer_multi(File('a.txt'))
    loaded_json = unserializer_multi(File('b.json'))

    assert loaded_txt == 'something'
    assert loaded_json == dict(a=1, b=2)
def test_upload(tmp_directory):
    """Upload pushes the metadata sidecar first, then the product file."""
    Path('file.txt').touch()
    Path('.file.txt.metadata').touch()
    client = Mock()
    product = File('file.txt', client=client)

    product.upload()

    assert client.upload.call_args_list == [(('.file.txt.metadata', ), ),
                                            (('file.txt', ), )]
def test_do_not_upload_if_none_or_one(to_touch, tmp_directory):
    """Upload is a no-op unless both the file and its metadata exist."""
    for filename in to_touch:
        Path(filename).touch()

    client = Mock()
    product = File('file.txt', client=client)

    product.upload()

    client.upload.assert_not_called()
def test_unserialize_with_txt_and_json_default(tmp_directory):
    """With .txt/.json defaults, the decorator handles those extensions itself
    and never calls the wrapped function.
    """
    @unserialize.unserializer(defaults=['.txt', '.json'])
    def unserializer(product):
        # must never run: defaults cover both test files
        raise NotImplementedError

    Path('a.txt').write_text('something')
    Path('a.json').write_text(json.dumps(dict(a=1, b=2)))

    assert unserializer(File('a.txt')) == 'something'
    assert unserializer(File('a.json')) == dict(a=1, b=2)
def test_serialize_with_txt_and_json_default(tmp_directory):
    """With .txt/.json defaults, the decorator writes those formats itself
    and never calls the wrapped function.
    """
    @serialize.serializer(defaults=['.txt', '.json'])
    def serializer(obj, product):
        # must never run: defaults cover both test files
        raise NotImplementedError

    serializer('something', File('a.txt'))
    serializer(dict(a=1, b=2), File('b.json'))

    assert Path('a.txt').read_text() == 'something'
    assert json.loads(Path('b.json').read_text()) == dict(a=1, b=2)
def test_do_not_download_if_file_or_metadata_exists(to_touch, tmp_directory):
    """Download is skipped when the file or its metadata already exists
    locally.
    """
    for filename in to_touch:
        Path(filename).touch()

    client = Mock()
    product = File('file.txt', client=client)

    product.download()

    client.download.assert_not_called()
def test_task_factory_override_params():
    """Arguments passed at call time override the factory's defaults."""
    dag = DAG()

    @task_factory(product=File('file.txt'))
    def touch(product):
        Path(str(product)).touch()

    # product passed here must win over the one given to the decorator
    touch(dag=dag, product=File('another.txt'))

    assert list(dag) == ['touch']
    assert str(dag['touch'].product) == 'another.txt'
def test_parallel_execution(tmp_directory):
    """A fan-in chain builds successfully under the parallel executor."""
    dag = DAG('dag', executor='parallel')
    root_a = PythonCallable(touch_root, File('a1.txt'), dag, 'a1')
    root_b = PythonCallable(touch_root, File('a2.txt'), dag, 'a2')
    mid = PythonCallable(touch, File('b.txt'), dag, 'b')
    leaf = PythonCallable(touch, File('c.txt'), dag, 'c')

    # both roots feed mid, which feeds leaf
    (root_a + root_b) >> mid >> leaf

    dag.build()
def test_build_partially_with_wildcard(tmp_directory):
    """build_partially('a-*') builds only the tasks matching the wildcard."""
    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch_root, File('a-1.txt'), dag, name='a-1')
    PythonCallable(touch_root, File('a-2.txt'), dag, name='a-2')
    PythonCallable(touch_root, File('b.txt'), dag, name='b')

    dag.build_partially('a-*')

    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    # 'b' does not match the wildcard, so it must not have run
    assert not Path('b.txt').exists()
def test_delete_metadata(tmp_directory):
    """MetaProduct.metadata.delete removes every member's metadata file."""
    Path('.a.txt.metadata').touch()
    Path('.b.txt.metadata').touch()

    first = File('a.txt')
    second = File('b.txt')
    meta = MetaProduct({'a': first, 'b': second})

    meta.metadata.delete()

    assert not Path('.a.txt.metadata').exists()
    assert not Path('.b.txt.metadata').exists()
def test_can_create_task_with_many_products():
    """A task may declare multiple products; before any build they report
    outdated due to code, not data dependencies.
    """
    dag = DAG()
    product_a = File('a1.txt')
    product_b = File('a2.txt')
    task = ShellScript('echo {{product}}', [product_a, product_b], dag, 'ta')

    task.render()

    assert not task.product.exists()
    assert task.product._outdated()
    # never built, so the stored source code differs (there is none)
    assert task.product._outdated_code_dependency()
    # no upstream dependencies, so data can't be the reason
    assert not task.product._outdated_data_dependencies()
def test_load_from_metaproduct(tmp_directory):
    """Task.load(key=...) selects a single member of a multi-product task."""
    Path('a.csv').write_text('a,b\n1,2')

    dag = DAG()
    task = PythonCallable(touch_meta, {
        'a': File('a.csv'),
        'b': File('b')
    }, dag)

    assert task.load(key='a') is not None
def test_product_upload_uploads_metadata_and_product(tmp_directory):
    """Upload pushes both the metadata sidecar and the product file."""
    Path('file.txt').touch()
    Path('.file.txt.metadata').touch()
    client = Mock()
    product = File('file.txt', client=client)

    product.upload()

    client.upload.assert_has_calls(
        [call(Path('.file.txt.metadata')),
         call(Path('file.txt'))])
def test_creates_parent_dirs_meta_product(tmp_directory):
    """Building a multi-product task must create the nested parent
    directories for each product.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch_meta, {
        'one': File('some/nested/product.txt'),
        'another': File('some/another/product.txt')
    },
                   dag=dag)

    # build must not raise even though the directories don't exist yet
    dag.build()

    return dag
def test_warnings_are_shown(tmp_directory):
    """Warnings raised inside tasks are collected and surfaced as a single
    aggregated warning after the build.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    first = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    second = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    first >> second

    # NOTE(review): pytest.warns(None) is deprecated in pytest>=7; kept as-is
    # to preserve behavior under the pinned pytest version — confirm
    with pytest.warns(None) as record:
        dag.build()

    # both task warnings are folded into one aggregated warning message
    assert len(record) == 1
    assert 'This is a warning' in str(record[0].message)
    assert 'This is another warning' in str(record[0].message)