def test_can_upload_file_from_upstream_dependency_with_copy_from(
        tmp_directory, pg_client_and_schema):
    """PostgresCopyFrom resolves an upstream File product as its source.

    NOTE(review): renamed from test_can_upload_file_from_upstream_dependency
    because another test with that exact name (using SQLUpload) appears later
    in this module and would shadow this definition, so it would never run.
    """
    pg_client, schema = pg_client_and_schema

    dag = DAG()
    dag.clients[PostgresRelation] = pg_client
    dag.clients[PostgresCopyFrom] = pg_client

    make = PythonCallable(make_data,
                          product=File('data.parquet'),
                          dag=dag,
                          name='make')

    # table name in the test schema (kept as the original string so the
    # relation name on the db side is unchanged)
    name = 'test_can_upload_file_from_upstream_dependency'
    pg = PostgresCopyFrom('{{upstream["make"]}}',
                          product=PostgresRelation((schema, name, 'table')),
                          dag=dag,
                          name='upload')

    make >> pg

    dag.build()

    # the uploaded table must contain exactly the data from make_data
    product = str(dag['upload'])
    assert pd.read_sql(f'SELECT * FROM {product}',
                       pg_client).to_dict(orient='list') == {
                           'a': [1, 2, 3]
                       }
def test_from_params_resolves_paths_in_metaproduct(tmp_directory):
    """TaskGroup.from_params resolves every path inside a metaproduct."""
    def touch(product, param):
        # create both files declared in the metaproduct
        Path(product['one']).touch()
        Path(product['another']).touch()

    dag = DAG(executor=Serial(build_in_subprocess=False))
    TaskGroup.from_params(PythonCallable,
                          File, {
                              'one': 'one.txt',
                              'another': 'another.txt'
                          }, {'source': touch},
                          dag,
                          name='task_group',
                          params_array=[{'param': 1}, {'param': 2}],
                          resolve_relative_to='')

    # on windows, paths do not resolve if the file doesn't exist, so we run
    # the pipeline to ensure they do
    dag.build()

    # each task in the group gets an index-suffixed copy of every product
    expected = {
        'task_group0': {'one': 'one-0.txt', 'another': 'another-0.txt'},
        'task_group1': {'one': 'one-1.txt', 'another': 'another-1.txt'},
    }

    for task_name, products in expected.items():
        for key, filename in products.items():
            resolved = Path(dag[task_name].product[key]).resolve()
            assert resolved == Path(filename).resolve()
def test_exceptions_are_raised_with_serial_executor():
    """A task's exception propagates out of build() under the default executor."""
    dag = DAG()
    PythonCallable(fn_w_exception, File('file.txt'), dag, 'callable')

    # the original exception type must surface, not a wrapped one
    with pytest.raises(MyException):
        dag.build()
def test_runs_on_finish(executor, tmp_directory):
    """on_finish runs once per successful task; on_failure never fires."""
    # reset the module-level hook call counters
    for a_hook in (hook, hook_2, hook_3, hook_4):
        a_hook.count = 0

    dag = DAG(executor=executor)

    first = PythonCallable(fn, File('file1.txt'), dag, 't')
    first.on_finish = hook
    first.on_failure = hook_4

    second = PythonCallable(touch_w_upstream, File('file2'), dag, 't2')
    second.on_finish = hook_2

    third = PythonCallable(fn, File('file3'), dag, 't3')
    third.on_finish = hook_3

    first >> second

    dag.build()

    # every task succeeded, so each on_finish ran exactly once...
    assert hook.count == 1
    assert hook_2.count == 1
    assert hook_3.count == 1
    # ...and the on_failure hook never fired
    assert hook_4.count == 0
def test_can_dump_sqlite_to_parquet(tmp_directory):
    """SQLDump writes a sqlite table to parquet via io.ParquetIO.

    NOTE(review): removed a dead cursor (``cur.execute`` whose result was
    never read) and added ``client.close()`` so the sqlalchemy client is
    released like in the other tests in this module.
    """
    tmp = Path(tmp_directory)

    # create a db
    conn = connect(str(tmp / "database.db"))
    client = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database.db"))

    # dump output path
    out = tmp / 'dump'

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', conn)

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    dump = pd.read_parquet(out)
    db = pd.read_sql_query('SELECT * FROM numbers', conn)

    conn.close()
    client.close()

    # make sure they are the same
    assert dump.equals(db)
def test_execute_sample_nb(name, out_dir, tmp_sample_tasks):
    """A sample notebook executes and writes its output copy to out_dir."""
    dag = DAG()

    NotebookRunner(Path(name),
                   product=File(Path(out_dir, name + '.out.ipynb')),
                   dag=dag)

    dag.build()
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """A failing branch aborts its descendants but unrelated tasks still run."""
    dag = DAG(executor=executor)

    root_failing = PythonCallable(failing_root, File('t_fail'), dag,
                                  name='t_fail')
    mid_failing = PythonCallable(failing, File('t_fail_downstream'), dag,
                                 name='t_fail_downstream')
    leaf_aborted = PythonCallable(touch, File('t_touch_aborted'), dag,
                                  name='t_touch_aborted')

    root_failing >> mid_failing >> leaf_aborted

    # independent task: must execute despite the failing branch
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    # the build raises since some tasks failed; swallow it so we can
    # inspect which products were actually created
    try:
        dag.build(force=True)
    except DAGBuildError:
        pass

    assert not Path('t_fail').exists()
    assert not Path('t_fail_downstream').exists()
    assert Path('t_ok').exists()
def test_metadata_is_synced_when_executing_in_subprocess(tmp_directory):
    """Metadata saved by a subprocess build is visible in the parent process."""
    dag = DAG(executor=Serial(build_in_subprocess=True))
    task = PythonCallable(touch_root, File('file.txt'), dag)

    dag.build()

    # the subprocess persisted metadata; the in-memory copy must be synced
    assert task.product.metadata._data is not None
def test_append_rows(tmp_directory, pg_client_and_schema):
    """SQLUpload with if_exists='append' adds rows instead of replacing."""
    pg_client, schema = pg_client_and_schema

    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_csv('data.csv', index=False)

    dag = DAG()
    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    # create table with the same three rows already in it
    df.to_sql('test_append',
              pg_client.engine,
              schema=schema,
              if_exists='replace',
              index=False)

    SQLUpload('data.csv',
              product=PostgresRelation((schema, 'test_append', 'table')),
              dag=dag,
              name='upload',
              to_sql_kwargs={
                  'if_exists': 'append',
                  'index': False
              })

    dag.build()

    # 3 pre-existing rows + 3 appended rows
    df = pd.read_sql('SELECT * FROM {}.test_append'.format(schema),
                     pg_client.engine)
    assert df.shape[0] == 6
def test_on_finish(tmp_directory):
    """Attaching an on_finish hook does not break a successful build."""
    dag = DAG()

    task = PythonCallable(touch, File('file'), dag, name='touch')
    task.on_finish = on_finish

    dag.build()
def test_cycle_exception():
    """Building a DAG whose tasks form a cycle raises DAGCycle."""
    dag = DAG()
    first = PythonCallable(touch_root, File(Path("a.txt")), dag, "ta")
    second = PythonCallable(touch, File(Path("b.txt")), dag, "tb")

    # ta -> tb -> ta closes the loop
    first >> second >> first

    with pytest.raises(DAGCycle):
        dag.build()
def test_custom_io_handler(tmp_directory):
    """SQLUpload uses a user-supplied io_handler to read the source file."""
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    dag.clients[SQLUpload] = client
    dag.clients[SQLiteRelation] = client

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    df.to_csv('some-file.tsv', sep='\t', index=False)

    def my_reading_fn(path):
        # tab-separated reader; the default reader would not parse this file
        return pd.read_csv(path, sep='\t')

    SQLUpload('some-file.tsv',
              SQLiteRelation(('my-table', 'table')),
              dag=dag,
              name='task',
              io_handler=my_reading_fn,
              to_sql_kwargs=dict(index=False))

    dag.build()

    # round-trip: the uploaded table must equal the original frame
    uploaded = pd.read_sql('SELECT * FROM "my-table"', con=client)
    client.close()

    assert uploaded.equals(df)
def test_can_dump_postgres(tmp_directory, pg_client):
    """SQLDump can dump a postgres table to a parquet file."""
    # dump output path
    out = Path(tmp_directory) / 'dump'

    # seed the db with known data
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', pg_client.engine, if_exists='replace')

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(out),
            dag,
            name='dump',
            client=pg_client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # the parquet dump must match what the db returns
    dumped = pd.read_parquet(out)
    from_db = pd.read_sql_query('SELECT * FROM numbers', pg_client.engine)

    assert dumped.equals(from_db)
def test_can_transfer_sqlite(tmp_directory):
    """SQLTransfer copies a table from one sqlite database to another.

    NOTE(review): removed a leftover doctest-style snippet from the docstring
    (it would create a stray temp dir if collected by a doctest runner) and
    fixed the "clientections" typo in the comment below.
    """
    tmp = Path(tmp_directory)

    # create clients for the two databases
    client_in = SQLAlchemyClient('sqlite:///{}'.format(tmp / "database_in.db"))
    client_out = SQLAlchemyClient('sqlite:///{}'.format(tmp /
                                                       "database_out.db"))

    # make some data and save it in the db
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', client_in.engine, index=False)

    # create the task and run it
    dag = DAG()
    SQLTransfer('SELECT * FROM numbers',
                SQLiteRelation((None, 'numbers2', 'table'), client=client_out),
                dag,
                name='transfer',
                client=client_in,
                chunksize=10)
    dag.build()

    # load dumped data and data from the db
    original = pd.read_sql_query('SELECT * FROM numbers', client_in.engine)
    transfer = pd.read_sql_query('SELECT * FROM numbers2', client_out.engine)

    client_in.close()
    client_out.close()

    # make sure they are the same
    assert original.equals(transfer)
def test_can_upload_file_from_upstream_dependency(tmp_directory,
                                                  pg_client_and_schema):
    """SQLUpload accepts an upstream placeholder as its source path.

    NOTE(review): an earlier test in this module uses this exact name and is
    shadowed by this definition — one of the two should be renamed.
    """
    pg_client, schema = pg_client_and_schema

    dag = DAG()
    dag.clients[SQLUpload] = pg_client
    dag.clients[PostgresRelation] = pg_client

    producer = PythonCallable(make_data,
                              product=File('data.parquet'),
                              dag=dag,
                              name='make')

    name = 'test_can_upload_file_from_upstream_dependency'
    uploader = SQLUpload('{{upstream["make"]}}',
                         product=PostgresRelation((schema, name, 'table')),
                         dag=dag,
                         name='upload',
                         to_sql_kwargs={'if_exists': 'replace'})

    producer >> uploader

    dag.build()
def test_sqldump_with_dbapiclient(tmp_directory):
    """SQLDump also works with a DBAPIClient (not only SQLAlchemy clients)."""
    client = DBAPIClient(connect, dict(database='my_db.db'))

    # seed the db through a raw DBAPI connection
    con_raw = connect(database='my_db.db')
    df = pd.DataFrame({'a': np.arange(0, 100), 'b': np.arange(100, 200)})
    df.to_sql('numbers', con_raw)

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File('dump.csv'),
            dag,
            name='dump',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO)
    dag.build()

    # compare the csv dump against the db contents
    dumped = pd.read_csv('dump.csv')
    from_db = pd.read_sql_query('SELECT * FROM numbers', con_raw)

    client.close()
    con_raw.close()

    assert dumped.equals(from_db)
def test_sucessful_execution(executor, tmp_directory):
    """A diamond-shaped DAG builds fully, then skips on an up-to-date rebuild."""
    dag = DAG(executor=executor)
    root = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    left = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    right = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    sink = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    # isolated task, no dependencies in either direction
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    root >> left
    root >> right
    (left + right) >> sink

    dag.build()

    for product in ('ok.txt', 'a_file.txt', 'another_file.txt',
                    'yet_another_file.txt', 'file.txt'):
        assert Path(product).exists()

    assert set(t.exec_status for t in dag.values()) == {TaskStatus.Executed}
    assert set(t.product._is_outdated() for t in dag.values()) == {False}

    # nothing executed cause everything is up-to-date
    dag.build()

    assert set(t.exec_status for t in dag.values()) == {TaskStatus.Skipped}
def test_failing_notebook_saves_partial_result(tmp_directory):
    """When a notebook crashes, the partially-executed .ipynb is kept on disk."""
    dag = DAG()

    code = """
# + tags=["parameters"]
var = None

raise Exception('failing notebook')
"""

    # attempting to generate an HTML report
    NotebookRunner(code,
                   product=File('out.html'),
                   dag=dag,
                   kernelspec_name='python3',
                   params={'var': 1},
                   ext_in='py',
                   name='nb')

    # build breaks due to the exception
    with pytest.raises(DAGBuildError):
        dag.build()

    # but the file with ipynb extension exists to help debugging
    assert Path('out.ipynb').exists()
def test_can_execute_when_product_is_metaproduct(tmp_directory):
    """NotebookRunner supports a metaproduct, selecting the nb via nb_product_key."""
    dag = DAG()

    code = """
# + tags=["parameters"]
var = None

# +
from pathlib import Path
Path(product['model']).touch()
"""

    products = {
        'nb': File(Path(tmp_directory, 'out.ipynb')),
        'model': File(Path(tmp_directory, 'model.pkl'))
    }

    NotebookRunner(code,
                   product=products,
                   dag=dag,
                   kernelspec_name='python3',
                   params={'var': 1},
                   ext_in='py',
                   nb_product_key='nb',
                   name='nb')

    dag.build()
def test_can_convert_to_html(tmp_sample_tasks):
    """An .html product makes NotebookRunner export the executed notebook."""
    dag = DAG()

    NotebookRunner(Path('sample.ipynb'),
                   product=File(Path('out.html')),
                   dag=dag,
                   name='nb')

    dag.build()
def test_creates_parent_dirs(tmp_directory):
    """Building a File product creates its missing parent directories."""
    dag = DAG(executor=Serial(build_in_subprocess=False))
    PythonCallable(touch, File('some/nested/product.txt'), dag=dag)
    dag.build()

    # returned so this can double as a setup helper elsewhere
    return dag
def test_task_errors_are_logged(executor, caplog):
    """A failing task emits an ERROR log record naming the task."""
    dag = DAG(executor=executor)
    PythonCallable(failing_root, File('file.txt'), dag, name='t')

    with caplog.at_level(logging.ERROR):
        with pytest.raises(DAGBuildError):
            dag.build()

    assert 'Error building task "t"' in caplog.text
def test_forced_build(executor, tmp_directory):
    """build(force=True) re-runs tasks even when products are up to date."""
    dag = DAG(executor=executor)
    # name deliberately a non-string to exercise name coercion
    PythonCallable(touch_root, File('1.txt'), dag, name=1)

    dag.build()
    report = dag.build(force=True)

    assert report['Ran?'] == [True]
def test_on_finish_exceptions_are_logged(executor, tmp_directory, caplog):
    """A crashing on_finish hook is logged and fails the build."""
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_finish = hook_crashing

    with caplog.at_level(logging.ERROR):
        with pytest.raises(DAGBuildError):
            dag.build()

    assert 'Exception when running on_finish for task "t"' in caplog.text
def test_sample_dag(sqlite_client_and_tmp_dir, class_, identifier):
    """A GenericProduct is created on disk and its metadata is persisted."""
    client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    product = GenericProduct('some_file.txt', client=client)
    PythonCallable(touch, product, dag)

    dag.build()

    # file created, product registered, metadata stored in the client backend
    assert Path('some_file.txt').exists()
    assert product.exists()
    assert product.fetch_metadata() is not None
def test_on_failure_exceptions_are_logged(executor, caplog):
    """A crashing on_failure hook is itself logged as an error.

    NOTE(review): the ``executor`` parameter is unused — the DAG is
    hard-coded to 'serial'; confirm whether the fixture should be used here.
    """
    dag = DAG(executor='serial')
    task = PythonCallable(fn_that_fails, File('file.txt'), dag, name='t')
    task.on_failure = hook_crashing

    with caplog.at_level(logging.ERROR):
        with pytest.raises(DAGBuildError):
            dag.build()

    assert 'Exception when running on_failure for task "t"' in caplog.text
def test_on_finish_hook_is_executed(tmp_directory):
    """The DAG-level on_finish hook runs exactly once after a build."""
    hook.count = 0

    dag = DAG()
    PythonCallable(touch_root, File('file.txt'), dag, name='t')
    dag.on_finish = hook

    dag.build()

    assert hook.count == 1
def test_parallel_execution(tmp_directory):
    """A small fan-in pipeline builds under the parallel executor."""
    dag = DAG('dag', executor='parallel')
    root_one = PythonCallable(touch_root, File('a1.txt'), dag, 'a1')
    root_two = PythonCallable(touch_root, File('a2.txt'), dag, 'a2')
    middle = PythonCallable(touch, File('b.txt'), dag, 'b')
    last = PythonCallable(touch, File('c.txt'), dag, 'c')

    # two roots fan into b, which feeds c
    (root_one + root_two) >> middle >> last

    dag.build()
def test_input_always_executes(tmp_directory):
    """An Input task is always considered outdated, even right after a build."""
    dag = DAG()
    Path('some_file.txt').touch()
    task = Input(File('some_file.txt'), dag, name='some_file')

    assert task.product._is_outdated()

    dag.build()

    # still outdated: Input must run on every build
    assert task.product._is_outdated()
def test_on_failure(caplog):
    """The DAG-level on_failure hook runs once when the build fails."""
    hook.count = 0

    dag = DAG(name='dag')
    PythonCallable(failing_root, File('file.txt'), dag, name='t')
    dag.on_failure = hook

    with pytest.raises(DAGBuildError):
        dag.build()

    assert hook.count == 1