Example #1
0
def test_can_upload_file_from_upstream_dependency(tmp_directory,
                                                  pg_client_and_schema):
    """PostgresCopyFrom can load the file declared by an upstream task.

    Wires a "make" task (product: data.parquet) into an "upload" task whose
    source is the upstream product placeholder, builds the DAG and compares
    the uploaded table with the expected column values.
    """
    client, schema = pg_client_and_schema
    table_name = 'test_can_upload_file_from_upstream_dependency'

    dag = DAG()
    # the product and the task are both served by the same client
    for class_ in (PostgresRelation, PostgresCopyFrom):
        dag.clients[class_] = client

    maker = PythonCallable(make_data,
                           product=File('data.parquet'),
                           dag=dag,
                           name='make')
    uploader = PostgresCopyFrom(
        '{{upstream["make"]}}',
        product=PostgresRelation((schema, table_name, 'table')),
        dag=dag,
        name='upload')
    maker >> uploader

    dag.build()

    relation = str(dag['upload'])
    uploaded = pd.read_sql(f'SELECT * FROM {relation}', client)
    assert uploaded.to_dict(orient='list') == {'a': [1, 2, 3]}
Example #2
0
def test_from_params_resolves_paths_in_metaproduct(tmp_directory):
    """TaskGroup.from_params yields per-task paths for a dict product.

    Two parameter sets are expanded into tasks "task_group0" and
    "task_group1"; each product path is expected to carry the task index
    as a suffix ("one-0.txt", "another-1.txt", ...).
    """
    def touch(product, param):
        for key in ('one', 'another'):
            Path(product[key]).touch()

    dag = DAG(executor=Serial(build_in_subprocess=False))
    product_paths = {'one': 'one.txt', 'another': 'another.txt'}
    grid = [{'param': 1}, {'param': 2}]
    TaskGroup.from_params(PythonCallable,
                          File,
                          product_paths, {'source': touch},
                          dag,
                          name='task_group',
                          params_array=grid,
                          resolve_relative_to='')

    # on windows, paths do not resolve if the file doesn't exist, so the
    # pipeline is executed first to create them
    dag.build()

    for index in (0, 1):
        products = dag[f'task_group{index}'].product
        for key in ('one', 'another'):
            expected = Path(f'{key}-{index}.txt').resolve()
            assert Path(products[key]).resolve() == expected
def test_exceptions_are_raised_with_serial_executor():
    """dag.build() is expected to surface MyException from the task."""
    dag = DAG()
    PythonCallable(fn_w_exception,
                   product=File('file.txt'),
                   dag=dag,
                   name='callable')

    with pytest.raises(MyException):
        dag.build()
Example #4
0
def test_runs_on_finish(executor, tmp_directory):
    """on_finish hooks run once per task; the on_failure hook stays idle."""
    # the hooks keep their call count as a function attribute; reset them
    for counter in (hook, hook_2, hook_3, hook_4):
        counter.count = 0

    dag = DAG(executor=executor)

    first = PythonCallable(fn, File('file1.txt'), dag, 't')
    first.on_finish = hook
    first.on_failure = hook_4

    second = PythonCallable(touch_w_upstream, File('file2'), dag, 't2')
    second.on_finish = hook_2

    standalone = PythonCallable(fn, File('file3'), dag, 't3')
    standalone.on_finish = hook_3

    first >> second

    dag.build()

    observed = (hook.count, hook_2.count, hook_3.count, hook_4.count)
    assert observed == (1, 1, 1, 0)
Example #5
0
def test_can_dump_sqlite_to_parquet(tmp_directory):
    """SQLDump writes a SQLite query result to parquet, in chunks.

    A 100-row table is stored in a SQLite file, dumped through SQLDump with
    ``chunksize=10`` and ``io.ParquetIO``, and the dump is compared with the
    rows read straight from the database.

    Both the raw connection and the SQLAlchemyClient are released in a
    ``finally`` clause so a failing build does not leave the database file
    open.
    """
    tmp = Path(tmp_directory)
    db_path = tmp / "database.db"
    # dump output path
    out = tmp / 'dump'

    # create a db
    conn = connect(str(db_path))
    client = SQLAlchemyClient('sqlite:///{}'.format(db_path))

    try:
        # make some data and save it in the db
        df = pd.DataFrame({
            'a': np.arange(0, 100),
            'b': np.arange(100, 200)
        })
        df.to_sql('numbers', conn)

        # create the task and run it
        dag = DAG()
        SQLDump('SELECT * FROM numbers',
                File(out),
                dag,
                name='dump',
                client=client,
                chunksize=10,
                io_handler=io.ParquetIO)
        dag.build()

        # load dumped data and data from the db
        dump = pd.read_parquet(out)
        db = pd.read_sql_query('SELECT * FROM numbers', conn)
    finally:
        client.close()
        conn.close()

    # make sure they are the same
    assert dump.equals(db)
Example #6
0
def test_execute_sample_nb(name, out_dir, tmp_sample_tasks):
    """Build a DAG with a single NotebookRunner for the notebook `name`.

    The executed notebook is written to ``<out_dir>/<name>.out.ipynb``.
    """
    dag = DAG()
    output = File(Path(out_dir, name + '.out.ipynb'))
    NotebookRunner(Path(name), product=output, dag=dag)
    dag.build()
Example #7
0
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """An independent task still produces its file when another chain fails.

    The DAG holds a three-task chain rooted at ``failing_root`` plus an
    unrelated ``t_ok`` task. After building, the first two products of the
    chain must be absent and the ``t_ok`` product must exist.
    """
    dag = DAG(executor=executor)

    chain_root = PythonCallable(failing_root,
                                File('t_fail'),
                                dag,
                                name='t_fail')
    chain_middle = PythonCallable(failing,
                                  File('t_fail_downstream'),
                                  dag,
                                  name='t_fail_downstream')
    chain_leaf = PythonCallable(touch,
                                File('t_touch_aborted'),
                                dag,
                                name='t_touch_aborted')

    chain_root >> chain_middle >> chain_leaf

    # task with no relationship to the chain above
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    try:
        dag.build(force=True)
    except DAGBuildError:
        # a build error is tolerated; only the files on disk are checked
        pass

    names = ('t_fail', 't_fail_downstream', 't_ok')
    on_disk = {name: Path(name).exists() for name in names}
    assert on_disk == {
        't_fail': False,
        't_fail_downstream': False,
        't_ok': True
    }
Example #8
0
def test_metadata_is_synced_when_executing_in_subprocess(tmp_directory):
    """Product metadata is populated after building in a subprocess."""
    subprocess_executor = Serial(build_in_subprocess=True)
    dag = DAG(executor=subprocess_executor)
    task = PythonCallable(touch_root, File('file.txt'), dag)

    dag.build()

    metadata = task.product.metadata
    assert metadata._data is not None
Example #9
0
def test_append_rows(tmp_directory, pg_client_and_schema):
    """SQLUpload with ``if_exists='append'`` adds rows to an existing table.

    The table is first created with three rows, then the same three rows
    are uploaded from a CSV file; six rows are expected afterwards.
    """
    client, schema = pg_client_and_schema

    rows = pd.DataFrame({'a': [1, 2, 3]})
    rows.to_csv('data.csv', index=False)

    dag = DAG()
    for class_ in (SQLUpload, PostgresRelation):
        dag.clients[class_] = client

    # create table
    rows.to_sql('test_append',
                client.engine,
                schema=schema,
                if_exists='replace',
                index=False)

    append_kwargs = {'if_exists': 'append', 'index': False}
    relation = PostgresRelation((schema, 'test_append', 'table'))
    SQLUpload('data.csv',
              product=relation,
              dag=dag,
              name='upload',
              to_sql_kwargs=append_kwargs)

    dag.build()

    query = 'SELECT * FROM {}.test_append'.format(schema)
    stored = pd.read_sql(query, client.engine)

    # three pre-existing rows plus three uploaded rows
    assert len(stored) == 6
Example #10
0
def test_on_finish(tmp_directory):
    """Build a one-task DAG whose task has an on_finish hook attached."""
    dag = DAG()
    task = PythonCallable(touch, product=File('file'), dag=dag, name='touch')
    task.on_finish = on_finish

    dag.build()
Example #11
0
def test_cycle_exception():
    """Building a DAG with a two-task dependency loop raises DAGCycle."""
    dag = DAG()
    first = PythonCallable(touch_root, File(Path("a.txt")), dag, "ta")
    second = PythonCallable(touch, File(Path("b.txt")), dag, "tb")

    # first -> second -> first closes the loop
    first >> second >> first

    with pytest.raises(DAGCycle):
        dag.build()
Example #12
0
def test_custom_io_handler(tmp_directory):
    """SQLUpload accepts a user-supplied function as ``io_handler``.

    A tab-separated file is uploaded using a custom reader, and the stored
    table is compared with the original data frame.
    """
    dag = DAG()
    client = SQLAlchemyClient('sqlite:///database.db')
    for class_ in (SQLUpload, SQLiteRelation):
        dag.clients[class_] = client

    expected = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    expected.to_csv('some-file.tsv', sep='\t', index=False)

    def my_reading_fn(path):
        # the file is tab-separated, hence the custom reader
        return pd.read_csv(path, sep='\t')

    SQLUpload('some-file.tsv',
              SQLiteRelation(('my-table', 'table')),
              dag=dag,
              name='task',
              io_handler=my_reading_fn,
              to_sql_kwargs={'index': False})

    dag.build()

    stored = pd.read_sql('SELECT * FROM "my-table"', con=client)

    client.close()

    assert stored.equals(expected)
Example #13
0
def test_can_dump_postgres(tmp_directory, pg_client):
    """SQLDump exports a Postgres query to parquet matching the source rows.

    Uses ``chunksize=10`` and ``io.ParquetIO``; the dump is compared with
    the rows read straight from the database.
    """
    # dump output path
    target = Path(tmp_directory) / 'dump'

    # make some data and save it in the db
    columns = {'a': np.arange(0, 100), 'b': np.arange(100, 200)}
    numbers = pd.DataFrame(columns)
    numbers.to_sql('numbers', pg_client.engine, if_exists='replace')

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File(target),
            dag,
            name='dump',
            client=pg_client,
            chunksize=10,
            io_handler=io.ParquetIO)
    dag.build()

    # load dumped data and data from the db
    from_dump = pd.read_parquet(target)
    from_db = pd.read_sql_query('SELECT * FROM numbers', pg_client.engine)

    # make sure they are the same
    assert from_dump.equals(from_db)
Example #14
0
def test_can_transfer_sqlite(tmp_directory):
    """Check SQLTransfer copies a query result between two SQLite files.

    >>> import tempfile
    >>> tmp_directory = tempfile.mkdtemp()
    """
    tmp = Path(tmp_directory)
    uri_template = 'sqlite:///{}'

    # create clients for the 2 dbs (source and destination)
    client_in = SQLAlchemyClient(uri_template.format(tmp / "database_in.db"))
    client_out = SQLAlchemyClient(
        uri_template.format(tmp / "database_out.db"))

    # make some data and save it in the source db
    columns = {'a': np.arange(0, 100), 'b': np.arange(100, 200)}
    pd.DataFrame(columns).to_sql('numbers', client_in.engine, index=False)

    # create the task and run it
    dag = DAG()
    destination = SQLiteRelation((None, 'numbers2', 'table'),
                                 client=client_out)
    SQLTransfer('SELECT * FROM numbers',
                destination,
                dag,
                name='transfer',
                client=client_in,
                chunksize=10)
    dag.build()

    # load the source rows and the transferred rows
    original = pd.read_sql_query('SELECT * FROM numbers', client_in.engine)
    transfer = pd.read_sql_query('SELECT * FROM numbers2', client_out.engine)

    for client in (client_in, client_out):
        client.close()

    # make sure they are the same
    assert original.equals(transfer)
Example #15
0
def test_can_upload_file_from_upstream_dependency(tmp_directory,
                                                  pg_client_and_schema):
    """SQLUpload can take its source from an upstream task's product.

    Builds a "make" -> "upload" pipeline; the test passes when the build
    completes without raising.
    """
    client, schema = pg_client_and_schema
    table_name = 'test_can_upload_file_from_upstream_dependency'

    dag = DAG()
    for class_ in (SQLUpload, PostgresRelation):
        dag.clients[class_] = client

    maker = PythonCallable(make_data,
                           product=File('data.parquet'),
                           dag=dag,
                           name='make')
    uploader = SQLUpload(
        '{{upstream["make"]}}',
        product=PostgresRelation((schema, table_name, 'table')),
        dag=dag,
        name='upload',
        to_sql_kwargs={'if_exists': 'replace'})
    maker >> uploader

    dag.build()
Example #16
0
def test_sqldump_with_dbapiclient(tmp_directory):
    """SQLDump works with a DBAPIClient and writes a single CSV file.

    ``chunksize=None`` is passed together with ``io.CSVIO``; the CSV is
    compared with the rows read through a raw connection.
    """
    db_name = 'my_db.db'
    client = DBAPIClient(connect, dict(database=db_name))

    # make some data and save it in the db
    raw = connect(database=db_name)
    columns = {'a': np.arange(0, 100), 'b': np.arange(100, 200)}
    pd.DataFrame(columns).to_sql('numbers', raw)

    # create the task and run it
    dag = DAG()
    SQLDump('SELECT * FROM numbers',
            File('dump.csv'),
            dag,
            name='dump',
            client=client,
            chunksize=None,
            io_handler=io.CSVIO)
    dag.build()

    # load dumped data and data from the db
    from_dump = pd.read_csv('dump.csv')
    from_db = pd.read_sql_query('SELECT * FROM numbers', raw)

    client.close()
    raw.close()

    assert from_dump.equals(from_db)
Example #17
0
def test_sucessful_execution(executor, tmp_directory):
    """A first build executes every task; a second build skips them all.

    After the first build every product file must exist, every task status
    must be ``Executed`` and no product may be outdated.
    """
    dag = DAG(executor=executor)
    root = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    left = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    right = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    sink = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    root >> left
    root >> right
    (left + right) >> sink

    dag.build()

    expected_files = ('ok.txt', 'a_file.txt', 'another_file.txt',
                      'yet_another_file.txt', 'file.txt')
    for filename in expected_files:
        assert Path(filename).exists()

    assert {t.exec_status for t in dag.values()} == {TaskStatus.Executed}
    assert {t.product._is_outdated() for t in dag.values()} == {False}

    # nothing executed because everything is up-to-date
    dag.build()

    assert {t.exec_status for t in dag.values()} == {TaskStatus.Skipped}
Example #18
0
def test_failing_notebook_saves_partial_result(tmp_directory):
    """A failing notebook leaves an ``.ipynb`` file behind.

    The declared product is ``out.html``; the build is expected to raise
    DAGBuildError and ``out.ipynb`` must exist afterwards.
    """
    source = """
# + tags=["parameters"]
var = None

raise Exception('failing notebook')
    """

    dag = DAG()

    # attempting to generate an HTML report
    runner_kwargs = dict(product=File('out.html'),
                         dag=dag,
                         kernelspec_name='python3',
                         params={'var': 1},
                         ext_in='py',
                         name='nb')
    NotebookRunner(source, **runner_kwargs)

    # build breaks due to the exception
    with pytest.raises(DAGBuildError):
        dag.build()

    # but the file with ipynb extension exists to help debugging
    partial_output = Path('out.ipynb')
    assert partial_output.exists()
Example #19
0
def test_can_execute_when_product_is_metaproduct(tmp_directory):
    """NotebookRunner builds when its product is a dict of Files.

    ``nb_product_key='nb'`` is passed alongside a product dict with the
    keys "nb" and "model"; the notebook source touches the "model" path.
    """
    source = """
# + tags=["parameters"]
var = None

# +
from pathlib import Path

Path(product['model']).touch()
    """

    outputs = {
        'nb': File(Path(tmp_directory, 'out.ipynb')),
        'model': File(Path(tmp_directory, 'model.pkl'))
    }

    dag = DAG()
    NotebookRunner(source,
                   product=outputs,
                   dag=dag,
                   kernelspec_name='python3',
                   params={'var': 1},
                   ext_in='py',
                   nb_product_key='nb',
                   name='nb')
    dag.build()
Example #20
0
def test_can_convert_to_html(tmp_sample_tasks):
    """Build a NotebookRunner whose product has an ``.html`` extension."""
    dag = DAG()
    notebook = Path('sample.ipynb')
    report = File(Path('out.html'))

    NotebookRunner(notebook, product=report, dag=dag, name='nb')
    dag.build()
Example #21
0
def test_creates_parent_dirs(tmp_directory):
    """Build a task whose product lives in a nested directory.

    Returns the built DAG.
    """
    in_process = Serial(build_in_subprocess=False)
    dag = DAG(executor=in_process)

    nested_product = File('some/nested/product.txt')
    PythonCallable(touch, nested_product, dag=dag)

    dag.build()

    return dag
Example #22
0
def test_task_errors_are_logged(executor, caplog):
    """A failing task produces an error-level log entry naming the task."""
    dag = DAG(executor=executor)
    PythonCallable(failing_root, File('file.txt'), dag, name='t')

    # capture ERROR records while the build is expected to fail
    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    expected_message = 'Error building task "t"'
    assert expected_message in caplog.text
Example #23
0
def test_forced_build(executor, tmp_directory):
    """``build(force=True)`` reports the task as ran on a second build."""
    dag = DAG(executor=executor)
    PythonCallable(touch_root, File('1.txt'), dag, name=1)

    # first build; its report is not inspected
    dag.build()

    forced_report = dag.build(force=True)
    ran_column = forced_report['Ran?']

    assert ran_column == [True]
Example #24
0
def test_on_finish_exceptions_are_logged(executor, tmp_directory, caplog):
    """A crashing on_finish hook fails the build and is logged as an error."""
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_finish = hook_crashing

    # capture ERROR records while the build is expected to fail
    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    expected_message = 'Exception when running on_finish for task "t"'
    assert expected_message in caplog.text
Example #25
0
def test_sample_dag(sqlite_client_and_tmp_dir, class_, identifier):
    """A PythonCallable with a GenericProduct builds and stores metadata.

    After the build the file must exist on disk, the product must report
    that it exists and fetching its metadata must not return None.
    """
    client, _ = sqlite_client_and_tmp_dir
    filename = 'some_file.txt'

    dag = DAG()
    generic = GenericProduct(filename, client=client)
    PythonCallable(touch, generic, dag)
    dag.build()

    assert Path(filename).exists()
    assert generic.exists()
    assert generic.fetch_metadata() is not None
Example #26
0
def test_on_failure_exceptions_are_logged(executor, caplog):
    """A crashing on_failure hook is logged as an error.

    The task callable is ``fn_that_fails`` and the build is expected to
    raise DAGBuildError.
    """
    # NOTE(review): the `executor` fixture is requested but the DAG is
    # always created with executor='serial' - confirm this is intentional
    dag = DAG(executor='serial')
    task = PythonCallable(fn_that_fails, File('file.txt'), dag, name='t')
    task.on_failure = hook_crashing

    # capture ERROR records while the build is expected to fail
    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    expected_message = 'Exception when running on_failure for task "t"'
    assert expected_message in caplog.text
Example #27
0
def test_on_finish_hook_is_executed(tmp_directory):
    """The DAG-level on_finish hook is called exactly once per build."""
    # the hook keeps its call count as a function attribute
    hook.count = 0

    dag = DAG()
    dag.on_finish = hook
    PythonCallable(touch_root, product=File('file.txt'), dag=dag, name='t')

    dag.build()

    calls = hook.count
    assert calls == 1
def test_parallel_execution(tmp_directory):
    """Build a four-task DAG using the 'parallel' executor.

    Two root tasks feed task "b", which in turn feeds task "c".
    """
    dag = DAG('dag', executor='parallel')

    root_one = PythonCallable(touch_root, File('a1.txt'), dag, 'a1')
    root_two = PythonCallable(touch_root, File('a2.txt'), dag, 'a2')
    middle = PythonCallable(touch, File('b.txt'), dag, 'b')
    last = PythonCallable(touch, File('c.txt'), dag, 'c')

    (root_one + root_two) >> middle >> last

    dag.build()
Example #29
0
def test_input_always_executes(tmp_directory):
    """An Input task's product is outdated both before and after a build."""
    filename = 'some_file.txt'
    dag = DAG()

    Path(filename).touch()
    source_task = Input(File(filename), dag, name='some_file')

    assert source_task.product._is_outdated()

    dag.build()

    # building does not change the outdated status
    assert source_task.product._is_outdated()
Example #30
0
def test_on_failure(caplog):
    """The DAG-level on_failure hook runs once when the build fails."""
    # the hook keeps its call count as a function attribute
    hook.count = 0

    dag = DAG(name='dag')
    dag.on_failure = hook
    PythonCallable(failing_root, product=File('file.txt'), dag=dag, name='t')

    with pytest.raises(DAGBuildError):
        dag.build()

    calls = hook.count
    assert calls == 1