Ejemplo n.º 1
0
def test_sucessful_execution(executor, tmp_directory):
    """Build a five-task DAG twice and check the status of every task.

    The first build must create every product and mark every task as
    executed; the second build must mark every task as skipped.
    """
    dag = DAG(executor=executor)
    root = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    left = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    right = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    sink = PythonCallable(touch,
                          File('yet_another_file.txt'),
                          dag,
                          name='t4')
    # a fifth task with no dependencies
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    # diamond: t1 feeds t2 and t3, which both feed t4
    root >> left
    root >> right
    (left + right) >> sink

    dag.build()

    expected_files = (
        'ok.txt',
        'a_file.txt',
        'another_file.txt',
        'yet_another_file.txt',
        'file.txt',
    )
    for filename in expected_files:
        assert Path(filename).exists()

    assert {task.exec_status
            for task in dag.values()} == {TaskStatus.Executed}
    assert {task.product._is_outdated() for task in dag.values()} == {False}

    # everything is up-to-date now, so a second build should run nothing
    dag.build()

    assert {task.exec_status
            for task in dag.values()} == {TaskStatus.Skipped}
Ejemplo n.º 2
0
def test_copy(copy):
    """Check whether InMemoryDAG hands downstream tasks the same object.

    With ``copy`` disabled, the object received by the downstream task in its
    ``upstream`` argument must be the very object returned for ``root``; with
    ``copy`` enabled it must be a different object.
    """
    def _assign_upstream(upstream):
        # stash what we received on the function itself so the test body can
        # inspect it after the build
        _assign_upstream.obj = upstream
        return 42

    source_dag = DAG()

    root_params = {'input_data': {'x': [0, 0, 0]}}
    root_task = PythonCallable(_root,
                               File('root.parquet'),
                               source_dag,
                               name='root',
                               serializer=serializer,
                               params=root_params)

    downstream_task = PythonCallable(_assign_upstream,
                                     File('task.parquet'),
                                     source_dag,
                                     name='task',
                                     unserializer=unserializer,
                                     serializer=serializer)

    root_task >> downstream_task

    in_memory = InMemoryDAG(source_dag)

    result = in_memory.build({'root': {'x': [1]}}, copy=copy)

    # identity is expected exactly when copying is turned off
    received_same_object = _assign_upstream.obj['root'] is result['root']
    assert received_same_object is (not copy)
Ejemplo n.º 3
0
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """Check that an independent task still runs when another chain fails.

    The DAG has a three-task chain whose root is ``failing_root`` plus one
    unrelated task (``t_ok``). After building, the products of the first two
    tasks in the chain must be missing while ``t_ok`` must exist.
    """
    dag = DAG(executor=executor)

    failing_head = PythonCallable(failing_root,
                                  File('t_fail'),
                                  dag,
                                  name='t_fail')
    failing_middle = PythonCallable(failing,
                                    File('t_fail_downstream'),
                                    dag,
                                    name='t_fail_downstream')
    chain_tail = PythonCallable(touch,
                                File('t_touch_aborted'),
                                dag,
                                name='t_touch_aborted')

    failing_head >> failing_middle >> chain_tail

    # unrelated task, it does not depend on the chain above
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    try:
        dag.build(force=True)
    except DAGBuildError:
        # a build error is tolerated: this test only looks at which products
        # ended up on disk
        pass

    assert not Path('t_fail').exists()
    assert not Path('t_fail_downstream').exists()
    assert Path('t_ok').exists()
Ejemplo n.º 4
0
    def make():
        """Return a DAG with five SQLDump tasks sharing one mocked client.

        Tasks t2 -> t3 -> t4 form a chain; t1 and t5 have no dependencies.
        The product of t2 uses the placeholder ``{{unknown}}``.
        """
        mock_client = Mock()
        dag = DAG()

        # (sql, product, name) for every task, in creation order
        specs = [
            ('SELECT * FROM my_table', 'ok.txt', 't1'),
            ('SELECT * FROM my_table', '{{unknown}}', 't2'),
            ('SELECT * FROM another', 'another_file.txt', 't3'),
            ('SELECT * FROM something', 'yet_another', 't4'),
            ('SELECT * FROM my_table_2', 'ok_2', 't5'),
        ]

        tasks = {}
        for sql, product, task_name in specs:
            tasks[task_name] = SQLDump(sql,
                                       File(product),
                                       dag,
                                       name=task_name,
                                       client=mock_client)

        tasks['t2'] >> tasks['t3'] >> tasks['t4']
        return dag
Ejemplo n.º 5
0
def test_dag_functions_do_not_fetch_metadata(function_name, executor,
                                             tmp_directory, monkeypatch_plot):
    """
    Calling the DAG method named by ``function_name`` must not call
    ``fetch_metadata`` on the product: the product does not exist, so its
    status can be determined without looking metadata up.
    """
    product = File('1.txt')
    dag = DAG(executor=executor)
    PythonCallable(touch_root, product, dag, name=1)

    spy = Mock(wraps=product.fetch_metadata)
    # make the spy usable with pickle
    spy.__reduce__ = lambda self: (MagicMock, ())
    product.fetch_metadata = spy

    dag_method = getattr(dag, function_name)
    dag_method()

    # the spy must remain untouched
    product.fetch_metadata.assert_not_called()

    if function_name != 'build':
        return

    # when building, metadata should still be available afterwards
    assert product.metadata._data['stored_source_code']
    assert product.metadata._data['timestamp']
Ejemplo n.º 6
0
def add_features(dag):
    """
    Add the feature engineering tasks to ``dag`` and return it.

    ``dag`` must already contain a task named "get"; the new tasks
    (``a_feature``, ``another`` and ``join``) are wired downstream of it.
    """
    out_dir = Path('output')
    # every feature task uses the same (un)serializer pair
    io_kwargs = dict(serializer=serializer, unserializer=unserializer)

    upstream = dag['get']

    # create the tasks
    feature_a = PythonCallable(a_feature, File(out_dir / 'a_feature.csv'),
                               dag, **io_kwargs)
    feature_b = PythonCallable(another, File(out_dir / 'another.csv'), dag,
                               **io_kwargs)
    joined = PythonCallable(join, File(out_dir / 'join.csv'), dag,
                            **io_kwargs)

    # wire them: both features read from "get", join reads from all three
    upstream >> feature_a
    upstream >> feature_b
    (upstream + feature_a + feature_b) >> joined

    return dag
Ejemplo n.º 7
0
def make_training():
    """Build and return the training DAG.

    The DAG contains a "get" task, the tasks added by ``add_features`` and a
    "fit" task that runs after the "join" task.
    """
    # build_in_subprocess=False: Python does not like multiprocessing with
    # functions defined in the main module (it works when they live in a
    # different module)
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    out_dir = Path('output')

    # "get" task: returns the training data
    PythonCallable(get,
                   File(out_dir / 'get.csv'),
                   dag,
                   serializer=serializer,
                   unserializer=unserializer)

    # feature engineering tasks
    add_features(dag)

    # model training task, it must run once features have been joined
    fit_task = PythonCallable(fit, File(out_dir / 'model.pickle'), dag)
    dag['join'] >> fit_task

    return dag
Ejemplo n.º 8
0
def test_runs_on_finish(executor, tmp_directory):
    """Check that each on_finish hook runs once and on_failure never runs."""
    # reset the call counters kept on the hook functions
    for each_hook in (hook, hook_2, hook_3, hook_4):
        each_hook.count = 0

    dag = DAG(executor=executor)

    first = PythonCallable(fn, File('file1.txt'), dag, 't')
    first.on_finish = hook
    first.on_failure = hook_4

    second = PythonCallable(touch_w_upstream, File('file2'), dag, 't2')
    second.on_finish = hook_2

    third = PythonCallable(fn, File('file3'), dag, 't3')
    third.on_finish = hook_3

    first >> second

    dag.build()

    assert hook.count == 1
    assert hook_2.count == 1
    assert hook_3.count == 1
    # the failure hook must not have been triggered
    assert hook_4.count == 0
Ejemplo n.º 9
0
def test_download_triggers_client_download(tmp_directory):
    """Check File.download() calls client.download for file and metadata."""
    mock_client = Mock()
    file_product = File('file.txt', client=mock_client)

    file_product.download()

    expected_calls = [call('file.txt'), call('.file.txt.metadata')]
    mock_client.download.assert_has_calls(expected_calls)
Ejemplo n.º 10
0
def test_upload_after_task_build(tmp_directory):
    """Check that building a task calls its product's upload exactly once."""
    product = File('file.txt')
    # wrap the real method so it still runs but calls are recorded
    product.upload = Mock(wraps=product.upload)

    dag = DAG()
    PythonCallable(_touch, product, dag=dag).build()

    product.upload.assert_called_once()
Ejemplo n.º 11
0
def test_params_are_copied_upon_initialization():
    """Two tasks created from one params dict must not share ``params``."""
    dag = DAG()
    shared_params = {'a': 1}

    first = PythonCallable(touch,
                           File('file'),
                           dag,
                           name='t1',
                           params=shared_params)
    second = PythonCallable(touch,
                            File('file'),
                            dag,
                            name='t2',
                            params=shared_params)

    assert first.params is not second.params
Ejemplo n.º 12
0
def test_download(tmp_directory):
    """Check File.download() requests the metadata file, then the file."""
    mock_client = Mock()
    file_product = File('file.txt', client=mock_client)

    file_product.download()

    # each entry is an (args, ) tuple, which compares equal to a mock call
    expected = [
        (('.file.txt.metadata', ), ),
        (('file.txt', ), ),
    ]
    assert mock_client.download.call_args_list == expected
Ejemplo n.º 13
0
def test_building_a_single_task_when_rendered_upstream():
    """Render a two-task DAG, then build only the downstream task.

    There are no assertions: the test passes if nothing raises.
    """
    dag = DAG()
    upstream_task = PythonCallable(touch, File('1.txt'), dag, name=1)
    downstream_task = PythonCallable(touch_w_upstream,
                                     File('2.txt'),
                                     dag,
                                     name=2)

    upstream_task >> downstream_task

    dag.render()
    downstream_task.build()
Ejemplo n.º 14
0
def _make_dag_with_upstream():
    """Return a three-task DAG with a LocalStorageClient set for File.

    Task 'root' is upstream of task 3; task 2 has no dependencies.
    """
    # same-process execution, to ensure the mock object is called
    serial = Serial(build_in_subprocess=False)
    dag = DAG(executor=serial)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root = PythonCallable(_touch, File('1.txt'), dag=dag, name='root')
    PythonCallable(_touch, File('2.txt'), dag=dag, name=2)
    with_upstream = PythonCallable(_touch_upstream,
                                   File('3.txt'),
                                   dag=dag,
                                   name=3)

    root >> with_upstream
    return dag
Ejemplo n.º 15
0
def make_dag_with_client():
    """Return a two-task DAG (root >> task) with a File storage client."""
    serial = Serial(build_in_subprocess=False)
    dag = DAG(executor=serial)

    storage = LocalStorageClient('remote', path_to_project_root='.')
    dag.clients[File] = storage

    upstream = PythonCallable(touch_root,
                              File('out/root'),
                              dag=dag,
                              name='root')
    downstream = PythonCallable(touch,
                                File('out/file'),
                                dag=dag,
                                name='task')
    upstream >> downstream

    return dag
Ejemplo n.º 16
0
def test_unserialize_multi(tmp_directory):
    """Check unserializer_multi loads a .txt as text and a .json as a dict."""
    expected_text = 'something'
    expected_mapping = {'a': 1, 'b': 2}

    Path('a.txt').write_text(expected_text)
    Path('b.json').write_text(json.dumps(expected_mapping))

    loaded_text = unserializer_multi(File('a.txt'))
    loaded_mapping = unserializer_multi(File('b.json'))

    assert loaded_text == expected_text
    assert loaded_mapping == expected_mapping
Ejemplo n.º 17
0
def test_upload(tmp_directory):
    """Check File.upload() uploads the metadata file, then the file."""
    # both the product and its metadata file exist on disk
    for filename in ('file.txt', '.file.txt.metadata'):
        Path(filename).touch()

    mock_client = Mock()
    file_product = File('file.txt', client=mock_client)

    file_product.upload()

    # each entry is an (args, ) tuple, which compares equal to a mock call
    expected = [
        (('.file.txt.metadata', ), ),
        (('file.txt', ), ),
    ]
    assert mock_client.upload.call_args_list == expected
Ejemplo n.º 18
0
def test_do_not_upload_if_none_or_one(to_touch, tmp_directory):
    """Check upload() never calls the client for the files in ``to_touch``."""
    # create whichever files this parametrization asks for
    for filename in to_touch:
        Path(filename).touch()

    mock_client = Mock()
    file_product = File('file.txt', client=mock_client)

    file_product.upload()

    mock_client.upload.assert_not_called()
Ejemplo n.º 19
0
def test_unserialize_with_txt_and_json_default(tmp_directory):
    """Check the '.txt' and '.json' defaults of the unserializer decorator.

    The decorated function only raises NotImplementedError, so the values
    asserted below cannot come from its body.
    """
    @unserialize.unserializer(defaults=['.txt', '.json'])
    def unserializer(product):
        raise NotImplementedError

    expected_mapping = {'a': 1, 'b': 2}
    Path('a.txt').write_text('something')
    Path('a.json').write_text(json.dumps(expected_mapping))

    assert unserializer(File('a.txt')) == 'something'
    assert unserializer(File('a.json')) == expected_mapping
Ejemplo n.º 20
0
def test_serialize_with_txt_and_json_default(tmp_directory):
    """Check the '.txt' and '.json' defaults of the serializer decorator.

    The decorated function only raises NotImplementedError, so the files
    checked below cannot have been written by its body.
    """
    @serialize.serializer(defaults=['.txt', '.json'])
    def serializer(obj, product):
        raise NotImplementedError

    mapping = {'a': 1, 'b': 2}
    serializer('something', File('a.txt'))
    serializer(mapping, File('b.json'))

    written_text = Path('a.txt').read_text()
    written_json = Path('b.json').read_text()

    assert written_text == 'something'
    assert json.loads(written_json) == mapping
Ejemplo n.º 21
0
def test_do_not_download_if_file_or_metadata_exists(to_touch, tmp_directory):
    """Check download() never calls the client if ``to_touch`` files exist."""
    # create whichever files this parametrization asks for
    for filename in to_touch:
        Path(filename).touch()

    mock_client = Mock()
    file_product = File('file.txt', client=mock_client)

    file_product.download()

    mock_client.download.assert_not_called()
Ejemplo n.º 22
0
def test_task_factory_override_params():
    """Check a product passed at call time overrides the decorator's one."""
    dag = DAG()

    @task_factory(product=File('file.txt'))
    def touch(product):
        Path(str(product)).touch()

    # override the product declared in the decorator
    touch(dag=dag, product=File('another.txt'))

    task_names = list(dag)
    assert task_names == ['touch']
    assert str(dag['touch'].product) == 'another.txt'
Ejemplo n.º 23
0
def test_parallel_execution(tmp_directory):
    """Build a four-task DAG using ``executor='parallel'``.

    There are no assertions: the test passes if the build does not raise.
    """
    dag = DAG('dag', executor='parallel')

    root_one = PythonCallable(touch_root, File('a1.txt'), dag, 'a1')
    root_two = PythonCallable(touch_root, File('a2.txt'), dag, 'a2')
    middle = PythonCallable(touch, File('b.txt'), dag, 'b')
    last = PythonCallable(touch, File('c.txt'), dag, 'c')

    # both roots feed b, which feeds c
    (root_one + root_two) >> middle >> last

    dag.build()
Ejemplo n.º 24
0
def test_build_partially_with_wildcard(tmp_directory):
    """Check build_partially('a-*') builds a-1 and a-2 but not b."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # three independent tasks; the product of each one is '<name>.txt'
    for task_name in ('a-1', 'a-2', 'b'):
        PythonCallable(touch_root,
                       File(task_name + '.txt'),
                       dag,
                       name=task_name)

    dag.build_partially('a-*')

    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    assert not Path('b.txt').exists()
Ejemplo n.º 25
0
def test_delete_metadata(tmp_directory):
    """Check MetaProduct.metadata.delete() removes both metadata files."""
    metadata_a = Path('.a.txt.metadata')
    metadata_b = Path('.b.txt.metadata')
    metadata_a.touch()
    metadata_b.touch()

    product_a = File('a.txt')
    product_b = File('b.txt')
    meta = MetaProduct({'a': product_a, 'b': product_b})
    meta.metadata.delete()

    assert not metadata_a.exists()
    assert not metadata_b.exists()
Ejemplo n.º 26
0
def test_can_create_task_with_many_products():
    """Render a ShellScript with two File products and check its status."""
    dag = DAG()
    products = [File('a1.txt'), File('a2.txt')]
    task = ShellScript('echo {{product}}', products, dag, 'ta')
    task.render()

    rendered = task.product
    assert not rendered.exists()
    assert rendered._outdated()
    assert rendered._outdated_code_dependency()
    assert not rendered._outdated_data_dependencies()
Ejemplo n.º 27
0
def test_load_from_metaproduct(tmp_directory):
    """Check task.load(key='a') returns something for a dict product."""
    # the file behind key 'a' already exists with some csv content
    Path('a.csv').write_text('a,b\n1,2')

    dag = DAG()
    products = {'a': File('a.csv'), 'b': File('b')}
    task = PythonCallable(touch_meta, products, dag)

    loaded = task.load(key='a')
    assert loaded is not None
Ejemplo n.º 28
0
def test_product_upload_uploads_metadata_and_product(tmp_directory):
    """Check File.upload() calls client.upload with metadata and file paths."""
    # both the product and its metadata file exist on disk
    for filename in ('file.txt', '.file.txt.metadata'):
        Path(filename).touch()

    mock_client = Mock()
    file_product = File('file.txt', client=mock_client)

    file_product.upload()

    expected_calls = [
        call(Path('.file.txt.metadata')),
        call(Path('file.txt')),
    ]
    mock_client.upload.assert_has_calls(expected_calls)
Ejemplo n.º 29
0
def test_creates_parent_dirs_meta_product(tmp_directory):
    """Build a task whose dict product points into nested directories.

    Both products live under directories that this test never creates
    ('some/nested' and 'some/another'). There are no explicit assertions:
    the test passes if the build does not raise.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))

    PythonCallable(touch_meta, {
        'one': File('some/nested/product.txt'),
        'another': File('some/another/product.txt')
    },
                   dag=dag)

    # NOTE: the test used to end with "return dag". pytest emits a warning
    # for test functions that return a non-None value, so nothing is
    # returned anymore.
    dag.build()
Ejemplo n.º 30
0
def test_warnings_are_shown(tmp_directory):
    """Check that building the DAG emits one warning holding both messages.

    Two tasks are chained; after the build exactly one warning must have been
    recorded and its message must contain both 'This is a warning' and
    'This is another warning'.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    t2 = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    t1 >> t2

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8. Passing the base Warning class still records every warning
    # raised inside the block, which is all this test needs.
    with pytest.warns(Warning) as record:
        dag.build()

    assert len(record) == 1
    message = str(record[0].message)
    assert 'This is a warning' in message
    assert 'This is another warning' in message