Example #1
def test_error_invalid_dag_level_client_dotted_path(tmp_sample_tasks,
                                                    add_current_to_sys_path,
                                                    no_sys_modules_cache, code,
                                                    expected_error):
    Path('dag_level_client_dotted_path.py').write_text(code)

    spec = DAGSpec({
        'meta': {
            'extract_product': False,
            'extract_upstream': True,
        },
        'tasks': [
            {
                'source': 'sample.sql',
                'product': ['name', 'table']
            },
        ],
        'clients': {
            'SQLScript': 'dag_level_client_dotted_path.get'
        }
    })

    with pytest.raises(TypeError) as excinfo:
        spec.to_dag()

    assert expected_error in str(excinfo.value)
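The code and expected_error arguments are parametrized fixtures not shown here. One plausible parametrization (both values below are hypothetical, not taken from the test suite): a module whose get attribute is not a callable returning a client, so spec.to_dag() fails while initializing the dag-level client.

# hypothetical parametrization for the test above
code = """
get = None  # not callable, so the dag-level client cannot be initialized
"""
expected_error = 'Expected a callable'  # hypothetical message fragment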
Example #2
def test_sets_clients(tmp_sample_tasks, add_current_to_sys_path,
                      no_sys_modules_cache, dotted_path_spec):
    Path('test_sets_clients.py').write_text("""
from unittest.mock import Mock

def get(a=None):
    return Mock()
""")

    spec = DAGSpec({
        'meta': {
            'extract_product': False,
            'extract_upstream': True,
        },
        'tasks': [
            {
                'source': 'sample.sql',
                'product': ['name', 'table']
            },
        ],
        'clients': {
            'SQLScript': dotted_path_spec
        }
    })

    dag = spec.to_dag()

    assert isinstance(dag.clients[SQLScript], Mock)
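The dotted_path_spec fixture is parametrized and not shown; given the get(a=None) signature above, it plausibly covers the two supported forms of a dotted path (an assumption about the fixture, though both forms are part of the spec API):

# plain string form
dotted_path_spec = 'test_sets_clients.get'
# dict form, which forwards keyword arguments to the function
dotted_path_spec = {'dotted_path': 'test_sets_clients.get', 'a': 1}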
Example #3
def test_lazy_load_product_level_client(tmp_directory, tmp_imports,
                                        my_testing_module, client_spec):
    Path('script.sql').write_text("""
CREATE TABLE {{product}} AS SELECT * FROM my_table
""")

    with sqlite3.connect('my.db') as conn:
        pd.DataFrame({'x': range(5)}).to_sql('my_table', conn)

    tasks = [
        {
            'source': 'script.sql',
            'product': [None, 'name', 'table'],
            'client': client_spec,
            'product_client': client_spec,
            'product_class': 'GenericSQLRelation',
        },
    ]

    data = {'tasks': tasks}

    spec = DAGSpec(data, lazy_import=True)

    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # since lazy_import=True, creating the dag should not import
    # my_testing_module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # should be imported now
    assert 'my_testing_module' in sys.modules
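The my_testing_module and client_spec fixtures are not shown. A minimal sketch of what they could provide (module name, contents, and the factory name are assumptions): a client factory referenced by dotted path, so the import is deferred until a task actually needs the client.

# my_testing_module.py (hypothetical contents)
from ploomber.clients import SQLAlchemyClient

def get_client():
    # with lazy_import=True this only runs at build time
    return SQLAlchemyClient('sqlite:///my.db')

# and the spec entry referencing it
client_spec = 'my_testing_module.get_client'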
Example #4
def test_lazy_load_dag_level_client(tmp_directory, tmp_imports,
                                    my_testing_module, client_spec):

    tasks = [
        {
            'source': 'my_testing_module.task',
            'product': 'output.csv'
        },
    ]

    data = {
        'tasks': tasks,
        'clients': {
            'File': client_spec
        },
    }

    spec = DAGSpec(data, lazy_import=True)

    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # since lazy_import=True, creating the dag should not import
    # my_testing_module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # should be imported now
    assert 'my_testing_module' in sys.modules
    assert Path('backup', 'output.csv').exists()
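Given the final assertion on backup/output.csv, client_spec plausibly resolves to a factory returning a LocalStorageClient that copies products into a backup/ directory after each task runs (a sketch; the actual fixture may differ):

from ploomber.clients import LocalStorageClient

def get_client():
    # File clients upload products once the task finishes; this one
    # copies them into the 'backup' directory
    return LocalStorageClient('backup')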
Example #5
def test_lazy_load(tmp_directory, tmp_imports):
    Path('my_module.py').write_text("""
def fn():
    pass
""")

    tasks = [
        {
            'source': 'my_module.fn',
            'product': 'report.ipynb',
            'on_finish': 'not_a_module.not_a_function',
            'on_render': 'not_a_module.not_a_function',
            'on_failure': 'not_a_module.not_a_function',
            'serializer': 'not_a_module.not_a_function',
            'unserializer': 'not_a_module.not_a_function',
            'product_client': 'not_a_module.not_a_function'
        },
    ]

    data = {
        'tasks': tasks,
        'serializer': 'not_a_module.not_a_function',
        'unserializer': 'not_a_module.not_a_function',
    }

    spec = DAGSpec(data, lazy_import=True)

    assert spec.to_dag()
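With lazy_import=True, dotted paths such as 'not_a_module.not_a_function' are stored unresolved, so to_dag() succeeds even though the modules do not exist. A sketch of the contrast in eager mode (the exact exception type is an assumption):

import pytest

with pytest.raises(Exception):  # assumed: eager resolution fails here
    DAGSpec(data).to_dag()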
Example #6
def test_python_callables_with_extract_upstream(tmp_directory):
    spec = DAGSpec({
        'tasks': [
            {
                'source': 'test_pkg.callables.root',
                'product': 'root.csv'
            },
            {
                'source': 'test_pkg.callables.a',
                'product': 'a.csv'
            },
            {
                'source': 'test_pkg.callables.b',
                'product': 'b.csv'
            },
        ],
        'meta': {
            'extract_product': False,
            'extract_upstream': True
        }
    })

    dag = spec.to_dag()

    dag.build()

    assert set(dag) == {'a', 'b', 'root'}
    assert not dag['root'].upstream
    assert set(dag['a'].upstream) == {'root'}
    assert set(dag['b'].upstream) == {'root'}
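With extract_upstream enabled for Python callables, dependencies are inferred from upstream[...] references in each function's source. test_pkg.callables plausibly looks like this (a sketch; the module's contents are assumed):

from pathlib import Path

def root(product):
    Path(str(product)).touch()

def a(product, upstream):
    up = upstream['root']  # this reference creates the root -> a edge
    Path(str(product)).touch()

def b(product, upstream):
    up = upstream['root']
    Path(str(product)).touch()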
Example #7
def test_add_upstream_modifies_signature(backup_spec_with_functions):
    dag = DAGSpec('pipeline.yaml').to_dag()
    dag.render()

    fn = dag['raw'].source.primitive
    params = dag['raw'].params.to_json_serializable()

    dev = CallableInteractiveDeveloper(fn, params)

    # add an upstream reference...
    nb = dev.to_nb()
    nb.cells[-1]['source'] += '\nupstream["some_task"]'
    dev.overwrite(nb)

    # source must be updated...
    source = Path('my_tasks', 'raw', 'functions.py').read_text()
    top_lines = '\n'.join(source.splitlines()[:5])

    expected = (
        'from pathlib import Path\n\n\n'
        'def function(product, upstream):\n    Path(str(product)).touch()')
    assert expected == top_lines

    # if we save again, nothing should change
    dev.overwrite(nb)

    source = Path('my_tasks', 'raw', 'functions.py').read_text()
    top_lines = '\n'.join(source.splitlines()[:5])

    assert expected == top_lines
Example #8
def test_import_tasks_from_loads_relative_to_pipeline_spec(tmp_nbs):
    some_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    Path('some_tasks.yaml').write_text(yaml.dump(some_tasks))
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_d = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_d['meta']['import_tasks_from'] = 'some_tasks.yaml'

    Path('pipeline.yaml').write_text(yaml.dump(spec_d))

    # move to another dir to make sure we can still load the spec
    Path('subdir').mkdir()
    os.chdir('subdir')

    spec = DAGSpec('../pipeline.yaml')
    dag = spec.to_dag()
    dag.render()

    assert spec['meta']['import_tasks_from'] == str(
        Path('..', 'some_tasks.yaml').resolve())
    assert str(Path('..', 'extra_task.py').resolve()) in [
        str(t['source']) for t in spec['tasks']
    ]
Example #9
def test_grid_and_upstream_wildcard_callables(spec_raw, tmp_directory,
                                              add_current_to_sys_path,
                                              no_sys_modules_cache):
    Path('sample_source_callables.py').write_text("""
from pathlib import Path

def unserializer(product):
    return Path(product).read_text()

def upstream(product, param):
    Path(product).touch()

def downstream(product, upstream):
    up = upstream['upstream-*']
    one = up['upstream-0']
    another = up['upstream-1']
    Path(product).touch()
""")

    spec = DAGSpec(spec_raw)

    dag = spec.to_dag().render()
    # to build faster
    dag.executor = Serial(build_in_subprocess=False)

    # make sure unserializing works correctly
    dag.build()

    assert set(dag) == {'upstream-1', 'upstream-0', 'downstream'}
    assert set(dag['downstream'].params['upstream']['upstream-*']) == {
        'upstream-1', 'upstream-0'
    }
Example #10
def test_import_tasks_from_with_non_empty_env(tmp_nbs):
    some_tasks = [{
        'source': 'extra_task.py',
        'name': 'extra_task',
        'product': 'extra.ipynb',
        'params': {
            'some_param': '{{some_param}}'
        }
    }]
    Path('some_tasks.yaml').write_text(yaml.dump(some_tasks))
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")
    spec_d = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_d['meta']['import_tasks_from'] = 'some_tasks.yaml'

    spec = DAGSpec(spec_d, env={'some_param': 'some_value'})

    dag = spec.to_dag()
    dag.render()
    assert dag['extra_task'].params['some_param'] == 'some_value'
    assert str(Path('extra_task.py').resolve()) in [
        str(t['source']) for t in spec['tasks']
    ]
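Passing env={'some_param': 'some_value'} resolves the {{some_param}} placeholder. When the spec is loaded from a pipeline.yaml path instead of a dict, the same value can come from a sibling env.yaml file (a hedged sketch of the file-based alternative):

Path('env.yaml').write_text('some_param: some_value\n')
spec = DAGSpec('pipeline.yaml')  # picks up the sibling env.yaml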
Example #11
def test_loads_serializer_and_unserializer(backup_online,
                                           add_current_to_sys_path):

    spec = DAGSpec({
        'tasks': [{
            'source': 'online_tasks.get',
            'product': 'output/get.parquet',
        }, {
            'source': 'online_tasks.square',
            'product': 'output/square.parquet',
        }],
        'meta': {
            'extract_product': False
        },
        'serializer':
        'online_io.serialize',
        'unserializer':
        'online_io.unserialize',
    })

    dag = spec.to_dag()

    from online_io import serialize, unserialize

    assert dag['get']._serializer is serialize
    assert dag['get']._unserializer is unserialize
    assert dag['square']._serializer is serialize
    assert dag['square']._unserializer is unserialize
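online_io is not shown; a minimal sketch of a serializer pair with the signatures the spec expects (the module contents are assumed, only the function names come from the spec above):

# online_io.py (hypothetical contents)
import pandas as pd

def serialize(obj, product):
    obj.to_parquet(str(product))

def unserialize(product):
    return pd.read_parquet(str(product))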
Example #12
def test_mixed_db_sql_spec(tmp_pipeline_sql, add_current_to_sys_path,
                           pg_client_and_schema, monkeypatch):
    _, schema = pg_client_and_schema

    with open('pipeline-multiple-dbs.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    # clients for this pipeline are initialized without custom create_engine
    # args, but we need to set the default schema; mock the call so it
    # includes that info
    monkeypatch.setattr(db, 'create_engine', create_engine_with_schema(schema))

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })
    # make sales data for pg
    loader = load_dotted_path(dag_spec['clients']['PostgresRelation'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    # make sales data for sqlite
    loader = load_dotted_path(dag_spec['clients']['SQLiteRelation'])
    client = loader()
    df.to_sql('sales', client.engine)
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom Upstream key missing error
    dag.build()
Example #13
def test_postgres_sql_spec(tmp_pipeline_sql, pg_client_and_schema,
                           add_current_to_sys_path, monkeypatch):
    _, schema = pg_client_and_schema

    with open('pipeline-postgres.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    # clients for this pipeline are initialized without custom create_engine
    # args, but we need to set the default schema; mock the call so it
    # includes that info
    monkeypatch.setattr(db, 'create_engine', create_engine_with_schema(schema))

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })
    loader = load_dotted_path(dag_spec['clients']['SQLScript'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom Upstream key missing error
    dag.build()

    assert not dag['load'].upstream
    assert list(dag['filter'].upstream.keys()) == ['load']
    assert list(dag['transform'].upstream.keys()) == ['filter']
Example #14
def test_import_tasks_from_paths_are_relative_to_the_yaml_spec(
        tmp_nbs, tmp_path):
    tasks_yaml = tmp_path / 'some_tasks.yaml'

    # source is a relative path
    some_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    tasks_yaml.write_text(yaml.dump(some_tasks))
    # write the source code in the same folder as some_tasks.yaml
    Path(tmp_path, 'extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_d = yaml.safe_load(Path('pipeline.yaml').read_text())

    # set an absolute path
    spec_d['meta']['import_tasks_from'] = str(tasks_yaml.resolve())
    Path('pipeline.yaml').write_text(yaml.dump(spec_d))

    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.render()

    # paths must be interpreted as relative to some_tasks.yaml, not to the
    # current working directory
    assert str(Path(tmp_path, 'extra_task.py').resolve()) in [
        str(t['source']) for t in spec['tasks']
    ]
Example #15
def test_remove_upstream_modifies_signature(backup_spec_with_functions):
    # by the time we reach this test, my_tasks.raw.functions has already been
    # loaded (previous test), so we force a reload to avoid mistakenly reading
    # the modified source code in the raw task
    from my_tasks.raw import functions
    importlib.reload(functions)

    dag = DAGSpec('pipeline.yaml').to_dag()
    dag.render()

    fn = dag['clean'].source.primitive
    params = dag['clean'].params.to_json_serializable()

    dev = CallableInteractiveDeveloper(fn, params)

    nb = dev.to_nb()
    # delete upstream reference
    del nb.cells[-2]
    dev.overwrite(nb)

    source = Path('my_tasks', 'clean', 'functions.py').read_text()
    top_lines = '\n'.join(source.splitlines()[:5])

    expected = ('# adding this to make sure relative imports work '
                'fine\nfrom .util import util_touch\n\n\n'
                'def function(product):')

    assert top_lines == expected
Example #16
def test_spec_with_functions(lazy_import, backup_spec_with_functions,
                             add_current_to_sys_path):
    """
    Check we can create a pipeline where the task is a function defined in
    a local file
    """
    spec = DAGSpec('pipeline.yaml', lazy_import=lazy_import)
    spec.to_dag().build()
Example #17
def test_pipeline_r(tmp_pipeline_r):
    Path('output').mkdir()

    with open('pipeline.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    dag = DAGSpec(dag_spec).to_dag()
    dag.build()
Example #18
def test_spec_invalid_glob_pattern(tmp_nbs_no_yaml):
    Path('some_invalid_script.sh').touch()

    with pytest.raises(ValueError) as excinfo:
        DAGSpec.from_files('*')

    assert ('Cannot instantiate DAGSpec from files with invalid extensions'
            in str(excinfo.value))
Example #19
def _process_file_dir_or_glob(parser, dagspec_arg=None):
    """
    Process a file entry point file, directory or glob-like pattern,
    the initialized dag and parsed args

    Parameters
    ----------
    parser : CustomParser
        CLI arg parser
    """
    # NOTE: we must use parser.parse_entry_point_value() instead of
    # parser.parse_args() because calling the latter won't allow us to add
    # more cli parameters, but we want that to expose params from env
    entry_point_value = dagspec_arg or parser.parse_entry_point_value()
    entry = EntryPoint(entry_point_value)

    if entry.type in {EntryPoint.Directory, EntryPoint.Pattern}:
        # pipelines initialized from directories or patterns cannot be
        # parametrized
        path_to_env = None
    # file
    else:
        path_to_env = default.path_to_env_from_spec(entry_point_value)

    if path_to_env:
        env_dict = EnvDict(path_to_env,
                           path_to_here=Path(entry_point_value).parent
                           if entry.type == EntryPoint.File else None)
        _add_cli_args_from_env_dict_keys(parser, env_dict)

    args = parser.parse_args()
    dagspec_arg = dagspec_arg or args.entry_point

    if hasattr(args, 'log'):
        if args.log is not None:
            logging.basicConfig(level=args.log.upper())

    entry_point = EntryPoint(dagspec_arg)

    # directory
    if entry_point.type == EntryPoint.Directory:
        dag = DAGSpec.from_directory(dagspec_arg).to_dag()
    # pattern
    elif entry_point.type == EntryPoint.Pattern:
        dag = DAGSpec.from_files(dagspec_arg).to_dag()
    # file
    else:
        if path_to_env:
            # and replace keys depending on passed cli args
            replaced = _env_keys_to_override(args, parser.static_args)
            env = env_dict._replace_flatten_keys(replaced)
            dag = DAGSpec(dagspec_arg, env=env).to_dag()
        else:
            dag = DAGSpec(dagspec_arg).to_dag()

    return dag, args
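A sketch of how the three branches above are typically reached from the CLI (values are illustrative):

# ploomber build --entry-point pipeline.yaml    -> EntryPoint.File
# ploomber build --entry-point some_directory   -> EntryPoint.Directory
# ploomber build --entry-point '*.py'           -> EntryPoint.Pattern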
Example #20
def test_notebook_spec(processor, tmp_nbs):
    Path('output').mkdir()

    with open('pipeline.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    dag_spec = processor(dag_spec)

    dag = DAGSpec(dag_spec).to_dag()
    dag.build()
Example #21
def test_find_searches_in_default_locations(monkeypatch, tmp_nbs, root_path):
    root_path = Path(root_path).resolve()
    Path('subdir').mkdir()

    mock = Mock(wraps=dagspec.default.entry_point_with_name)
    monkeypatch.setattr(dagspec.default, 'entry_point_with_name', mock)

    DAGSpec.find(starting_dir=root_path)

    mock.assert_called_once_with(root_path=root_path, name=None)
Example #22
def test_searches_in_default_locations(monkeypatch, tmp_nbs, root_path):
    root_path = Path(root_path).resolve()
    Path('subdir').mkdir()

    mock = Mock(wraps=dagspec.entry_point)
    monkeypatch.setattr(dagspec, 'entry_point', mock)

    DAGSpec._auto_load(starting_dir=root_path)

    mock.assert_called_once_with(root_path=root_path)
Example #23
def test_find(tmp_nbs, monkeypatch):
    mock = Mock(return_value=[None, None])
    monkeypatch.setattr(dagspec.DAGSpec, '_auto_load', mock)

    env = {'a': 1}
    DAGSpec.find(env=env)

    mock.assert_called_once_with(to_dag=False,
                                 starting_dir=None,
                                 env={'a': 1},
                                 lazy_import=False,
                                 reload=False)
Example #24
def load_entry_point(entry_point):
    type_ = find_entry_point_type(entry_point)

    if type_ == EntryPoint.Directory:
        spec = DAGSpec.from_directory(entry_point)
        path = Path(entry_point)

    elif type_ == EntryPoint.File:
        spec = DAGSpec(entry_point)
        path = Path(entry_point).parent
    else:
        raise NotImplementedError(
            f'loading entry point type {type_!r} is unsupported')

    return spec, spec.to_dag(), path
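A hedged usage sketch, grounded in the return statement above: for a file entry point, the returned path is the spec's parent directory.

spec, dag, path = load_entry_point('pipeline.yaml')
assert path == Path('pipeline.yaml').parent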
Example #25
def test_infer_dependencies_sql(tmp_pipeline_sql, add_current_to_sys_path):
    expected = {'filter': {'load'}, 'transform': {'filter'}, 'load': set()}

    with open('pipeline-postgres.yaml') as f:
        d = yaml.safe_load(f)

    d['meta']['extract_upstream'] = True

    for t in d['tasks']:
        t.pop('upstream', None)

    dag = DAGSpec(d).to_dag()

    deps = {name: set(task.upstream) for name, task in dag.items()}
    assert deps == expected
Example #26
def test_meta_defaults(raw):
    spec = DAGSpec(raw)
    meta = spec['meta']
    assert meta['extract_upstream']
    assert not meta['extract_product']
    assert not meta['product_relative_to_source']
    assert not meta['jupyter_hot_reload']
Example #27
def add():
    """Add scaffold templates for tasks whose source does not exist
    """
    # setting lazy_import to true causes sources to be returned as paths,
    # instead of placeholders
    spec, path_to_spec = DAGSpec._auto_load(to_dag=False, lazy_import=True)
    loader = ScaffoldLoader('ploomber_add')

    # TODO: when the dag has a source loader, the argument passed to
    # ploomber_add should take that into account to place the new file
    # in the appropriate location (instead of doing it relative to
    # pipeline.yaml)

    # TODO: raise an error if the location is inside the site-packages folder

    # NOTE: lazy loading from a source loader will give errors because
    # initializing a source with a path only loses the information from the
    # jinja environment needed to make macros work. I have to test this. The
    # best solution is to add a lazy_load param to Placeholder, so it can be
    # initialized with a path for a file that does not exist

    if path_to_spec:
        print('Found spec at {}'.format(path_to_spec))

        # make sure current working dir is in the path, otherwise we might not
        # be able to import the PythonCallable functions, which we need to do
        # to locate the modules
        with add_to_sys_path(path_to_spec, chdir=False):
            for task in spec['tasks']:
                loader.create(source=task['source'],
                              params=spec['meta'],
                              class_=task['class'])
    else:
        print('Error: No pipeline.yaml spec found...')
Example #28
def test_spec_from_directory(chdir, dir_, tmp_nbs_no_yaml):
    os.chdir(chdir)

    Path('output').mkdir()

    dag = DAGSpec.from_directory(dir_).to_dag()
    assert list(dag) == ['load', 'clean', 'plot']
Example #29
def test_spec_glob_pattern(tmp_nbs_no_yaml):
    # directory should be ignored
    Path('output').mkdir()
    # if passed a string, it's interpreted as a glob-like pattern
    dag = DAGSpec.from_files('load.py').to_dag()

    assert list(dag) == ['load']
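The same constructor accepts wider glob patterns; as Example #18 shows, matching files with unsupported extensions raises a ValueError (a usage note):

dag = DAGSpec.from_files('*.py').to_dag()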
Example #30
def test_error_invalid_yaml_displays_error_line(tmp_directory):
    Path('pipeline.yaml').write_text('key: [')

    with pytest.raises(yaml.parser.ParserError) as excinfo:
        DAGSpec('pipeline.yaml')

    assert 'key: [' in str(excinfo.value)