Example #1
def test_dag_on_render_with_params(tmp_directory, tmp_imports,
                                   write_dag_hooks_spec):
    dag = DAGSpec('pipeline.yaml').to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    dag.render()

    assert Path('hook').read_text() == 'on render'
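The test above relies on the write_dag_hooks_spec fixture, whose contents are not shown here. As a rough sketch of the kind of hook being exercised (the hook body is illustrative, not the fixture's actual code), a DAG-level on_render callback can be attached directly to the dag object, which is what the assertion on the 'hook' file implies:

from pathlib import Path

def my_on_render_hook():
    # write a marker file so the test can assert the hook ran
    Path('hook').write_text('on render')

# attach before calling dag.render(); the dag.on_render attribute also
# appears (set to None) in Example #20 below
# dag.on_render = my_on_render_hook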
Example #2
    def get_partial():
        with open('pipeline-features.yaml') as f:
            tasks = yaml.safe_load(f)

        meta = {'extract_product': False, 'extract_upstream': False}
        spec = DAGSpec({'tasks': tasks, 'meta': meta})

        return spec.to_dag()
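For reference, the dict passed to DAGSpec above mirrors what pipeline-features.yaml is expected to contain. A minimal inline equivalent might look like the sketch below (the task source and product names are hypothetical placeholders, not the project's real files):

from ploomber.spec import DAGSpec

# the source/product values are placeholders; features.py must exist for
# to_dag() to succeed
tasks = [
    {'source': 'features.py', 'product': 'output/features.parquet'},
]
meta = {'extract_product': False, 'extract_upstream': False}

dag = DAGSpec({'tasks': tasks, 'meta': meta}).to_dag()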
Example #3
def test_save_injected_cell_in_paired_notebooks(tmp_nbs, prefix):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.pair(prefix)

    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.save_injected_cell()

    assert get_injected_cell(jupytext.read(Path(prefix, 'load.ipynb')))
    assert get_injected_cell(jupytext.read(Path('load.py')))
Example #4
def test_dag_on_failure_with_params(tmp_directory, tmp_imports,
                                    write_dag_hooks_spec):
    Path('my_module.py').write_text("""
def touch(product):
    raise Exception
""")

    dag = DAGSpec('pipeline.yaml').to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    with pytest.raises(DAGBuildError):
        dag.build()

    assert Path('hook').read_text() == 'on failure'
Example #5
def test_remove_injected_cell(tmp_nbs):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.save_injected_cell()
    expected = '# + tags=["injected-parameters"]'

    assert expected in Path('load.py').read_text()

    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.remove_injected_cell()

    nb = jupytext.read('load.py')

    assert expected not in Path('load.py').read_text()
    assert nb.metadata.ploomber == {}
Example #6
def test_sync(tmp_nbs):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.pair(base_path='nbs')

    nb = jupytext.reads(Path('load.py').read_text(), fmt='py:light')
    nb.cells.append(nbformat.v4.new_code_cell(source='x = 42'))
    jupytext.write(nb, 'load.py', fmt='py:light')

    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.sync()

    nb = jupytext.reads(Path('nbs', 'load.ipynb').read_text(), fmt='ipynb')

    assert nb.cells[-1]['source'] == 'x = 42'
Example #7
def test_no_training_serve_skew():
    """
    Test for training-serving skew (feature engineering in training and serving
    should be the same)
    """
    dag = DAGSpec.find().to_dag()

    # load raw data
    get = pd.read_parquet(dag['get'].product)
    del get['target']

    # load feature vectors
    join = pd.read_parquet(dag['join'].product)
    del join['target']

    pipeline = InferencePipeline()

    # make predictions using the online pipeline (if the training set is
    # large, you can take a random sample)
    online = [
        pipeline.predict(get=get.loc[[idx]])['join'] for idx in join.index
    ]

    # cast to a data frame
    online_df = pd.concat(online)
    online_df.index = join.index

    # compare data frames
    assert online_df.equals(join)
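The comment above suggests taking a random sample when the training set is large; a possible variant of the same check (the sample size is arbitrary) restricts the comparison to a subset of rows:

# compare only a random subset when predicting every row is too slow
sample_idx = join.sample(n=100, random_state=0).index
online = [
    pipeline.predict(get=get.loc[[idx]])['join'] for idx in sample_idx
]
online_df = pd.concat(online)
online_df.index = sample_idx

assert online_df.equals(join.loc[sample_idx])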
Example #8
def _default_spec_load(starting_dir=None, lazy_import=False, reload=False):
    """
    NOTE: this is a private API. Use DAGSpec.find() instead

    Looks for a pipeline.yaml, generates a DAGSpec and returns a DAG.
    Currently, this is only used by the PloomberContentsManager; it is
    not intended to be a public API since initializing specs from paths
    where we have to recursively look for a pipeline.yaml has some
    considerations regarding relative paths that make this confusing.
    Inside the contents manager, all of those things are handled for that
    use case.

    The pipeline.yaml parent folder is temporarily added to sys.path when
    calling DAGSpec.to_dag() to make sure imports work as expected

    Returns DAG and the directory where the pipeline.yaml file is located.
    """
    root_path = starting_dir or os.getcwd()
    path_to_entry_point = default.entry_point(root_path=root_path)

    try:
        spec = DAGSpec(path_to_entry_point,
                       env=None,
                       lazy_import=lazy_import,
                       reload=reload)

        path_to_spec = Path(path_to_entry_point)
        return spec, path_to_spec.parent, path_to_spec

    except Exception as e:
        exc = DAGSpecInitializationError('Error initializing DAG from '
                                         f'{path_to_entry_point!s}')
        raise exc from e
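As the docstring notes, the public way to get the same behavior is DAGSpec.find(). A minimal sketch, assuming a pipeline.yaml is reachable from the current working directory:

from ploomber.spec import DAGSpec

# find() recursively looks for a pipeline.yaml starting from the current
# directory, so no explicit path is needed
spec = DAGSpec.find()
dag = spec.to_dag()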
Example #9
def lazily_load_entry_point(starting_dir=None, reload=False):
    """
    Lazily loads the entry point by recursively looking in the starting_dir
    directory and its parent directories.
    """

    starting_dir = starting_dir or '.'

    entry_point = os.environ.get('ENTRY_POINT')

    type_ = try_to_find_entry_point_type(entry_point)

    if type_ == EntryPoint.Directory:
        spec = DAGSpec.from_directory(entry_point)
        path = Path(entry_point)
    elif type_ == EntryPoint.DottedPath:
        entry = load_callable_dotted_path(str(entry_point), raise_=True)
        dag = entry()
        spec = dict(meta=dict(jupyter_hot_reload=False,
                              jupyter_functions_as_notebooks=False))
        # potential issue: dag defines sources as relative paths
        path = Path().resolve()
        return spec, dag, path
    else:
        spec, path = _default_spec_load(starting_dir=starting_dir,
                                        reload=reload,
                                        lazy_import=True)

    # chain exception to provide more context
    dag = spec.to_dag()

    return spec, dag, path
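The branch taken above is driven by the ENTRY_POINT environment variable read at the top of the function. A hedged usage sketch (the dotted path is hypothetical):

import os

# point the loader at a DAG factory function instead of a pipeline.yaml
os.environ['ENTRY_POINT'] = 'my_project.pipeline.make_dag'

spec, dag, path = lazily_load_entry_point()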
Example #10
def scaffold(conda, package, entry_point, empty):
    """Create new projects (if no pipeline.yaml exists) or add missings tasks
    """
    template = '-e/--entry-point is not compatible with the {flag} flag'

    if entry_point and conda:
        raise click.ClickException(template.format(flag='--conda'))

    if entry_point and package:
        raise click.ClickException(template.format(flag='--package'))

    if entry_point and empty:
        raise click.ClickException(template.format(flag='--empty'))

    # try to load a dag by looking in default places
    if not entry_point:
        loaded = _scaffold.load_dag()
    else:
        try:
            loaded = DAGSpec(entry_point, lazy_import=True), Path(entry_point)
        except Exception as e:
            raise click.ClickException(e) from e

    if loaded:
        # add scaffold tasks
        spec, path_to_spec = loaded
        _scaffold.add(spec, path_to_spec)
    else:
        scaffold_project.cli(project_path=None,
                             conda=conda,
                             package=package,
                             empty=empty)
Example #11
def test_pair(tmp_nbs):
    dag = DAGSpec('pipeline.yaml').to_dag().render()

    dag['load'].source.pair(base_path='nbs')
    nb = jupytext.reads(Path('load.py').read_text(), fmt='py:light')

    assert Path('nbs', 'load.ipynb').is_file()
    assert nb.metadata.jupytext.formats == 'nbs//ipynb,py:light'
Example #12
def test_format(tmp_nbs):
    dag = DAGSpec('pipeline.yaml').to_dag().render()

    assert '# + tags=["parameters"]' in Path('load.py').read_text()

    dag['load'].source.format(fmt='py:percent')

    assert '# %% tags=["parameters"]' in Path('load.py').read_text()
Example #13
def test_does_not_delete_injected_cell_on_save_if_manually_injected(tmp_nbs):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.save_injected_cell()

    cm = PloomberContentsManager()
    model = cm.get('load.py')
    cm.save(model, path='/load.py')

    nb = jupytext.read('load.py')
    assert get_injected_cell(nb)
Example #14
def test_dag_manager_root_folder(backup_simple):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    m = JupyterDAGManager(dag)
    # jupyter represents the root folder with the empty string '', make sure
    # it correctly returns the appropriate models
    content = m.get_by_parent('')

    assert len(content) == 1
    assert content[0]['name'] == 'tasks_simple.py (functions)'
    assert content[0]['type'] == 'directory'
Example #15
def test_dag_manager(backup_spec_with_functions):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    m = JupyterDAGManager(dag)

    assert set(m) == {
        'my_tasks/raw/functions.py (functions)',
        'my_tasks/raw/functions.py (functions)/raw',
        'my_tasks/clean/functions.py (functions)',
        'my_tasks/clean/functions.py (functions)/clean'
    }
Example #16
def test_export(mock_docker_calls, backup_packaged_project, monkeypatch, mode,
                args):
    load_tasks_mock = Mock(wraps=commons.load_tasks)
    monkeypatch.setattr(commons, 'load_tasks', load_tasks_mock)

    exporter = ArgoWorkflowsExporter(path_to_config='soopervisor.yaml',
                                     env_name='serve')
    exporter.add()
    exporter.export(mode=mode, until=None)

    yaml_str = Path('serve/argo.yaml').read_text()
    spec = yaml.safe_load(yaml_str)
    dag = DAGSpec.find().to_dag()

    load_tasks_mock.assert_called_once_with(mode=mode)

    # make sure the "source" key is represented in literal style
    # (https://yaml-multiline.info/) to make the generated script more readable
    assert 'source: |' in yaml_str

    run_task_template = spec['spec']['templates'][0]
    tasks = spec['spec']['templates'][1]['dag']['tasks']

    assert run_task_template['script'][
        'source'] == 'ploomber task {{inputs.parameters.task_name}}' + args

    assert spec['spec']['volumes'] == []
    assert run_task_template['script']['volumeMounts'] == []
    assert Workflow.from_dict(copy(spec))
    assert set(spec) == {'apiVersion', 'kind', 'metadata', 'spec'}
    assert set(spec['metadata']) == {'generateName'}
    assert set(spec['spec']) == {'entrypoint', 'templates', 'volumes'}

    # should not change workingdir
    assert run_task_template['script']['workingDir'] is None

    assert run_task_template['script'][
        'image'] == 'your-repository/name:0.1dev'
    assert run_task_template['name'] == 'run-task'
    assert spec['metadata']['generateName'] == 'my-project-'
    assert all([
        set(dag[t['name']].upstream) == set(t['dependencies']) for t in tasks
    ])

    # tasks call the right template
    assert set(t['template'] for t in tasks) == {'run-task'}

    # check each task uses the right parameters
    assert all([
        t['arguments']['parameters'][0] == {
            'name': 'task_name',
            'value': t['name']
        } for t in tasks
    ])
Example #17
    def init_dag_from_partial(cls, partial):
        """Initialize partial returned by get_partial()
        """
        if isinstance(partial, (str, Path)):
            with open(partial) as f:
                tasks = yaml.safe_load(f)

            # cannot extract upstream because this is an incomplete DAG
            meta = {'extract_product': False, 'extract_upstream': False}
            spec = DAGSpec(
                {
                    'tasks': tasks,
                    'meta': meta
                },
                parent_path=Path(partial).parent,
            )

            return spec.to_dag()
        elif isinstance(partial, DAG):
            return partial
        else:
            raise TypeError(f'Expected {cls.__name__}.get_partial() to '
                            'return a str, pathlib.Path or ploomber.DAG, '
                            f'got {type(partial).__name__}')
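For context, the classmethod above accepts either a path to a partial spec or an already-built DAG. A sketch of both call styles (MyPipeline is a placeholder for whichever class defines get_partial(); existing_dag is an assumed ploomber.DAG instance):

# from a partial spec on disk (the same kind of file used in Example #2)
dag = MyPipeline.init_dag_from_partial('pipeline-features.yaml')

# or pass an existing DAG straight through
dag = MyPipeline.init_dag_from_partial(existing_dag)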
Example #18
def test_dag_manager_flat_structure(backup_spec_with_functions_flat):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    m = JupyterDAGManager(dag)

    assert set(m) == {
        'my_tasks_flat/raw.py (functions)',
        'my_tasks_flat/raw.py (functions)/raw',
        'my_tasks_flat/raw.py (functions)/raw2',
        'my_tasks_flat/clean.py (functions)',
        'my_tasks_flat/clean.py (functions)/clean',
    }

    assert 'my_tasks_flat/raw.py (functions)/' in m
    assert '/my_tasks_flat/raw.py (functions)/' in m
Example #19
def load_tasks(mode='incremental'):
    """Load tasks names and their upstream dependencies

    Parameters
    ----------
    mode : str, default='incremental'
        One of 'incremental' (only include outdated tasks with respect to
        the remote metadata), 'regular' (ignore status, submit all tasks and
        determine status at runtime) or 'force' (ignore status, submit all
        tasks and force execution regardless of status)

    Returns
    -------
    tasks : dict
        A dictionary with tasks (keys) and upstream dependencies (values)
        to submit

    args : list
        A list of arguments to pass to "ploomber task {name}"
    """
    valid = Mode.get_values()
    if mode not in valid:
        raise ValueError(f'mode must be one of {valid!r}')

    dag = DAGSpec.find().to_dag()

    if mode == 'incremental':
        dag.render(remote=True)

        tasks = []

        for name, task in dag.items():
            if not mode or task.exec_status != TaskStatus.Skipped:
                tasks.append(name)
    else:
        # force makes rendering faster. we just need this to ensure the
        # pipeline does not have any rendering problems before proceeding
        dag.render(force=True)

        tasks = list(dag.keys())

    out = {}

    for t in tasks:
        out[t] = [name for name in dag[t].upstream.keys() if name in tasks]

    return out, [] if mode != 'force' else ['--force']
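A usage sketch based on the docstring above: the first returned value maps each task name to the upstream dependencies that will also be submitted, and the second holds the extra CLI arguments for "ploomber task {name}" (only populated in 'force' mode):

tasks, args = load_tasks(mode='incremental')

for name, upstream in tasks.items():
    # each task is eventually executed as: ploomber task <name> [args...]
    cmd = ['ploomber', 'task', name, *args]
    print(name, 'depends on', upstream, '->', ' '.join(cmd))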
Example #20
def lazily_load_entry_point(starting_dir=None, reload=False):
    """
    Lazily loads the entry point by recursively looking in the starting_dir
    directory and its parent directories.
    """

    starting_dir = starting_dir or '.'

    entry_point = os.environ.get('ENTRY_POINT')

    type_ = try_to_find_entry_point_type(entry_point)

    if type_ == EntryPoint.Directory:
        spec = DAGSpec.from_directory(entry_point)
        path = Path(entry_point)
    elif type_ == EntryPoint.DottedPath:
        entry = load_callable_dotted_path(str(entry_point), raise_=True)
        dag = entry()
        spec = dict(meta=dict(jupyter_hot_reload=False,
                              jupyter_functions_as_notebooks=False))
        # potential issue: dag defines sources as relative paths
        path = Path().resolve()
        return spec, dag, path
    else:
        spec, path, _ = _default_spec_load(starting_dir=starting_dir,
                                           reload=reload,
                                           lazy_import=True)

    # chain exception to provide more context
    dag = spec.to_dag()

    # we remove the on_render hook because this is a lazy load; if we don't do
    # it, calling the hook will cause an error since the function never loads
    dag.on_render = None

    # same with task-level hooks
    # also disable static_analysis since we don't want to break cell injection
    # because of some issues in the code
    for name in dag._iter():
        task = dag[name]
        task._on_render = None

        if hasattr(task, 'static_analysis'):
            task.static_analysis = False

    return spec, dag, path
Example #21
def test_renders_valid_script(name, extract_product, extract_upstream,
                              tmp_directory):
    loader = scaffold.ScaffoldLoader()
    out = loader.render(name,
                        params=dict(extract_product=extract_product,
                                    extract_upstream=extract_upstream))

    # test it generates a valid pipeline
    if Path(name).suffix != '.sql':

        Path(name).write_text(out)

        Path('pipeline.yaml').write_text(
            Template(template).render(name=name,
                                      extract_product=extract_product,
                                      extract_upstream=extract_upstream))

        DAGSpec('pipeline.yaml').to_dag().build()
Example #22
    def __init__(self, path_to_config, env_name):
        # initialize configuration and a few checks on it
        self._cfg = self.CONFIG_CLASS.from_file_with_root_key(
            path_to_config=path_to_config,
            env_name=env_name,
        )

        self._env_name = env_name

        # initialize dag (needed for validation)
        # TODO: implement logic to load the corresponding env.{target-name}.yaml
        # to simulate what's going to happen
        self._dag = DAGSpec.find(lazy_import=True).to_dag().render(
            force=True, show_progress=False)

        # ensure that the project and the config make sense
        self.validate()

        # validate specific details about the target
        self._validate(self._cfg, self._dag, self._env_name)
Example #23
def test_pipeline():
    """
    This is a smoke test, checking that the pipeline runs (but not the output)

    NOTE: it's common for pipelines to take hours to run. A way to make this
    test feasible is to run it here with a sample of the data and save the
    results in a different folder to prevent overwriting your results.
    """
    # load dag
    dag = DAGSpec.find().to_dag()

    # change executor settings: you can use "pytest --pdb" to start a debugging
    # session if the test fails. Calling dag['task'].debug() is another
    # option
    dag.executor = Serial(build_in_subprocess=False, catch_exceptions=False)

    # a third approach for debugging is to use: import IPython; IPython.embed()
    # to start an interactive session at this point. To do so, you must call
    # "pytest -s"

    dag.build()
Example #24
def test_train():
    """
    This is a smoke test. It only checks that the training pipeline runs
    (doesn't check if the output is correct). It passes a sample of the data
    to make it faster.
    """
    # load dag
    dag = DAGSpec.find(env={
        'products': '{{root}}/testing',
        'sample': True
    }).to_dag()

    # change executor settings: you can use "pytest --pdb" to start a debugging
    # session if the test fails. Calling dag['task'].debug() is another
    # option
    dag.executor = Serial(build_in_subprocess=False, catch_exceptions=False)

    # a third approach for debugging is to use: import IPython; IPython.embed()
    # to start an interactive session at this point. To do so, you must call
    # "pytest -s"

    dag.build()
Example #25
def test_save_injected_cell_ipynb(tmp_nbs):
    # modify the spec so it has one ipynb task
    with open('pipeline.yaml') as f:
        spec = yaml.safe_load(f)

    spec['tasks'][0]['source'] = 'load.ipynb'
    Path('pipeline.yaml').write_text(yaml.dump(spec))

    # generate notebook in ipynb format
    jupytext.write(jupytext.read('load.py'), 'load.ipynb')

    dag = DAGSpec('pipeline.yaml').to_dag().render()
    nb = jupytext.read('load.py')
    expected = '"injected-parameters"'

    assert expected not in Path('load.ipynb').read_text()
    assert nb.metadata.get('ploomber') is None

    dag['load'].source.save_injected_cell()
    nb = jupytext.read('load.ipynb')

    assert expected in Path('load.ipynb').read_text()
    assert nb.metadata.ploomber.injected_manually
Example #26
def dag_build():
    dag = DAGSpec.find().to_dag()
    dag.executor = Serial(build_in_subprocess=False)
    dag.render().build()
Example #27
File: cli.py Project: cxz/ploomber
def scaffold(conda, package, entry_point, empty):
    """Create new projects (if no pipeline.yaml exists) or add missings tasks
    """
    template = '-e/--entry-point is not compatible with the {flag} flag'

    if entry_point and conda:
        err = template.format(flag='--conda')
        telemetry.log_api("scaffold_error",
                          metadata={
                              'type': 'entry_and_conda_flag',
                              'exception': err,
                              'argv': sys.argv
                          })
        raise click.ClickException(err)

    if entry_point and package:
        err = template.format(flag='--package')
        telemetry.log_api("scaffold_error",
                          metadata={
                              'type': 'entry_and_package_flag',
                              'exception': err,
                              'argv': sys.argv
                          })
        raise click.ClickException(err)

    if entry_point and empty:
        err = template.format(flag='--empty')
        telemetry.log_api("scaffold_error",
                          metadata={
                              'type': 'entry_and_empty_flag',
                              'exception': err,
                              'argv': sys.argv
                          })
        raise click.ClickException(err)

    # try to load a dag by looking in default places
    if entry_point is None:
        loaded = _scaffold.load_dag()
    else:
        try:
            loaded = (
                DAGSpec(entry_point, lazy_import='skip'),
                Path(entry_point).parent,
                Path(entry_point),
            )
        except Exception as e:
            telemetry.log_api("scaffold_error",
                              metadata={
                                  'type': 'dag_load_failed',
                                  'exception': e,
                                  'argv': sys.argv
                              })
            raise click.ClickException(e) from e

    if loaded:
        # existing pipeline, add tasks
        spec, _, path_to_spec = loaded
        _scaffold.add(spec, path_to_spec)
        telemetry.log_api("ploomber_scaffold",
                          dag=loaded,
                          metadata={
                              'type': 'add_task',
                              'argv': sys.argv
                          })
    else:
        # no pipeline, create base project
        telemetry.log_api("ploomber_scaffold",
                          metadata={
                              'type': 'base_project',
                              'argv': sys.argv
                          })
        scaffold_project.cli(project_path=None,
                             conda=conda,
                             package=package,
                             empty=empty)
Example #28
def test_format_with_extension_change(tmp_nbs):
    dag = DAGSpec('pipeline.yaml').to_dag().render()
    dag['load'].source.format(fmt='ipynb')

    assert not Path('load.py').exists()
    assert jupytext.read('load.ipynb')