Ejemplo n.º 1
0
def test_render_error_on_syntax_error(tmp_directory):
    """Rendering a notebook whose source has broken Python must fail."""
    script = Path('sample.py')
    script.write_text("""
# + tags=["parameters"]
if
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    # the message should include the offending line and a caret marker
    assert 'invalid syntax\n\nif\n\n  ^\n' in str(excinfo.value)
Ejemplo n.º 2
0
def test_dag_r(tmp_directory):
    """An R script renders fine even though R parameter extraction is a no-op."""
    script = Path('sample.R')
    script.write_text("""
# + tags=["parameters"]
a <- NULL
b <- 1
c <- c(1, 2, 3)
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag, params={'z': 1})

    # parameter extraction is not implemented for R but must not raise
    dag.render()
Ejemplo n.º 3
0
def test_can_execute_with_parameters(tmp_directory):
    """A literal code string plus params builds end to end."""
    dag = DAG()

    source = """
    1 + 1
    """

    NotebookRunner(
        source,
        product=File(Path(tmp_directory, 'out.ipynb')),
        dag=dag,
        name='nb',
        ext_in='py',
        params={'var': 1},
        kernelspec_name='python3',
    )
    dag.build()
Ejemplo n.º 4
0
def test_warns_if_export_args_but_ipynb_output(tmp_sample_tasks):
    """nbconvert kwargs combined with an .ipynb product should warn."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    NotebookRunner(Path('sample.ipynb'),
                   File('out.ipynb'),
                   dag,
                   nbconvert_export_kwargs=dict(exclude_input=True))

    with pytest.warns(UserWarning) as records:
        dag.build()

    # NOTE: sometimes extra records appear (possibly from another library),
    # so only require that at least one matches
    matching = [
        record for record in records
        if "Output 'out.ipynb' is a notebook file" in record.message.args[0]
    ]
    assert matching
Ejemplo n.º 5
0
def test_skip_kernel_install_check(tmp_directory):
    """With the install check disabled, an unknown kernel must still render."""
    dag = DAG()

    source = """
# + tags=["parameters"]
1 + 1
    """

    NotebookRunner(
        source,
        product=File(Path(tmp_directory, 'out.ipynb')),
        dag=dag,
        name='nb',
        ext_in='py',
        kernelspec_name='unknown_kernel',
        check_if_kernel_installed=False,
    )
    dag.render()
Ejemplo n.º 6
0
def test_render_error_on_undefined_name_error(tmp_directory):
    """Render-time static analysis must flag names that are never defined."""
    script = Path('sample.py')
    script.write_text("""
# + tags=["parameters"]

# +
df.head()
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert "undefined name 'df'" in str(excinfo.value)
Ejemplo n.º 7
0
def test_develop_error_if_r_notebook(tmp_sample_tasks):
    """develop() and debug() are not supported for R notebooks."""
    dag = DAG()
    task = NotebookRunner(Path('sample.R'), product=File('out.ipynb'), dag=dag)

    dag.render()

    # both interactive entry points should refuse to run for R sources
    for method in (task.develop, task.debug):
        with pytest.raises(NotImplementedError):
            method()
Ejemplo n.º 8
0
def _dag_simple(nb_params=True, params=None):
    """Build a one-task DAG from sample.py.

    When nb_params is True the notebook declares parameters; otherwise
    the parameters cell is left empty. `params` is forwarded to the task.
    """
    script = Path('sample.py')

    if nb_params:
        content = """
# + tags=["parameters"]
a = None
b = 1
c = 'hello'
"""
    else:
        content = """
# + tags=["parameters"]
"""
    script.write_text(content)

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag, params=params)
    return dag
Ejemplo n.º 9
0
def test_render_pass_on_missing_product_parameter(tmp_directory):
    """Source that uses `product` without defining it still renders."""
    script = Path('sample.py')
    script.write_text("""
# + tags=["parameters"]

# +
df = None
df.to_csv(product)
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    # rendering injects a cell that defines the product variable, so the
    # raw source not declaring it must not raise
    assert dag.render()
Ejemplo n.º 10
0
def test_creates_parents(tmp_directory):
    """Building a task should create missing parent directories of products."""
    dag = DAG()

    source = """
# + tags=["parameters"]
product = None

# +
from pathlib import Path
Path(product['file']).touch()
    """

    products = {
        'nb': File(Path(tmp_directory, 'another', 'nb', 'out.ipynb')),
        'file': File(Path(tmp_directory, 'another', 'data', 'file.txt')),
    }

    NotebookRunner(source, product=products, dag=dag, ext_in='py', name='nb')
    dag.build()
Ejemplo n.º 11
0
def make_task(dag, rel_path_in, rel_path_out, base_path):
    """Preprocess a notebook and register it as a task on `dag`.

    Adds an empty cell tagged "parameters" (required by papermill) and a
    binder badge at the top, writes the result next to the input as
    *-preprocessed.ipynb, then wraps it in a NotebookRunner.
    """
    src = base_path / rel_path_in
    dst = base_path / rel_path_out

    nb = jupytext.read(src)
    fmt = nbformat.versions[nbformat.current_nbformat]

    # papermill injects parameters into the cell carrying this tag
    nb.cells.append(fmt.new_code_cell(metadata=dict(tags=['parameters'])))
    nb.cells.insert(0, fmt.new_markdown_cell(binder_badge(rel_path_in)))

    stem = Path(src).name.split('.')[0]
    preprocessed = Path(src).parent / (stem + '-preprocessed.ipynb')
    nbformat.write(nb, preprocessed)

    NotebookRunner(Path(preprocessed),
                   File(dst),
                   dag,
                   kernelspec_name='python3',
                   name=stem,
                   local_execution=True)
Ejemplo n.º 12
0
def tmp_dag(tmp_directory):
    """Return a rendered single-task DAG backed by some_notebook.py."""
    dag = DAG()

    source = """
# + tags=["parameters"]
var = None

# +
1 + 1
    """
    script = Path('some_notebook.py')
    script.write_text(source)

    NotebookRunner(
        script,
        product=File(Path(tmp_directory, 'out.ipynb')),
        dag=dag,
        name='nb',
        params={'var': 1},
        kernelspec_name='python3',
    )

    dag.render()
    return dag
Ejemplo n.º 13
0
def test_can_execute_when_product_is_metaproduct(tmp_directory):
    """A dict product builds when nb_product_key names the notebook entry."""
    dag = DAG()

    source = """

from pathlib import Path

Path(product['model']).touch()
    """

    products = {
        'nb': File(Path(tmp_directory, 'out.ipynb')),
        'model': File(Path(tmp_directory, 'model.pkl')),
    }

    NotebookRunner(
        source,
        product=products,
        dag=dag,
        name='nb',
        nb_product_key='nb',
        ext_in='py',
        params={'var': 1},
        kernelspec_name='python3',
    )
    dag.build()
Ejemplo n.º 14
0
def test_hot_reload(tmp_directory):
    """With hot_reload enabled, re-rendering picks up on-disk source edits."""
    configurator = DAGConfigurator()
    configurator.params.hot_reload = True
    dag = configurator.create()

    script = Path('nb.py')
    script.write_text("""
# + tags=["parameters"]
# some code

# +
1 + 1
    """)

    task = NotebookRunner(script,
                          product=File('out.ipynb'),
                          dag=dag,
                          kernelspec_name='python3')
    task.render()

    # overwrite the source on disk, then render again
    script.write_text("""
# + tags=["parameters"]
# some code

# +
2 + 2
    """)
    task.render()

    # the rendered task must reflect the new code...
    assert '2 + 2' in str(task.source)
    assert '2 + 2' in task.source.nb_str_rendered

    # ...and the product is stale because of code, not upstream data
    assert task.product._outdated_code_dependency()
    assert not task.product._outdated_data_dependencies()

    report = dag.build()
    assert report['Ran?'] == [True]
Ejemplo n.º 15
0
def _make(env):
    """Assemble the ml-pipeline DAG.

    Private so tests can inject their own environment via `env`.
    """
    configurator = DAGConfigurator(env.dag_config)
    dag = configurator.create(name='ml-pipeline')

    # run independent tasks in parallel
    dag.executor = Parallel(processes=3)

    loader = SourceLoader(module='ml_advanced.templates')

    get = PythonCallable(tasks.get,
                         File(env.path.data / 'data.parquet'),
                         dag,
                         name='get',
                         params={'sample_frac': env.sample_frac})
    fts = PythonCallable(tasks.features,
                         File(env.path.data / 'features.parquet'),
                         dag,
                         name='features')
    join = PythonCallable(tasks.join,
                          File(env.path.data / 'join.parquet'),
                          dag,
                          name='join')

    get >> fts
    (get + fts) >> join

    # (model class path, parameter grid) per candidate. The non-sklearn
    # entries come from our package and return sklearn Pipeline objects
    candidates = [
        ('sklearn.ensemble.RandomForestClassifier',
         dict(n_estimators=[5, 10, 50, 100], min_samples_leaf=[2, 4, 8])),
        ('ml_advanced.models.logistic_reg',
         dict(clf__penalty=['l1', 'l2'], clf__C=[0.5, 1.0])),
        ('ml_advanced.models.svc',
         dict(clf__kernel=['linear', 'poly', 'rbf'], clf__C=[0.5, 1.0])),
    ]

    for model_class, model_params in candidates:
        # NOTE: Argo does not support "." nor "_" in task names; replacing
        # them is not needed when only running locally
        task_name = 'fit-' + model_class.replace('.', '--').replace('_', '-')

        fit = NotebookRunner(
            loader['fit.py'],
            product={
                'nb': File(env.path.data / f'fit-{model_class}.ipynb'),
                'model': File(env.path.data / f'model-{model_class}.joblib')
            },
            dag=dag,
            params={
                'model_class': model_class,
                'model_params': model_params
            },
            name=task_name)

        join >> fit

    return dag
Ejemplo n.º 16
0
dag = DAG()

# create one notebook task per parameter set. NotebookRunner executes
# ipynb files with papermill under the hood; since our input file has a
# different extension (.py), it is first converted to ipynb via jupytext
for name, params in params_all.items():
    NotebookRunner(
        notebook,
        # saved as artifacts/{name}.html — NotebookRunner produces ipynb
        # by default, but any format the official nbconvert package can
        # export is accepted here
        product=File(out / (name + '.html')),
        dag=dag,
        name=name,
        params=params,
        ext_in='py',
        kernelspec_name='python3')
# -

# Build the DAG:

dag.build()

# That's it. After building the DAG, each model will generate one report, you can see them here: [Ridge](https://ploomber.github.io/posts/model-selection/artifacts/ridge), [Random Forest](https://ploomber.github.io/posts/model-selection/artifacts/rf) and [NuSVR](https://ploomber.github.io/posts/model-selection/artifacts/nusvr).
#
# Splitting logic into separate files improves readability and maintainability, if we want to add another model we only have to add a new dictionary with the parameter grid, if preprocessing is needed, we just add a factory in `pipelines.py`.