def test_render_error_on_syntax_error(tmp_directory):
    """Rendering a notebook task whose source has a syntax error must fail
    with a DAGRenderError that points at the offending code.
    """
    script = Path('sample.py')
    script.write_text("""
# + tags=["parameters"]

if
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    # the error message should include the syntax diagnostic with a caret
    assert 'invalid syntax\n\nif\n\n ^\n' in str(excinfo.value)
def test_dag_r(tmp_directory):
    """An R script with a parameters cell renders without errors even though
    parameter extraction for R is not implemented.
    """
    script = Path('sample.R')
    script.write_text("""
# + tags=["parameters"]
a <- NULL
b <- 1
c <- c(1, 2, 3)
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag, params=dict(z=1))

    # parameter extraction is not implemented but should not raise an error
    dag.render()
def test_can_execute_with_parameters(tmp_directory):
    """A string-sourced notebook task builds successfully when params are
    passed explicitly.
    """
    dag = DAG()

    source = """
1 + 1
"""

    NotebookRunner(source,
                   product=File(Path(tmp_directory, 'out.ipynb')),
                   dag=dag,
                   kernelspec_name='python3',
                   params={'var': 1},
                   ext_in='py',
                   name='nb')

    dag.build()
def test_warns_if_export_args_but_ipynb_output(tmp_sample_tasks):
    """Passing nbconvert export kwargs while the product is a plain .ipynb
    (no conversion happens) should emit a UserWarning.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))

    NotebookRunner(Path('sample.ipynb'),
                   File('out.ipynb'),
                   dag,
                   nbconvert_export_kwargs=dict(exclude_input=True))

    with pytest.warns(UserWarning) as records:
        dag.build()

    # NOTE: not sure why sometimes two records are displayed, maybe another
    # library is throwing the warning
    matching = [
        record for record in records
        if "Output 'out.ipynb' is a notebook file" in record.message.args[0]
    ]
    assert matching
def test_skip_kernel_install_check(tmp_directory):
    """With check_if_kernel_installed=False, rendering succeeds even when the
    named kernelspec does not exist.
    """
    dag = DAG()

    source = """
# + tags=["parameters"]
1 + 1
"""

    NotebookRunner(source,
                   product=File(Path(tmp_directory, 'out.ipynb')),
                   dag=dag,
                   kernelspec_name='unknown_kernel',
                   ext_in='py',
                   name='nb',
                   check_if_kernel_installed=False)

    dag.render()
def test_render_error_on_undefined_name_error(tmp_directory):
    """Rendering flags references to names that are never defined in the
    notebook source (static analysis), raising DAGRenderError.
    """
    script = Path('sample.py')
    script.write_text("""
# + tags=["parameters"]

# +
df.head()
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    with pytest.raises(DAGRenderError) as excinfo:
        dag.render()

    assert "undefined name 'df'" in str(excinfo.value)
def test_develop_error_if_r_notebook(tmp_sample_tasks):
    """develop() and debug() are only supported for Python notebooks; both
    must raise NotImplementedError for an R source.
    """
    dag = DAG()
    task = NotebookRunner(Path('sample.R'), product=File('out.ipynb'), dag=dag)
    dag.render()

    with pytest.raises(NotImplementedError):
        task.develop()

    with pytest.raises(NotImplementedError):
        task.debug()
def _dag_simple(nb_params=True, params=None):
    """Build a one-task DAG from a sample script.

    nb_params controls whether the parameters cell declares default values;
    params is forwarded to NotebookRunner as task-level parameters.
    """
    script = Path('sample.py')

    if nb_params:
        content = """
# + tags=["parameters"]
a = None
b = 1
c = 'hello'
"""
    else:
        content = """
# + tags=["parameters"]
"""

    script.write_text(content)

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag, params=params)
    return dag
def test_render_pass_on_missing_product_parameter(tmp_directory):
    """Static analysis must not complain about the `product` variable: it is
    injected as a cell at render time even if absent from the raw source.
    """
    script = Path('sample.py')
    script.write_text("""
# + tags=["parameters"]

# +
df = None
df.to_csv(product)
""")

    dag = DAG()
    NotebookRunner(script, product=File('out.ipynb'), dag=dag)

    # the render process injects the cell with the product variable so this
    # should not raise any errors, even if the raw source code does not contain
    # the product variable
    assert dag.render()
def test_creates_parents(tmp_directory):
    """Parent directories for every product are created automatically before
    the notebook executes.
    """
    dag = DAG()

    source = """
# + tags=["parameters"]
product = None

# +
from pathlib import Path
Path(product['file']).touch()
"""

    products = {
        'nb': File(Path(tmp_directory, 'another', 'nb', 'out.ipynb')),
        'file': File(Path(tmp_directory, 'another', 'data', 'file.txt')),
    }

    NotebookRunner(source, product=products, dag=dag, ext_in='py', name='nb')
    dag.build()
def make_task(dag, rel_path_in, rel_path_out, base_path):
    """Preprocess a notebook (add a parameters cell and a binder badge) and
    register it as a NotebookRunner task in the given DAG.
    """
    path_in = base_path / rel_path_in
    path_out = base_path / rel_path_out

    nb = jupytext.read(path_in)

    # append an empty parameters cell so papermill can inject parameters,
    # and prepend a markdown cell with the binder badge
    fmt = nbformat.versions[nbformat.current_nbformat]
    nb.cells.append(fmt.new_code_cell(metadata=dict(tags=['parameters'])))
    nb.cells.insert(0, fmt.new_markdown_cell(binder_badge(rel_path_in)))

    # save the preprocessed copy next to the input file
    name = Path(path_in).name.split('.')[0]
    path_preprocessed = Path(path_in).parent / (name + '-preprocessed.ipynb')
    nbformat.write(nb, path_preprocessed)

    NotebookRunner(Path(path_preprocessed),
                   File(path_out),
                   dag,
                   kernelspec_name='python3',
                   name=name,
                   local_execution=True)
def tmp_dag(tmp_directory):
    """Return a rendered one-task DAG backed by a parameterized script on
    disk (used as a shared fixture).
    """
    code = """
# + tags=["parameters"]
var = None

# +
1 + 1
"""

    script = Path('some_notebook.py')
    script.write_text(code)

    dag = DAG()
    NotebookRunner(script,
                   product=File(Path(tmp_directory, 'out.ipynb')),
                   dag=dag,
                   kernelspec_name='python3',
                   params={'var': 1},
                   name='nb')
    dag.render()

    return dag
def test_can_execute_when_product_is_metaproduct(tmp_directory):
    """A task whose product is a dict (metaproduct) builds correctly when
    nb_product_key points at the notebook entry.
    """
    dag = DAG()

    source = """
from pathlib import Path

Path(product['model']).touch()
"""

    products = {
        'nb': File(Path(tmp_directory, 'out.ipynb')),
        'model': File(Path(tmp_directory, 'model.pkl')),
    }

    NotebookRunner(source,
                   product=products,
                   dag=dag,
                   kernelspec_name='python3',
                   params={'var': 1},
                   ext_in='py',
                   nb_product_key='nb',
                   name='nb')

    dag.build()
def test_hot_reload(tmp_directory):
    """With hot_reload enabled, editing the source on disk is picked up on
    the next render and marks the task outdated.
    """
    cfg = DAGConfigurator()
    cfg.params.hot_reload = True
    dag = cfg.create()

    script = Path('nb.py')
    script.write_text("""
# + tags=["parameters"]
# some code

# +
1 + 1
""")

    task = NotebookRunner(script,
                          product=File('out.ipynb'),
                          dag=dag,
                          kernelspec_name='python3')
    task.render()

    # edit the file on disk, then render again: the new code must be loaded
    script.write_text("""
# + tags=["parameters"]
# some code

# +
2 + 2
""")
    task.render()

    assert '2 + 2' in str(task.source)
    assert task.product._outdated_code_dependency()
    assert not task.product._outdated_data_dependencies()
    assert '2 + 2' in task.source.nb_str_rendered

    report = dag.build()
    assert report['Ran?'] == [True]
def _make(env):
    """Assemble the ML pipeline DAG for the given testing environment.

    Builds get -> features -> join, then fans out one fit task per model
    class; runs tasks in parallel.
    """
    # this is the private function we use to pass the testing environment
    cfg = DAGConfigurator(env.dag_config)
    dag = cfg.create(name='ml-pipeline')

    # run this in parallel
    dag.executor = Parallel(processes=3)

    loader = SourceLoader(module='ml_advanced.templates')

    get = PythonCallable(tasks.get,
                         File(env.path.data / 'data.parquet'),
                         dag,
                         name='get',
                         params={'sample_frac': env.sample_frac})

    fts = PythonCallable(tasks.features,
                         File(env.path.data / 'features.parquet'),
                         dag,
                         name='features')

    join = PythonCallable(tasks.join,
                          File(env.path.data / 'join.parquet'),
                          dag,
                          name='join')

    get >> fts
    (get + fts) >> join

    model_classes = [
        'sklearn.ensemble.RandomForestClassifier',
        # these come from our package, they return a sklearn Pipeline object
        'ml_advanced.models.logistic_reg',
        'ml_advanced.models.svc',
    ]

    model_param_grids = [
        dict(n_estimators=[5, 10, 50, 100], min_samples_leaf=[2, 4, 8]),
        dict(clf__penalty=['l1', 'l2'], clf__C=[0.5, 1.0]),
        dict(clf__kernel=['linear', 'poly', 'rbf'], clf__C=[0.5, 1.0]),
    ]

    for model_class, model_params in zip(model_classes, model_param_grids):
        fit = NotebookRunner(
            loader['fit.py'],
            product={
                'nb': File(env.path.data / f'fit-{model_class}.ipynb'),
                'model': File(env.path.data / f'model-{model_class}.joblib')
            },
            dag=dag,
            params={
                'model_class': model_class,
                'model_params': model_params
            },
            # NOTE: Argo does not support "." nor "_" in task names. Not
            # needed if only running locally
            name='fit-' + model_class.replace('.', '--').replace('_', '-'))

        join >> fit

    return dag
dag = DAG()

# loop over params and create one notebook task for each...
for name, params in params_all.items():
    # NotebookRunner is able to execute ipynb files using
    # papermill under the hood, if the input file has a
    # different extension (like in our case), it will first
    # convert it to an ipynb file using jupytext
    NotebookRunner(
        notebook,
        # save it in artifacts/{name}.html
        # NotebookRunner will generate ipynb files by
        # default, but you can choose other formats,
        # any format supported by the official nbconvert
        # package is supported here
        product=File(out / (name + '.html')),
        dag=dag,
        name=name,
        # pass the parameters
        params=params,
        ext_in='py',
        kernelspec_name='python3')
# -

# Build the DAG:

dag.build()

# That's it. After building the DAG, each model will generate one report, you can see them here: [Ridge](https://ploomber.github.io/posts/model-selection/artifacts/ridge), [Random Forest](https://ploomber.github.io/posts/model-selection/artifacts/rf) and [NuSVR](https://ploomber.github.io/posts/model-selection/artifacts/nusvr).
#
# Splitting logic into separate files improves readability and maintainability, if we want to add another model we only have to add a new dictionary with the parameter grid, if preprocessing is needed, we just add a factory in `pipelines.py`.