def test_notebook_task_simple():
    """Run the ``nb-spark`` notebook with a Spark task config.

    Checks the declared output names and that the executed / rendered
    notebook paths reported by the task match both the values returned by
    ``execute()`` and the paths derived from the notebook name.
    """
    # NOTE(review): another test visible in this view uses the same function
    # name; if both live in one module the later definition shadows this
    # one and pytest would never collect it -- confirm.
    notebook = "nb-spark"
    task = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(notebook, abs=False),
        outputs=kwtypes(df=FlyteSchema[kwtypes(name=str, age=int)]),
        task_config=Spark(spark_conf={"x": "y"}),
    )

    # The first returned value is the declared ``df`` output; it is not
    # inspected by this test.
    _df, executed_nb, rendered_nb = task.execute()

    declared_outputs = task.python_interface.outputs.keys()
    assert declared_outputs == {"df", "out_nb", "out_rendered_nb"}

    assert task.output_notebook_path == executed_nb
    assert executed_nb == _get_nb_path(notebook, suffix="-out")

    assert task.rendered_output_path == rendered_nb
    assert rendered_nb == _get_nb_path(notebook, suffix="-out", ext=".html")
def test_notebook_task_simple():
    """Run the ``nb-simple`` notebook with ``pi=4`` and verify the result.

    Checks the returned ``square`` value, the declared input/output
    interface, and that the executed / rendered notebook paths reported by
    the task match the values returned by ``execute()``.
    """
    notebook = "nb-simple"
    task = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(notebook, abs=False),
        inputs=kwtypes(pi=float),
        outputs=kwtypes(square=float),
    )

    square, executed_nb, rendered_nb = task.execute(pi=4)
    assert square == 16.0

    interface = task.python_interface
    assert interface.inputs == {"pi": float}
    assert interface.outputs.keys() == {"square", "out_nb", "out_rendered_nb"}

    assert task.output_notebook_path == executed_nb
    assert executed_nb == _get_nb_path(notebook, suffix="-out")

    assert task.rendered_output_path == rendered_nb
    assert rendered_nb == _get_nb_path(notebook, suffix="-out", ext=".html")
def test_notebook_task_multi_values():
    """Run the ``nb-multi`` notebook with several inputs and outputs.

    Executes the task with ``x=10, y=10, h="blah"`` and checks each returned
    value, the declared input/output interface, and that the executed /
    rendered notebook paths reported by the task match the values returned
    by ``execute()``.
    """
    nb_name = "nb-multi"
    nb = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(nb_name, abs=False),
        inputs=kwtypes(x=int, y=int, h=str),
        outputs=kwtypes(z=int, m=int, h=str, n=datetime.datetime),
    )

    z, m, h, n, out, render = nb.execute(x=10, y=10, h="blah")

    assert z == 20
    assert m == 100
    assert h == "blah world!"
    # isinstance() is the idiomatic type check; comparing ``type(n)`` with
    # ``==`` would reject datetime subclasses.
    assert isinstance(n, datetime.datetime)

    assert nb.python_interface.inputs == {"x": int, "y": int, "h": str}
    assert nb.python_interface.outputs.keys() == {
        "z",
        "m",
        "h",
        "n",
        "out_nb",
        "out_rendered_nb",
    }

    assert nb.output_notebook_path == out == _get_nb_path(nb_name, suffix="-out")
    assert (
        nb.rendered_output_path
        == render
        == _get_nb_path(nb_name, suffix="-out", ext=".html")
    )
def test_notebook_task_complex():
    """Run the ``nb-complex`` notebook with notebook- and custom-typed outputs.

    Executes the task with ``h="blah"``, ``n=10`` and the path of the
    ``nb-multi`` notebook as ``w``, then checks the returned values, the
    declared input/output interface, and that the executed / rendered
    notebook paths reported by the task match the values returned by
    ``execute()``.
    """
    notebook = "nb-complex"
    task = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(notebook, abs=False),
        inputs=kwtypes(h=str, n=int, w=str),
        outputs=kwtypes(h=str, w=PythonNotebook, x=X),
    )

    greeting, nb_output, x_output, executed_nb, rendered_nb = task.execute(
        h="blah",
        n=10,
        w=_get_nb_path("nb-multi"),
    )

    assert greeting == "blah world!"
    assert nb_output is not None
    assert x_output.x == 10

    interface = task.python_interface
    assert interface.inputs == {"n": int, "h": str, "w": str}
    assert interface.outputs.keys() == {
        "h",
        "w",
        "x",
        "out_nb",
        "out_rendered_nb",
    }

    assert task.output_notebook_path == executed_nb
    assert executed_nb == _get_nb_path(notebook, suffix="-out")

    assert task.rendered_output_path == rendered_nb
    assert rendered_nb == _get_nb_path(notebook, suffix="-out", ext=".html")
min_samples_split: int = 4 random_state: int = 2 nfolds: int = 10 # %% # We define a ``NotebookTask`` to run the `Jupyter notebook # <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__. # This notebook returns ``dummified_data`` and ``dataset`` as the outputs. # # .. note:: # ``dummified_data`` is used in this example, and ``dataset`` is used in the upcoming example. nb = NotebookTask( name="eda-feature-eng-nb", notebook_path=os.path.join( pathlib.Path(__file__).parent.absolute(), "supermarket_regression_1.ipynb"), outputs=kwtypes(dummified_data=pd.DataFrame, dataset=str), requests=Resources(mem="500Mi"), ) # %% # Next, we define a ``cross_validate`` function and a ``modeling`` task to compute the MAE score of the data against # the Gradient Boosting Regressor. def cross_validate(model, nfolds, feats, targets): score = -1 * (cross_val_score( model, feats, targets, cv=nfolds, scoring="neg_mean_absolute_error")) return np.mean(score) @task
# 1. After you are satisfied with the notebook, ensure that the first cell only has the input variables for the notebook. Now add the tag ``parameters`` for the first cell. # # .. image:: https://raw.githubusercontent.com/flyteorg/flyte/static-resources/img/papermilltasks/parameters.png # :alt: Example of "parameters tag" added to the cell with input variables # # 2. Add the tag ``outputs`` to the intended output cell. This is typically the last cell of the notebook, but it does not need to be. # # .. image:: https://raw.githubusercontent.com/flyteorg/flyte/static-resources/img/papermilltasks/outputs.png # :alt: Example of "outputs tag" added to the cell with output variables # # 3. In a Python file, create a new task at the ``module`` level. # An example task is shown below: nb = NotebookTask( name="simple-nb", notebook_path=os.path.join( pathlib.Path(__file__).parent.absolute(), "nb-simple.ipynb"), inputs=kwtypes(v=float), outputs=kwtypes(square=float), ) #%% # .. note:: # # - Note the notebook_path. This is the absolute path to the actual notebook. # - Note the inputs and outputs. The variable names match the variable names in the Jupyter notebook. # # Other tasks # ^^^^^^^^^^^^^^^ # You can definitely declare other tasks and seamlessly work with notebook tasks. The example below shows how to declare a task that accepts the squared value from the notebook and provides a sqrt: @task
from flytekit import Resources, kwtypes, workflow from flytekitplugins.papermill import NotebookTask # %% # We define a ``NotebookTask`` to run the `Jupyter notebook # <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__ (EDA). # This notebook returns ``dummified_data`` and ``dataset`` as the outputs. # # .. note:: # ``dataset`` is used in this example, and ``dummified_data`` is used in the previous example. # ``dataset`` lets us send the DataFrame as a JSON string to the subsequent notebook because DataFrame input cannot be sent # directly to the notebook as per Papermill. nb_1 = NotebookTask( name="eda-featureeng-nb", notebook_path=os.path.join( pathlib.Path(__file__).parent.absolute(), "supermarket_regression_1.ipynb" ), outputs=kwtypes(dummified_data=pd.DataFrame, dataset=str), requests=Resources(mem="500Mi"), ) # %% # We define a ``NotebookTask`` to run the `Jupyter notebook # <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_2.ipynb>`__ # (Modeling). # # This notebook returns ``mae_score`` as the output. nb_2 = NotebookTask( name="regression-nb", notebook_path=os.path.join( pathlib.Path(__file__).parent.absolute(), "supermarket_regression_2.ipynb",
# * - ``inputs`` # - Inputs to be sent to the notebook # * - ``outputs`` # - Outputs to be returned from the notebook # * - ``requests`` # - Specify compute resource requests for your task. # # This notebook returns ``mae_score`` as the output. nb = NotebookTask( name="pipeline-nb", notebook_path=os.path.join( pathlib.Path(__file__).parent.absolute(), "supermarket_regression.ipynb"), inputs=kwtypes( n_estimators=int, max_depth=int, max_features=str, min_samples_split=int, random_state=int, ), outputs=kwtypes(mae_score=float), requests=Resources(mem="500Mi"), ) # %% # Since a task need not be defined, we create a ``workflow`` and return the MAE score. @workflow def notebook_wf( n_estimators: int = 150,