Esempio n. 1
0
def test_notebook_task_simple():
    """Execute the ``nb-spark`` notebook with a Spark task config.

    Checks the output names exposed on the task interface and the paths of
    the executed and rendered notebooks.
    """
    notebook = "nb-spark"
    spark_task = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(notebook, abs=False),
        outputs=kwtypes(df=FlyteSchema[kwtypes(name=str, age=int)]),
        task_config=Spark(spark_conf={"x": "y"}),
    )

    # The first returned value is not inspected by this test; only the two
    # notebook paths that follow it are.
    _first, executed_nb, rendered_nb = spark_task.execute()

    # The interface lists the declared output plus the two notebook outputs.
    expected_outputs = {"df", "out_nb", "out_rendered_nb"}
    assert spark_task.python_interface.outputs.keys() == expected_outputs

    assert spark_task.output_notebook_path == executed_nb == _get_nb_path(
        notebook, suffix="-out"
    )
    assert spark_task.rendered_output_path == rendered_nb == _get_nb_path(
        notebook, suffix="-out", ext=".html"
    )
Esempio n. 2
0
def test_notebook_task_simple():
    """Execute the ``nb-simple`` notebook with one float input and one float output.

    Checks the returned square, the task interface, and the paths of the
    executed and rendered notebooks.
    """
    notebook = "nb-simple"
    simple_task = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(notebook, abs=False),
        inputs=kwtypes(pi=float),
        outputs=kwtypes(square=float),
    )

    square, executed_nb, rendered_nb = simple_task.execute(pi=4)
    assert square == 16.0

    interface = simple_task.python_interface
    assert interface.inputs == {"pi": float}
    # The interface lists the declared output plus the two notebook outputs.
    expected_outputs = {"square", "out_nb", "out_rendered_nb"}
    assert interface.outputs.keys() == expected_outputs

    assert simple_task.output_notebook_path == executed_nb == _get_nb_path(
        notebook, suffix="-out"
    )
    assert simple_task.rendered_output_path == rendered_nb == _get_nb_path(
        notebook, suffix="-out", ext=".html"
    )
Esempio n. 3
0
def test_notebook_task_multi_values():
    """Execute the ``nb-multi`` notebook with several inputs and outputs.

    Checks each returned value, the task interface, and the paths of the
    executed and rendered notebooks.
    """
    nb_name = "nb-multi"
    nb = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(nb_name, abs=False),
        inputs=kwtypes(x=int, y=int, h=str),
        outputs=kwtypes(z=int, m=int, h=str, n=datetime.datetime),
    )
    z, m, h, n, out, render = nb.execute(x=10, y=10, h="blah")
    assert z == 20
    assert m == 100
    assert h == "blah world!"
    # Exact-type check on purpose: ``is`` keeps the original strictness
    # (subclasses are rejected) without comparing types with ``==``.
    assert type(n) is datetime.datetime
    assert nb.python_interface.inputs == {"x": int, "y": int, "h": str}
    # The interface lists the declared outputs plus the two notebook outputs.
    assert nb.python_interface.outputs.keys() == {
        "z", "m", "h", "n", "out_nb", "out_rendered_nb"
    }
    assert nb.output_notebook_path == out == _get_nb_path(nb_name,
                                                          suffix="-out")
    assert nb.rendered_output_path == render == _get_nb_path(
        nb_name, suffix="-out", ext=".html")
Esempio n. 4
0
def test_notebook_task_complex():
    """Execute the ``nb-complex`` notebook with notebook-typed and custom-typed outputs.

    Checks the returned values, the task interface, and the paths of the
    executed and rendered notebooks.
    """
    notebook = "nb-complex"
    complex_task = NotebookTask(
        name="test",
        notebook_path=_get_nb_path(notebook, abs=False),
        inputs=kwtypes(h=str, n=int, w=str),
        outputs=kwtypes(h=str, w=PythonNotebook, x=X),
    )

    # ``w`` is given the path of another notebook (``nb-multi``) as input.
    h, w, x, executed_nb, rendered_nb = complex_task.execute(
        h="blah", n=10, w=_get_nb_path("nb-multi")
    )
    assert h == "blah world!"
    assert w is not None
    assert x.x == 10

    interface = complex_task.python_interface
    assert interface.inputs == {"n": int, "h": str, "w": str}
    # The interface lists the declared outputs plus the two notebook outputs.
    expected_outputs = {"h", "w", "x", "out_nb", "out_rendered_nb"}
    assert interface.outputs.keys() == expected_outputs

    assert complex_task.output_notebook_path == executed_nb == _get_nb_path(
        notebook, suffix="-out"
    )
    assert complex_task.rendered_output_path == rendered_nb == _get_nb_path(
        notebook, suffix="-out", ext=".html"
    )
    min_samples_split: int = 4
    random_state: int = 2
    nfolds: int = 10


# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__.
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
# .. note::
#   ``dummified_data`` is used in this example, and ``dataset`` is used in the upcoming example.
nb = NotebookTask(
    name="eda-feature-eng-nb",
    # The notebook is looked up in the directory that contains this file.
    notebook_path=str(
        pathlib.Path(__file__).parent.absolute() / "supermarket_regression_1.ipynb"
    ),
    outputs=kwtypes(dummified_data=pd.DataFrame, dataset=str),
    requests=Resources(mem="500Mi"),
)


# %%
# Next, we define a ``cross_validate`` function and a ``modeling`` task to compute the MAE score of the data against
# the Gradient Boosting Regressor.
def cross_validate(model, nfolds, feats, targets):
    """Return the mean absolute error of ``model`` averaged over the CV folds.

    ``model``, ``feats`` and ``targets`` are passed straight to
    ``cross_val_score``; ``nfolds`` is passed as its ``cv`` argument.
    """
    neg_mae_per_fold = cross_val_score(
        model,
        feats,
        targets,
        cv=nfolds,
        scoring="neg_mean_absolute_error",
    )
    # The scorer yields the negated MAE, so flip the sign before averaging.
    mae_per_fold = -1 * neg_mae_per_fold
    return np.mean(mae_per_fold)


@task
Esempio n. 6
0
# 1. After you are satisfied with the notebook, ensure that the first cell only has the input variables for the notebook. Now add the tag ``parameters`` for the first cell.
#
# .. image:: https://raw.githubusercontent.com/flyteorg/flyte/static-resources/img/papermilltasks/parameters.png
#     :alt: Example of "parameters tag" added to the cell with input variables
#
# 2. Add the tag ``outputs`` to the cell that holds the output variables. This is typically the last cell of the notebook, but it does not have to be.
#
# .. image:: https://raw.githubusercontent.com/flyteorg/flyte/static-resources/img/papermilltasks/outputs.png
#     :alt: Example of "outputs tag" added to the cell with output variables
#
# 3. In a python file, create a new task at the ``module`` level.
#    An example task is shown below:
nb = NotebookTask(
    name="simple-nb",
    # The notebook is looked up in the directory that contains this file.
    notebook_path=str(pathlib.Path(__file__).parent.absolute() / "nb-simple.ipynb"),
    inputs=kwtypes(v=float),
    outputs=kwtypes(square=float),
)


#%%
# .. note::
#
#  - Note the notebook_path. This is the absolute path to the actual notebook.
#  - Note the inputs and outputs. The variable names match the variable names in the jupyter notebook.
#
# Other tasks
# ^^^^^^^^^^^^^^^
# You can definitely declare other tasks and seamlessly work with notebook tasks. The example below shows how to declare a task that accepts the squared value from the notebook and provides a sqrt:
@task
Esempio n. 7
0
from flytekit import Resources, kwtypes, workflow
from flytekitplugins.papermill import NotebookTask

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__ (EDA).
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
# .. note::
#   ``dataset`` is used in this example, and ``dummified_data`` is used in the previous example.
#   ``dataset`` lets us send the DataFrame as a JSON string to the subsequent notebook because DataFrame input cannot be sent
#   directly to the notebook as per Papermill.
nb_1 = NotebookTask(
    name="eda-featureeng-nb",
    # The notebook is looked up in the directory that contains this file.
    notebook_path=str(
        pathlib.Path(__file__).parent.absolute() / "supermarket_regression_1.ipynb"
    ),
    outputs=kwtypes(dummified_data=pd.DataFrame, dataset=str),
    requests=Resources(mem="500Mi"),
)

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_2.ipynb>`__
# (Modeling).
#
# This notebook returns ``mae_score`` as the output.
nb_2 = NotebookTask(
    name="regression-nb",
    notebook_path=os.path.join(
        pathlib.Path(__file__).parent.absolute(),
        "supermarket_regression_2.ipynb",
Esempio n. 8
0
#    * - ``inputs``
#      - Inputs to be sent to the notebook
#    * - ``outputs``
#      - Outputs to be returned from the notebook
#    * - ``requests``
#      - Specify compute resource requests for your task.
#
# This notebook returns ``mae_score`` as the output.
nb = NotebookTask(
    name="pipeline-nb",
    # The notebook is looked up in the directory that contains this file.
    notebook_path=str(
        pathlib.Path(__file__).parent.absolute() / "supermarket_regression.ipynb"
    ),
    # Values sent to the notebook as inputs.
    inputs=kwtypes(
        n_estimators=int,
        max_depth=int,
        max_features=str,
        min_samples_split=int,
        random_state=int,
    ),
    outputs=kwtypes(mae_score=float),
    requests=Resources(mem="500Mi"),
)

# %%
# Since a task need not be defined, we create a ``workflow`` and return the MAE score.


@workflow
def notebook_wf(
    n_estimators: int = 150,