def subdag_factory(parent_dag_id, child_dag_id, default_args):
    """Build the sub-DAG that trains one model per hyper-parameter pair.

    One ``PapermillOperator`` is created for every combination of
    ``max_features`` and ``n_estimators``; each runs the avocado prediction
    notebook with that combination as notebook parameters.

    Args:
        parent_dag_id: ``dag_id`` of the DAG embedding this sub-DAG.
        child_dag_id: Identifier of the sub-DAG; it is joined to
            ``parent_dag_id`` with a dot to form the sub-DAG's ``dag_id``.
        default_args: Default arguments forwarded to the ``DAG`` constructor.

    Returns:
        The ``DAG`` object holding the training tasks.
    """
    n_estimators = [100, 150]
    max_features = ['auto', 'sqrt']

    with DAG(dag_id=f"{parent_dag_id}.{child_dag_id}",
             default_args=default_args) as dag:
        for feature in max_features:
            for estimator in n_estimators:
                ml_id = f"{feature}_{estimator}"
                # Operators created inside the ``with DAG`` block are attached
                # to ``dag`` on construction, so they do not need to be
                # collected in a local list that nobody reads.
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb=
                    '/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=
                    f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    pool='training_pool',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    })
    return dag
Beispiel #2
0
def training_group():
    """Create the "trainings" task group with one notebook run per setting.

    Every ``max_features`` value is paired with every ``n_estimators`` value
    (features vary slowest), and a ``PapermillOperator`` is instantiated for
    each pair inside the group.

    Returns:
        The populated ``TaskGroup``.
    """
    notebook_path = '/usr/local/airflow/include/notebooks/avocado_prediction.ipynb'
    # Same ordering as two nested loops: feature outer, estimator inner.
    grid = [(feat, trees)
            for feat in ['auto', 'sqrt']
            for trees in [100, 150]]

    with TaskGroup("trainings", tooltip="Training tasks") as group:
        for feat, trees in grid:
            run_id = f"{feat}_{trees}"
            notebook_params = {
                'filepath': '/tmp/avocado.csv',
                'n_estimators': trees,
                'max_features': feat,
                'ml_id': run_id,
            }
            PapermillOperator(
                task_id=f'training_model_{run_id}',
                input_nb=notebook_path,
                output_nb=f'/tmp/out-model-avocado-prediction-{run_id}.ipynb',
                pool='training_pool',
                parameters=notebook_params,
            )
    return group
def training_groups():
    """Create the "trainings" task group from settings stored in a Variable.

    The JSON Variable ``avocado_dag_model_settings`` is read and its
    ``max_features`` and ``n_estimators`` entries are combined; one
    ``PapermillOperator`` is instantiated per combination inside the group.

    Returns:
        The populated ``TaskGroup``.
    """
    with TaskGroup("trainings") as group:
        settings = Variable.get('avocado_dag_model_settings', deserialize_json=True)

        # Lazy pairing: 'n_estimators' is looked up once per feature, exactly
        # as an inner ``for`` loop would do.
        combos = (
            (feat, trees)
            for feat in settings['max_features']
            for trees in settings['n_estimators']
        )

        for feat, trees in combos:
            run_id = f"{feat}_{trees}"
            notebook_params = {
                'filepath': '/tmp/avocado.csv',
                'n_estimators': trees,
                'max_features': feat,
                'ml_id': run_id,
            }
            PapermillOperator(
                task_id=f'training_model_{run_id}',
                input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                output_nb=f'/tmp/out-model-avocado-prediction-{run_id}.ipynb',
                pool='training_pool',
                parameters=notebook_params,
            )
        return group
Beispiel #4
0
    def test_execute(self, mock_papermill):
        """Executing the operator must call papermill exactly once.

        The operator is built with an input path, an output path and a
        parameter dict; after ``pre_execute`` and ``execute`` the mocked
        ``execute_notebook`` is expected to have received those same values
        together with ``progress_bar=False`` and ``report_mode=True``.
        """
        source_nb, target_nb = "/tmp/does_not_exist", "/tmp/will_not_exist"
        nb_params = {"msg": "hello_world", "train": 1}

        operator = PapermillOperator(
            task_id="papermill_operator_test",
            input_nb=source_nb,
            output_nb=target_nb,
            parameters=nb_params,
            dag=None,
        )

        # pre_execute comes first so that the inlets are available to execute
        operator.pre_execute(context={})
        operator.execute(context={})

        expected_kwargs = {
            "parameters": nb_params,
            "progress_bar": False,
            "report_mode": True,
        }
        mock_papermill.execute_notebook.assert_called_once_with(
            source_nb, target_nb, **expected_kwargs)
# Scheduling settings for the example DAG defined below.
START_DATE = datetime(year=2021, month=1, day=1)
SCHEDULE_INTERVAL = '0 0 * * *'  # cron expression
DAGRUN_TIMEOUT = timedelta(hours=1)

# Example DAG with a single task that runs a notebook through Papermill.
with DAG(
    dag_id='example_papermill_operator',
    start_date=START_DATE,
    schedule_interval=SCHEDULE_INTERVAL,
    catchup=False,
    dagrun_timeout=DAGRUN_TIMEOUT,
    tags=['example'],
) as dag_1:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
        input_nb="/tmp/hello_world.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
    )
    # [END howto_operator_papermill]


@task
def check_notebook(inlets, execution_date):
    """
    Verify the message in the notebook
    """
    notebook = sb.read_notebook(inlets[0].url)
    message = notebook.scraps['message']
    print(f"Message in notebook {message} for {execution_date}")

    if message.data != f"Ran from Airflow at {execution_date}!":
Beispiel #6
0
# specific language governing permissions and limitations
# under the License.
"""
This DAG will use Papermill to run the notebook "hello_world", based on the execution date
it will create an output notebook "out-<date>". All fields, including the keys in the parameters, are
templated.
"""

from datetime import timedelta

from airflow.models import DAG
from airflow.providers.papermill.operators.papermill import PapermillOperator
from airflow.utils.dates import days_ago

# Defaults handed to the DAG; the start date is computed by days_ago(2)
# when this module is loaded rather than being a fixed date.
default_args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

# Example DAG with a single task that runs a notebook through Papermill.
with DAG(
    dag_id='example_papermill_operator',
    schedule_interval='0 0 * * *',
    default_args=default_args,
    tags=['example'],
    dagrun_timeout=timedelta(minutes=60),
) as dag:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
        input_nb="/tmp/hello_world.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
    )
    # [END howto_operator_papermill]
                                  poke_interval=15)

    # Hyper-parameter values to combine; every (max_features, n_estimators)
    # pair below gets its own training task.
    n_estimators = [100, 150]
    max_features = ['auto', 'sqrt']

    # Collects the created operators.
    # NOTE(review): the list is not read within the visible lines; presumably
    # it is used further down to wire dependencies - confirm.
    training_model_tasks = []
    for feature in max_features:
        for estimator in n_estimators:
            # Identifier used in the task id, the output notebook name and
            # the notebook parameters.
            ml_id = f"{feature}_{estimator}"
            training_model_tasks.append(
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb=
                    '/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=
                    f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    }))

    # Branching task: uses sql/FETCH_MIN_RMSE.sql on the 'postgres' connection
    # and is configured to follow 'accurate' when true, 'inaccurate' otherwise.
    evaluating_rmse = BranchSQLOperator(task_id='evaluating_rmse',
                                        sql='sql/FETCH_MIN_RMSE.sql',
                                        conn_id='postgres',
                                        follow_task_ids_if_true='accurate',
                                        follow_task_ids_if_false='inaccurate')

    # Placeholder tasks whose ids match the two branch targets above.
    accurate = DummyOperator(task_id='accurate')

    inaccurate = DummyOperator(task_id='inaccurate')