from airflow.models import DAG, Variable
from airflow.operators.papermill_operator import PapermillOperator


def subdag_factory(parent_dag_name, child_dag_name, default_args):
    with DAG(dag_id=f'{parent_dag_name}.{child_dag_name}',
             default_args=default_args) as dag:

        # The hyperparameter grid is stored as a JSON Airflow Variable.
        model_settings = Variable.get('avocado_dag_model_settings',
                                      deserialize_json=True)

        for feature in model_settings['max_features']:
            for estimator in model_settings['n_estimators']:
                ml_id = f'{feature}_{estimator}'
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    },
                    pool='training_pool')  # throttle concurrent training runs

        return dag
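
A factory like this is normally attached to a parent DAG with `SubDagOperator`, whose `task_id` must match the child name used in the subdag's dag_id. A minimal sketch, assuming a parent DAG object `parent_dag` with dag_id 'avocado_dag' (all names are illustrative):

from airflow.operators.subdag_operator import SubDagOperator

training_models = SubDagOperator(
    task_id='training_models',  # must match child_dag_name
    subdag=subdag_factory('avocado_dag', 'training_models', default_args),
    dag=parent_dag,
)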
Example #2
from airflow.models import DAG, Variable
from airflow.operators.papermill_operator import PapermillOperator


def subdag_factory(parent_dag_name, child_dag_name, default_args):
    with DAG(dag_id='{0}.{1}'.format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:

        model_settings = Variable.get('avocado_dag_model_settings',
                                      deserialize_json=True)
        # Keep a handle on every generated task so dependencies can be set later.
        training_model_tasks = []

        for feature in model_settings['max_features']:
            for estimator in model_settings['n_estimators']:
                ml_id = f'{feature}_{estimator}'
                training_model_tasks.append(
                    PapermillOperator(
                        task_id='training_model_{0}'.format(ml_id),
                        input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                        output_nb='/usr/local/airflow/include/tmp/out-model-avocado-prediction-{0}.ipynb'.format(ml_id),
                        parameters={
                            'filepath': '/usr/local/airflow/include/tmp/avocado.csv',
                            'n_estimators': estimator,
                            'max_features': feature,
                            'ml_id': ml_id
                        },
                        pool='training_pool'))

        return dag
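
Because every generated operator is collected in `training_model_tasks`, the whole grid can be fanned into a single downstream task inside the factory; a short sketch (the 'training_done' task is illustrative):

from airflow.operators.dummy_operator import DummyOperator

done = DummyOperator(task_id='training_done', dag=dag)
training_model_tasks >> done  # a Python list of tasks can feed one downstream task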
Example #3
    def test_execute(self, mock_papermill):
        in_nb = "/tmp/does_not_exist"
        out_nb = "/tmp/will_not_exist"
        parameters = {"msg": "hello_world", "train": 1}

        po = PapermillOperator(input_nb=in_nb,
                               output_nb=out_nb,
                               parameters=parameters,
                               task_id="papermill_operator_test",
                               dag=None)

        po.pre_execute(context={})  # make sure to have the inlets
        po.execute(context={})

        mock_papermill.execute_notebook.assert_called_once_with(
            in_nb,
            out_nb,
            parameters=parameters,
            progress_bar=False,
            report_mode=True)
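
The `mock_papermill` argument implies this test method is wrapped with `unittest.mock.patch`; a plausible decorator for this Airflow 1.10-era test, where the operator module imports papermill as `pm` (the exact patch target is an assumption):

from unittest import mock

@mock.patch('airflow.operators.papermill_operator.pm')  # assumed patch target
def test_execute(self, mock_papermill):
    ...

Example #4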
"""
This DAG uses Papermill to run the notebook "hello_world". Based on the execution
date, it creates an output notebook "out-<date>". All fields, including the keys in
the parameters, are templated.
"""

from datetime import timedelta

from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'airflow', 'start_date': days_ago(2)}

with DAG(
        dag_id='example_papermill_operator',
        default_args=default_args,
        schedule_interval='0 0 * * *',
        dagrun_timeout=timedelta(minutes=60),
        tags=['example'],
) as dag:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="/tmp/hello_world.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"})
    # [END howto_operator_papermill]
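
Because `output_nb` is templated with `{{ execution_date }}`, each scheduled run writes its own output notebook instead of overwriting the previous one. A single task can be smoke-tested from the CLI with `airflow test example_papermill_operator run_example_notebook 2020-01-01` (Airflow 1.x; `airflow tasks test ...` on 2.x).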
Example #5
         catchup=False) as dag:

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    training_model = PapermillOperator(
        task_id='training_model',
        input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
        output_nb='/tmp/out-model-avocado-prediction.ipynb',
        parameters={
            'filepath': '/tmp/avocado.csv',
            'n_estimators': 100,
            'max_features': 'auto',
            'output_rmse': '/tmp/out-model-avocado-prediction-rmse.txt'
        })

    evaluating_rmse = BranchPythonOperator(task_id="evaluating_rmse",
                                           python_callable=read_rmse)

    accurate = DummyOperator(task_id='accurate')

    inaccurate = DummyOperator(task_id='inaccurate')

    downloading_data >> waiting_for_data >> training_model >> evaluating_rmse
    evaluating_rmse >> [accurate, inaccurate]
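
`read_rmse` is not shown in this excerpt; a `BranchPythonOperator` callable must return the task_id(s) to follow. A minimal sketch, assuming the notebook wrote its score to the `output_rmse` file above and using an illustrative 0.15 threshold:

def read_rmse():
    # Hypothetical implementation: read the RMSE written by the notebook
    # and choose which branch task to follow.
    with open('/tmp/out-model-avocado-prediction-rmse.txt') as f:
        rmse = float(f.read().strip())
    return 'accurate' if rmse < 0.15 else 'inaccurate'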
Example #6
    'owner': 'airflow',
}

with DAG(
        dag_id='example_papermill_operator',
        default_args=default_args,
        schedule_interval='0 0 * * *',
        start_date=days_ago(2),
        dagrun_timeout=timedelta(minutes=60),
        tags=['example'],
) as dag_1:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        # papermill can read notebooks over HTTPS, but the github.com
        # "blob" page serves HTML; the raw file URL is needed instead.
        input_nb="https://raw.githubusercontent.com/apache/airflow/master/airflow/providers/papermill/example_dags/input_notebook.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    )
    # [END howto_operator_papermill]


def check_notebook(inlets, execution_date):
    """
    Verify the message in the notebook
    """
    notebook = sb.read_notebook(inlets[0].url)
    message = notebook.scraps['message']
    print(f"Message in notebook {message} for {execution_date}")

    if message.data != f"Ran from Airflow at {execution_date}!":
Example #7
import airflow
from airflow import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta


default_args = {
    'owner': 'Utsav',
    'start_date': datetime(2019, 1, 25),
}

dag = DAG('papermill_DAG', default_args=default_args, schedule_interval=None)


t1 = PapermillOperator(
    task_id="Job_Schedular",
    input_nb="schedular.ipynb",
    # output_nb="op-{{execution_date}}.ipynb",
    output_nb="op1.ipynb",
    parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    dag=dag,
)

t2 = BashOperator(
    task_id="Finished",
    bash_command="echo Finished",
    dag=dag,
)

t1.set_downstream(t2)
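Example #8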
    if message.data != f"Ran from Airflow at {execution_date}!":
        return False

    return True


args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id='example_papermill_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          dagrun_timeout=timedelta(minutes=60))

run_this = PapermillOperator(
    task_id="run_example_notebook",
    dag=dag,
    input_nb=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          "input_notebook.ipynb"),
    output_nb="/tmp/out-{{ execution_date }}.ipynb",
    parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"})

check_output = PythonOperator(task_id='check_out',
                              python_callable=check_notebook,
                              dag=dag,
                              inlets=AUTO)

check_output.set_upstream(run_this)

if __name__ == "__main__":
    dag.cli()
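
This fuller variant relies on several imports that the excerpt omits. A reasonable reconstruction for Airflow 1.10 (scrapbook is the library behind `sb.read_notebook`; `AUTO` enables automatic lineage inlets):

import os
from datetime import timedelta

import scrapbook as sb

import airflow
from airflow.lineage import AUTO
from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.operators.python_operator import PythonOperator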
Example #9
    'start_date': days_ago(2),
    'provide_context': True
}

with DAG(
    dag_id='example_notebook_var_json',
    default_args=default_args,
    catchup=False,
    schedule_interval='*/4 * * * *',
    dagrun_timeout=timedelta(minutes=60)
) as dag:

    send_this = PapermillOperator(
        task_id="send",
        provide_context=True,
        input_nb="dags/notebooks/example_notebook_var_json/create_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var.ipynb",
        parameters={"msg": "Sended"}
    )

    recive_this = PapermillOperator(
        task_id="recive",
        provide_context=True,
        input_nb="dags/notebooks/example_notebook_var_json/consume_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var.ipynb",
        parameters={"msgs": 'Recived' }
    )

    delete_vars = PapermillOperator(
        task_id="delete",
        provide_context=True,
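Example #10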
import os
from datetime import timedelta
from datetime import datetime
from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'Walter',
    'start_date': datetime(2020, 5, 25),
}

with DAG(
        dag_id='COVID_V2',
        default_args=default_args,
        schedule_interval='@daily',
        dagrun_timeout=timedelta(minutes=10),
) as dag:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="COVID_V2_notebook",
        input_nb="/home/ubuntu/COVID/COVID_V_produccion.ipynb",
        output_nb="/home/ubuntu/COVID/out_COVID_V-{{ execution_date }}.ipynb",
        parameters={'msgs': 'Ran from Airflow at {{ execution_date }}!'})
    # [END howto_operator_papermill]
Example #11
with DAG(
    dag_id='example_notebook_xcom',
    default_args=default_args,
    catchup=False,
    schedule_interval='*/4 * * * *',
    dagrun_timeout=timedelta(minutes=60)
) as dag:

    opr_hello = BashOperator(task_id='say_Hi',
                             bash_command='echo "Hi!!"')
    
    opr_sleep = BashOperator(task_id='sleep_me',
                             bash_command='sleep 5')

    send_this = PapermillOperator(
        task_id="send",
        provide_context=True,
        input_nb="dags/notebooks/send_xcom.ipynb",
        output_nb="dags/notebooks/out-xcom.ipynb",
        parameters={"msg": "Sended"}
    )

    recive_this = PapermillOperator(
        task_id="recive",
        provide_context=True,
        input_nb="dags/notebooks/recive_xcom.ipynb",
        output_nb="dags/notebooks/out-xcom.ipynb",
        parameters={"msgs": "Recived"}
    )

    opr_hello >> send_this >> opr_sleep >> recive_this
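
The `send_xcom.ipynb`/`recive_xcom.ipynb` notebooks are not shown. With papermill, a common pattern is to record values with scrapbook in the sender and read the executed output notebook in the receiver; a minimal sketch of what such cells might contain (all names are illustrative):

# Cell in send_xcom.ipynb: persist a value inside the executed notebook
import scrapbook as sb
sb.glue("msg_out", "Sended")

# Cell in recive_xcom.ipynb: read values glued into the sender's output
nb = sb.read_notebook("dags/notebooks/out-xcom.ipynb")
print(nb.scraps["msg_out"].data)

Example #12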
    n_estimators = [100, 150]
    max_features = ['auto', 'sqrt']
    training_model_tasks = []

    for feature in max_features:
        for estimator in n_estimators:
            ml_id = feature + '_' + str(estimator)
            training_model_tasks.append(
                PapermillOperator(
                    task_id='training_model_{0}'.format(ml_id),
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb='/tmp/out-model-avocado-prediction-{0}.ipynb'.format(ml_id),
                    pool='training_pool',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    }))

    evaluating_rmse = BranchSQLOperator(task_id='evaluating_rmse',
                                        sql='sql/FETCH_MIN_RMSE.sql',
                                        conn_id='postgres',
                                        follow_task_ids_if_true='accurate',
                                        follow_task_ids_if_false='inaccurate')

    accurate = DummyOperator(task_id='accurate')
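
`BranchSQLOperator` follows `follow_task_ids_if_true` or `follow_task_ids_if_false` depending on the boolean-like value the query returns. The `sql/FETCH_MIN_RMSE.sql` file is not included in the excerpt; a hypothetical query in that spirit would be something like `SELECT MIN(rmse) < 0.15 FROM training_results;` (table name and threshold are assumptions).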
Example #13
    os.mkdir(output_directory)
    print("Creating output directory to " + output_directory)


t1 = PythonOperator(
    task_id='init',
    python_callable=initDag,
    provide_context=True,
    dag=dag,
)

t2 = PapermillOperator(
    task_id='notebook01',
    depends_on_past=True,
    input_nb=dag.params['base_directory'] + "Notebook01.ipynb",
    output_nb=dag.params['base_directory'] + "output/{{ execution_date }}/" +
    "Notebook01.ipynb",
    parameters="",
    dag=dag,
)

t3 = PapermillOperator(
    task_id='notebook02',
    depends_on_past=True,
    input_nb=dag.params['base_directory'] + "Notebook02.ipynb",
    output_nb=dag.params['base_directory'] + "output/{{ execution_date }}/" +
    "Notebook02.ipynb",
    parameters="",
    dag=dag,
)
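
`dag.params['base_directory']` implies the DAG was declared with a `params` mapping, which this excerpt omits; a plausible declaration (DAG id and path are illustrative):

dag = DAG(
    'notebook_chain',  # hypothetical dag_id
    default_args=default_args,
    schedule_interval='@daily',
    params={'base_directory': '/usr/local/airflow/notebooks/'},  # assumed value
)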
Example #14
    task_id='drop_postgres_pre_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/ddl/drop_tbl_daily_exchange_rate_pre.sql',
    dag=dag)

# drop staging table
drop_postgres_staging = PostgresOperator(
    task_id='drop_postgres_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/ddl/drop_tbl_daily_exchange_rate_pre.sql',
    dag=dag)

# refresh jupyter notebook
refresh_jupyter_notebook = PapermillOperator(
    task_id='refresh_jupyter_notebook',
    input_nb='/usr/local/airflow/notebooks/rates_analysis.ipynb',
    output_nb='/usr/local/airflow/notebooks/rates_analysis.ipynb',
    parameters={},
    dag=dag)

end_operator = DummyOperator(task_id='stop_dag', dag=dag)

## DAG Task Dependencies

start_operator >> create_postgres_staging
start_operator >> alphavantage_to_s3
start_operator >> create_postgres_pre_staging

create_postgres_staging >> s3_to_postgres_pre_staging
alphavantage_to_s3 >> s3_to_postgres_pre_staging
create_postgres_pre_staging >> s3_to_postgres_pre_staging
Example #15
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

dag = DAG(
    "nb_pipeline",
    default_args=default_args,
    description="A simple notebook pipeline DAG",
    schedule_interval=timedelta(days=1),
)

t1 = PapermillOperator(
    task_id="data_ingest",
    dag=dag,
    input_nb="/mnt/airflow/ingest_data.ipynb",
    output_nb="/mnt/airflow/out/ingest_data_out_{{ execution_date }}.ipynb")

t2 = PapermillOperator(
    task_id="data_prep",
    dag=dag,
    input_nb="/mnt/airflow/data_prep.ipynb",
    output_nb="/mnt/airflow/out/data_prep_out_{{ execution_date }}.ipynb")

t3 = PapermillOperator(
    task_id="model_training",
    dag=dag,
    input_nb="/mnt/airflow/model_training.ipynb",
    output_nb="/mnt/airflow/out/model_training_out_{{ execution_date }}.ipynb",
    parameters={"tree_max_depth": 5})
project = models.Variable.get('gcp_project')
region = models.Variable.get('gcp_region')
zone = models.Variable.get('gcp_zone')
input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_prod')

default_args = {
    'start_date': airflow.utils.dates.days_ago(0),
    'schedule_interval': '@daily',
    'project': project,
    'zone': zone,
    'region': region,
}
with models.DAG('regression_models_prod', default_args=default_args) as dag:
    data_preprocessing = PapermillOperator(
        task_id='data_preprocessing',
        input_nb=input_bucket + '/notebooks/data_preprocessing.ipynb',
        output_nb='/home/airflow/gcs/data/data_preprocessing_out.ipynb',
        parameters={},
    )

    multi_linear_regression = PapermillOperator(
        task_id='multi_linear_regression',
        input_nb=input_bucket + '/notebooks/multi_linear_regression.ipynb',
        output_nb='/home/airflow/gcs/data/multi_linear_regression_out.ipynb',
        parameters={},
        dag=dag)

    random_forest_regression = PapermillOperator(
        task_id='random_forest_regression',
        input_nb=input_bucket + '/notebooks/random_forest_regression.ipynb',
        output_nb='/home/airflow/gcs/data/random_forest_regression_out.ipynb',
        parameters={},
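Example #17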
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'Airflow', 'start_date': days_ago(2)}

with DAG(dag_id='example_parallelism_consumer_json_var',
         default_args=default_args,
         catchup=False,
         schedule_interval='*/2 * * * *',
         dagrun_timeout=timedelta(minutes=60)) as dag:

    create_vars = PapermillOperator(
        task_id="Create",
        input_nb="dags/notebooks/example_parallelism_consumer_json_var/create_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var_parallelism.ipynb",
        parameters={"msg": "Created"})

    print_json_var = PapermillOperator(
        task_id="print_json_var",
        input_nb="dags/notebooks/example_parallelism_consumer_json_var/print_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var_parallelism.ipynb",
        parameters={"msg": "Print"})

    for i in range(3):
        task = PapermillOperator(
            task_id='consumer_json_' + str(i),
            input_nb='dags/notebooks/example_parallelism_consumer_json_var/runme_' +
Example #18
# -*- coding: utf-8 -*-

from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.papermill_operator import PapermillOperator

args = {
    "owner": "vinayak",
    "start_date": days_ago(1),
}

with DAG(
        dag_id="notebook_v1",
        default_args=args,
        schedule_interval="0 0 * * *",
) as dag:
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="/opt/airflow/dags/notebook.ipynb",
        output_nb="/tmp/{{ dag.dag_id }}/{{ run_id }}/notebook.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    )
Example #19
default_args = {'owner': 'Airflow', 'start_date': days_ago(2)}

with DAG(dag_id='example_notebook_parallelism',
         default_args=default_args,
         catchup=False,
         schedule_interval='*/2 * * * *',
         dagrun_timeout=timedelta(minutes=60)) as dag:

    opr_hello = BashOperator(task_id='say_Hi', bash_command='echo "Hi!!"')

    for i in range(3):
        task = PapermillOperator(
            task_id='note_runme_' + str(i),
            input_nb='dags/notebooks/example_notebook_parallelism/runme_' +
            str(i) + '.ipynb',
            output_nb="dags/notebooks/outnbs/out.ipynb",
            parameters={
                "msgs": "Tarefa paralela " + str(i),
                "time": "{{ execution_date }}"
            })
        task >> opr_hello

    opr_sleep = BashOperator(task_id='sleep_me', bash_command='sleep 5')

    opr_hello >> opr_sleep

    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="dags/notebooks/example_notebook_parallelism/hello_world.ipynb",
        output_nb="dags/notebooks/outnbs/out.ipynb",
Example #20
from datetime import timedelta
from airflow import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

dag = DAG(
    'example_papermill_operator',
    schedule_interval=None,
    start_date=days_ago(1),
)

task = PapermillOperator(
    task_id="run_example_notebook",
    input_nb="/tmp/work/example.ipynb",
    output_nb="/tmp/work/example-output-airflow.ipynb",
    parameters={
        "a": 1,
        "b": 2
    },
    dag=dag,
)
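
With `schedule_interval=None`, this DAG never runs on a schedule; it must be triggered explicitly, e.g. with `airflow trigger_dag example_papermill_operator` (Airflow 1.x CLI) or `airflow dags trigger example_papermill_operator` (2.x).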