def subdag_factory(parent_dag_name, child_dag_name, default_args):
    with DAG(dag_id=f'{parent_dag_name}.{child_dag_name}',
             default_args=default_args) as dag:
        model_settings = Variable.get('avocado_dag_model_settings',
                                      deserialize_json=True)
        for feature in model_settings['max_features']:
            for estimator in model_settings['n_estimators']:
                ml_id = feature + '_' + str(estimator)
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    },
                    pool='training_pool')
    return dag
def subdag_factory(parent_dag_name, child_dag_name, default_args):
    with DAG(dag_id='{0}.{1}'.format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:
        model_settings = Variable.get('avocado_dag_model_settings',
                                      deserialize_json=True)
        training_model_tasks = []
        for feature in model_settings['max_features']:
            for estimator in model_settings['n_estimators']:
                ml_id = f'{feature}_{estimator}'
                training_model_tasks.append(
                    PapermillOperator(
                        task_id='training_model_{0}'.format(ml_id),
                        input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                        output_nb='/usr/local/airflow/include/tmp/out-model-avocado-prediction-{0}.ipynb'.format(ml_id),
                        parameters={
                            'filepath': '/usr/local/airflow/include/tmp/avocado.csv',
                            'n_estimators': estimator,
                            'max_features': feature,
                            'ml_id': ml_id
                        },
                        pool='training_pool'))
    return dag
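# Both subdag_factory variants above read the 'avocado_dag_model_settings'
# Airflow Variable as JSON. A minimal sketch of seeding it: the two keys are
# implied by the loops, and the list values mirror the hard-coded
# n_estimators/max_features lists used elsewhere in this section -- treat the
# concrete values as assumptions:
import json

from airflow.models import Variable

Variable.set('avocado_dag_model_settings',
             json.dumps({'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 150]}))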
def test_execute(self, mock_papermill):
    in_nb = "/tmp/does_not_exist"
    out_nb = "/tmp/will_not_exist"
    parameters = {"msg": "hello_world", "train": 1}

    po = PapermillOperator(input_nb=in_nb,
                           output_nb=out_nb,
                           parameters=parameters,
                           task_id="papermill_operator_test",
                           dag=None)
    po.pre_execute(context={})  # make sure to have the inlets
    po.execute(context={})

    mock_papermill.execute_notebook.assert_called_once_with(
        in_nb,
        out_nb,
        parameters=parameters,
        progress_bar=False,
        report_mode=True)
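# A sketch of the harness the test above implies: the mock_papermill argument
# comes from a mock.patch decorator that replaces the papermill module as
# imported inside the operator. The patch target assumes the operator module
# does `import papermill as pm`; adjust it to wherever PapermillOperator lives.
import unittest
from unittest import mock


class TestPapermillOperator(unittest.TestCase):

    @mock.patch('airflow.operators.papermill_operator.pm')
    def test_execute(self, mock_papermill):
        ...  # body as in the snippet above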
# specific language governing permissions and limitations
# under the License.
"""
This DAG uses Papermill to run the notebook "hello_world". Based on the
execution date, it creates an output notebook "out-<date>". All fields,
including the keys in the parameters, are templated.
"""
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'airflow', 'start_date': days_ago(2)}

with DAG(
    dag_id='example_papermill_operator',
    default_args=default_args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
    tags=['example'],
) as dag:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="/tmp/hello_world.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"})
    # [END howto_operator_papermill]
     catchup=False) as dag:

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    training_model = PapermillOperator(
        task_id='training_model',
        input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
        output_nb='/tmp/out-model-avocado-prediction.ipynb',
        parameters={
            'filepath': '/tmp/avocado.csv',
            'n_estimators': 100,
            'max_features': 'auto',
            'output_rmse': '/tmp/out-model-avocado-prediction-rmse.txt'
        })

    evaluating_rmse = BranchPythonOperator(task_id="evaluating_rmse",
                                           python_callable=read_rmse)

    accurate = DummyOperator(task_id='accurate')
    inaccurate = DummyOperator(task_id='inaccurate')

    downloading_data >> waiting_for_data >> training_model >> evaluating_rmse
    evaluating_rmse >> [accurate, inaccurate]
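# `read_rmse` is referenced above but not shown. A minimal sketch of a branch
# callable consistent with the 'output_rmse' parameter and the 'accurate' /
# 'inaccurate' task ids; the RMSE threshold is an assumption:
def read_rmse():
    with open('/tmp/out-model-avocado-prediction-rmse.txt') as f:
        rmse = float(f.read())
    # BranchPythonOperator follows whichever task_id the callable returns
    return 'accurate' if rmse < 15 else 'inaccurate'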
    'owner': 'airflow',
}

with DAG(
    dag_id='example_papermill_operator',
    default_args=default_args,
    schedule_interval='0 0 * * *',
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['example'],
) as dag_1:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="https://github.com/apache/airflow/blob/master/airflow/providers/papermill/example_dags/input_notebook.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    )
    # [END howto_operator_papermill]

    def check_notebook(inlets, execution_date):
        """
        Verify the message in the notebook
        """
        notebook = sb.read_notebook(inlets[0].url)
        message = notebook.scraps['message']
        print(f"Message in notebook {message} for {execution_date}")

        if message.data != f"Ran from Airflow at {execution_date}!":
            return False

        return True
import airflow
from airflow import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'Utsav',
    'start_date': datetime(2019, 1, 25),
}

dag = DAG('papermill_DAG', default_args=default_args, schedule_interval=None)

t1 = PapermillOperator(
    task_id="Job_Schedular",
    input_nb="schedular.ipynb",
    # output_nb="op-{{execution_date}}.ipynb",
    output_nb="op1.ipynb",
    parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    dag=dag,
)

t2 = BashOperator(
    task_id="Finished",
    bash_command="echo Finished",
    dag=dag,
)

t1.set_downstream(t2)
    if message.data != f"Ran from Airflow at {execution_date}!":
        return False

    return True


args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id='example_papermill_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          dagrun_timeout=timedelta(minutes=60))

run_this = PapermillOperator(
    task_id="run_example_notebook",
    dag=dag,
    input_nb=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          "input_notebook.ipynb"),
    output_nb="/tmp/out-{{ execution_date }}.ipynb",
    parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"})

check_output = PythonOperator(task_id='check_out',
                              python_callable=check_notebook,
                              dag=dag,
                              inlets=AUTO)

check_output.set_upstream(run_this)

if __name__ == "__main__":
    dag.cli()
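# check_notebook above reads notebook.scraps['message']; for that scrap to
# exist, the executed notebook has to record it with scrapbook. A sketch of
# the relevant cell in the input notebook (msgs is the parameter papermill
# injects; the default value here is an assumption):
import scrapbook as sb

msgs = "default message"  # overwritten by papermill's injected parameters cell
sb.glue("message", msgs)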
    'start_date': days_ago(2),
    'provide_context': True
}

with DAG(
    dag_id='example_notebook_var_json',
    default_args=default_args,
    catchup=False,
    schedule_interval='*/4 * * * *',
    dagrun_timeout=timedelta(minutes=60)
) as dag:

    send_this = PapermillOperator(
        task_id="send",
        provide_context=True,
        input_nb="dags/notebooks/example_notebook_var_json/create_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var.ipynb",
        parameters={"msg": "Sended"}
    )

    recive_this = PapermillOperator(
        task_id="recive",
        provide_context=True,
        input_nb="dags/notebooks/example_notebook_var_json/consume_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var.ipynb",
        parameters={"msgs": 'Recived'}
    )

    delete_vars = PapermillOperator(
        task_id="delete",
        provide_context=True,
import os
from datetime import timedelta
from datetime import datetime

from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'Walter',
    'start_date': datetime(2020, 5, 25),
}

with DAG(
    dag_id='COVID_V2',
    default_args=default_args,
    schedule_interval='@daily',
    dagrun_timeout=timedelta(minutes=10),
) as dag:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="COVID_V2_notebook",
        input_nb="/home/ubuntu/COVID/COVID_V_produccion.ipynb",
        output_nb="/home/ubuntu/COVID/out_COVID_V-{{ execution_date }}.ipynb",
        parameters={'msgs': 'Ran from Airflow at {{ execution_date }}!'})
    # [END howto_operator_papermill]
with DAG(
    dag_id='example_notebook_xcom',
    default_args=default_args,
    catchup=False,
    schedule_interval='*/4 * * * *',
    dagrun_timeout=timedelta(minutes=60)
) as dag:

    opr_hello = BashOperator(task_id='say_Hi', bash_command='echo "Hi!!"')

    opr_sleep = BashOperator(task_id='sleep_me', bash_command='sleep 5')

    send_this = PapermillOperator(
        task_id="send",
        provide_context=True,
        input_nb="dags/notebooks/send_xcom.ipynb",
        output_nb="dags/notebooks/out-xcom.ipynb",
        parameters={"msg": "Sended"}
    )

    recive_this = PapermillOperator(
        task_id="recive",
        provide_context=True,
        input_nb="dags/notebooks/recive_xcom.ipynb",
        output_nb="dags/notebooks/out-xcom.ipynb",
        parameters={"msgs": "Recived"}
    )

    opr_hello >> send_this >> opr_sleep >> recive_this
n_estimators = [100, 150]
max_features = ['auto', 'sqrt']

training_model_tasks = []
for feature in max_features:
    for estimator in n_estimators:
        ml_id = feature + '_' + str(estimator)
        training_model_tasks.append(
            PapermillOperator(
                task_id='training_model_{0}'.format(ml_id),
                input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                output_nb='/tmp/out-model-avocado-prediction-{0}.ipynb'.format(ml_id),
                pool='training_pool',
                parameters={
                    'filepath': '/tmp/avocado.csv',
                    'n_estimators': estimator,
                    'max_features': feature,
                    'ml_id': ml_id
                }))

evaluating_rmse = BranchSQLOperator(task_id='evaluating_rmse',
                                    sql='sql/FETCH_MIN_RMSE.sql',
                                    conn_id='postgres',
                                    follow_task_ids_if_true='accurate',
                                    follow_task_ids_if_false='inaccurate')

accurate = DummyOperator(task_id='accurate')
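# 'training_pool' above throttles how many training notebooks run at once,
# and the pool must exist before the tasks are scheduled. A sketch of
# creating it; the slot count and description are assumptions (Airflow 2 CLI
# shown, Airflow 1.10 uses `airflow pool -s` instead):
#
#     airflow pools set training_pool 1 "Avocado model training"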
    os.mkdir(output_directory)
    print("Creating output directory at " + output_directory)


t1 = PythonOperator(
    task_id='init',
    python_callable=initDag,
    provide_context=True,
    dag=dag,
)

t2 = PapermillOperator(
    task_id='notebook01',
    depends_on_past=True,
    input_nb=dag.params['base_directory'] + "Notebook01.ipynb",
    output_nb=dag.params['base_directory'] + "output/{{ execution_date }}/" + "Notebook01.ipynb",
    parameters="",
    dag=dag,
)

t3 = PapermillOperator(
    task_id='notebook02',
    depends_on_past=True,
    input_nb=dag.params['base_directory'] + "Notebook02.ipynb",
    output_nb=dag.params['base_directory'] + "output/{{ execution_date }}/" + "Notebook02.ipynb",
    parameters="",
    dag=dag,
)
    task_id='drop_postgres_pre_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/ddl/drop_tbl_daily_exchange_rate_pre.sql',
    dag=dag)

# drop staging table
drop_postgres_staging = PostgresOperator(
    task_id='drop_postgres_staging',
    postgres_conn_id=POSTGRES_CONN_ID,
    sql='sql/ddl/drop_tbl_daily_exchange_rate_pre.sql',
    dag=dag)

# refresh jupyter notebook
refresh_jupyter_notebook = PapermillOperator(
    task_id='refresh_jupyter_notebook',
    input_nb='/usr/local/airflow/notebooks/rates_analysis.ipynb',
    output_nb='/usr/local/airflow/notebooks/rates_analysis.ipynb',
    parameters='',
    dag=dag)

end_operator = DummyOperator(task_id='stop_dag', dag=dag)

## DAG Task Dependencies
start_operator >> create_postgres_staging
start_operator >> alphavantage_to_s3
start_operator >> create_postgres_pre_staging

create_postgres_staging >> s3_to_postgres_pre_staging
alphavantage_to_s3 >> s3_to_postgres_pre_staging
create_postgres_pre_staging >> s3_to_postgres_pre_staging
"email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=5) } dag = DAG( "nb_pipeline", default_args=default_args, description="A simple notebook pipeline DAG", schedule_interval=timedelta(days=1), ) t1 = PapermillOperator( task_id="data_ingest", dag=dag, input_nb="/mnt/airflow/ingest_data.ipynb", output_nb="/mnt/airflow/out/ingest_data_out_{{ execution_date }}.ipynb") t2 = PapermillOperator( task_id="data_prep", dag=dag, input_nb="/mnt/airflow/data_prep.ipynb", output_nb="/mnt/airflow/out/data_prep_out_{{ execution_date }}.ipynb") t3 = PapermillOperator( task_id="model_training", dag=dag, input_nb="/mnt/airflow/model_training.ipynb", output_nb="/mnt/airflow/out/model_training_out_{{ execution_date }}.ipynb", parameters={"tree_max_depth": 5})
project = models.Variable.get('gcp_project')
region = models.Variable.get('gcp_region')
zone = models.Variable.get('gcp_zone')
input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_prod')

default_args = {
    'start_date': airflow.utils.dates.days_ago(0),
    'schedule_interval': '@daily',
    'project': project,
    'zone': zone,
    'region': region,
}

with models.DAG('regression_models_prod', default_args=default_args) as dag:
    data_preprocessing = PapermillOperator(
        task_id='data_preprocessing',
        input_nb=input_bucket + '/notebooks/data_preprocessing.ipynb',
        output_nb='/home/airflow/gcs/data/data_preprocessing_out.ipynb',
        parameters={},
    )

    multi_linear_regression = PapermillOperator(
        task_id='multi_linear_regression',
        input_nb=input_bucket + '/notebooks/multi_linear_regression.ipynb',
        output_nb='/home/airflow/gcs/data/multi_linear_regression_out.ipynb',
        parameters={},
        dag=dag)

    random_forest_regression = PapermillOperator(
        task_id='random_forest_regression',
        input_nb=input_bucket + '/notebooks/random_forest_regression.ipynb',
        output_nb='/home/airflow/gcs/data/random_forest_regression_out.ipynb',
        parameters={},
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'Airflow', 'start_date': days_ago(2)}

with DAG(dag_id='example_parallelism_consumer_json_var',
         default_args=default_args,
         catchup=False,
         schedule_interval='*/2 * * * *',
         dagrun_timeout=timedelta(minutes=60)) as dag:

    create_vars = PapermillOperator(
        task_id="Create",
        input_nb="dags/notebooks/example_parallelism_consumer_json_var/create_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var_parallelism.ipynb",
        parameters={"msg": "Created"})

    print_json_var = PapermillOperator(
        task_id="print_json_var",
        input_nb="dags/notebooks/example_parallelism_consumer_json_var/print_json_var.ipynb",
        output_nb="dags/notebooks/outnbs/out-json_var_parallelism.ipynb",
        parameters={"msg": "Print"})

    for i in range(3):
        task = PapermillOperator(
            task_id='consumer_json_' + str(i),
            input_nb='dags/notebooks/example_parallelism_consumer_json_var/runme_' +
# -*- coding: utf-8 -*-
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.papermill_operator import PapermillOperator

args = {
    "owner": "vinayak",
    "start_date": days_ago(1),
}

with DAG(
    dag_id="notebook_v1",
    default_args=args,
    schedule_interval="0 0 * * *",
) as dag:
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="/opt/airflow/dags/notebook.ipynb",
        output_nb="/tmp/{{ dag.dag_id }}/{{ run_id }}/notebook.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    )
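# The templated output path above lands in a per-run directory; depending on
# the papermill version, that directory may need to exist before the task
# runs. A sketch of a hypothetical upstream task that creates it (not part of
# the original snippet):
from airflow.operators.bash_operator import BashOperator

make_output_dir = BashOperator(
    task_id="make_output_dir",
    bash_command="mkdir -p /tmp/{{ dag.dag_id }}/{{ run_id }}",
    dag=dag,
)
make_output_dir >> run_this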
default_args = {'owner': 'Airflow', 'start_date': days_ago(2)}

with DAG(dag_id='example_notebook_parallelism',
         default_args=default_args,
         catchup=False,
         schedule_interval='*/2 * * * *',
         dagrun_timeout=timedelta(minutes=60)) as dag:

    opr_hello = BashOperator(task_id='say_Hi', bash_command='echo "Hi!!"')

    for i in range(3):
        task = PapermillOperator(
            task_id='note_runme_' + str(i),
            input_nb='dags/notebooks/example_notebook_parallelism/runme_' + str(i) + '.ipynb',
            output_nb="dags/notebooks/outnbs/out.ipynb",
            parameters={
                "msgs": "Parallel task " + str(i),
                "time": "{{ execution_date }}"
            })
        task >> opr_hello

    opr_sleep = BashOperator(task_id='sleep_me', bash_command='sleep 5')

    opr_hello >> opr_sleep

    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="dags/notebooks/example_notebook_parallelism/hello_world.ipynb",
        output_nb="dags/notebooks/outnbs/out.ipynb",
from datetime import timedelta

from airflow import DAG
from airflow.operators.papermill_operator import PapermillOperator
from airflow.utils.dates import days_ago

dag = DAG(
    'example_papermill_operator',
    schedule_interval=None,
    start_date=days_ago(1),
)

task = PapermillOperator(
    task_id="run_example_notebook",
    input_nb="/tmp/work/example.ipynb",
    output_nb="/tmp/work/example-output-airflow.ipynb",
    parameters={
        "a": 1,
        "b": 2
    },
    dag=dag,
)
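# Papermill applies the parameters dict above by rewriting the cell tagged
# "parameters" in the copy of /tmp/work/example.ipynb. A sketch of what that
# tagged cell might hold so `a` and `b` have defaults when the notebook runs
# outside Airflow (the defaults here are assumptions):
a = 0  # overridden to 1 by the operator's parameters
b = 0  # overridden to 2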