def create_dag(client):
    dag = DAG(DAG_ID, default_args=default_args, schedule_interval=None)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        # One schedule/monitor pair per table, fanned out between start and finish.
        for table in airflow_vars['dags']['vibe_to_lake']['tables']:
            pusher_task_id = f'schedule_dataflow_{table}'
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project,
                template_name='load_vibe_to_lake',
                job_name=f'vibe-to-lake-{client}-{table}',
                job_parameters={
                    'client': client,
                    'table': f'`{project}.pyr_{client}_{env}.{table}`',
                    'dest': f'{project}:lake.{table}'
                },
                provide_context=True
            )
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['vibe_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['vibe_to_lake']['poke_timeout'],
                dag=dag
            )
            start_task.set_downstream(schedule_df_task)
            schedule_df_task.set_downstream(monitor_df_job_task)
            monitor_df_job_task.set_downstream(finish_task)
        # Keep start and finish connected even if the table list is empty.
        start_task >> finish_task
    return dag
def create_dag(client):
    dag = DAG(DAG_ID, default_args=default_args, schedule_interval=None)
    with dag:
        # Load the table list shipped alongside the DAG files.
        tables = []
        with open(f'{settings.DAGS_FOLDER}/table_lists/table-list.json', 'r') as f:
            file_json_content = f.read()
            tables = json.loads(file_json_content)

        should_run_task = BranchPythonOperator(
            task_id='should_run',
            python_callable=should_run
        )
        start_sql_instance_task = BashOperator(
            task_id='start_sql_instance',
            bash_command=start_sql_cmd
        )
        pre_delete_database_task = BashOperator(
            task_id=f'pre_delete_{database_name}_database',
            bash_command=delete_db_cmd
        )
        create_db_task = BashOperator(
            task_id=f'create_{database_name}_database',
            bash_command=create_db_cmd
        )
        import_db_task = BashOperator(
            task_id=f'import_{database_name}_database',
            bash_command=import_db_cmd
        )
        delete_db_import_file_task = PythonOperator(
            task_id='delete_db_import_file',
            python_callable=delete_db_import_file
        )
        post_delete_database_task = BashOperator(
            task_id=f'post_delete_{database_name}_database',
            bash_command=delete_db_cmd
        )
        stop_sql_instance_task = BashOperator(
            task_id='stop_sql_instance',
            bash_command=stop_sql_cmd
        )
        finish_task = DummyOperator(
            task_id='finish'
        )

        try:
            # Fan out a Dataflow schedule/monitor pair per table between the
            # database import and the import-file cleanup.
            for t in tables:
                pusher_task_id = f'schedule_dataflow_job_for_{t["table"]}'
                schedule_df_task = ScheduleDataflowJobOperator(
                    task_id=pusher_task_id,
                    project=project_id,
                    template_name='load_sql_to_bq',
                    job_name=f'load-{client}-{t["table"]}-sql-to-bq',
                    job_parameters={
                        'env': env,
                        'client': client,
                        'bq_table': f'{project_id}:{database_name}.{t["table"]}',
                        'table': t["table"],
                        'key_field': t["keyField"]
                    },
                    provide_context=True
                )
                monitor_df_job_task = DataflowJobStateSensor(
                    task_id=f'monitor_df_job_{t["table"]}',
                    pusher_task_id=pusher_task_id,
                    poke_interval=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_interval'],
                    timeout=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_timeout']
                )
                import_db_task.set_downstream(schedule_df_task)
                schedule_df_task.set_downstream(monitor_df_job_task)
                monitor_df_job_task.set_downstream(delete_db_import_file_task)
        except Exception as e:
            log.error(e)

        (
            should_run_task
            >> start_sql_instance_task
            >> pre_delete_database_task
            >> create_db_task
            >> import_db_task
            >> delete_db_import_file_task
            >> post_delete_database_task
            >> stop_sql_instance_task
            >> finish_task
        )
    return dag
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)
            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)
            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)
            parse_query_task = PythonOperator(
                task_id=f'parse_query_{table}',
                python_callable=parse_query,
                op_args=[table],
                provide_context=True)
            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)
            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)
            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}',
                env=env,
                table=table)
            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)
        start_task >> finish_task
    return dag
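# --- Sketch, not part of the original factories above -----------------------
# Airflow's scheduler only picks up DAG objects that are bound at module level,
# so a factory's return value is normally registered in globals(). For the
# client-parameterized factories this is typically done in a loop over the
# configured clients (with DAG_ID built per client so each dag_id stays
# unique); a no-argument factory like the last one is registered with a single
# call. CLIENTS below is a hypothetical placeholder for the real client
# configuration (e.g. an Airflow Variable).
CLIENTS = ['client_a', 'client_b']  # hypothetical client identifiers

for client_name in CLIENTS:
    client_dag = create_dag(client_name)
    globals()[client_dag.dag_id] = client_dag  # expose the DAG to the scheduler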