def create_dag(client):
    dag = DAG(DAG_ID, default_args=default_args, schedule_interval=None)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        for table in airflow_vars['dags']['vibe_to_lake']['tables']:
            pusher_task_id = f'schedule_dataflow_{table}'
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project,
                template_name='load_vibe_to_lake',
                job_name=f'vibe-to-lake---client---{table}',
                job_parameters={
                    'client': '--client--',
                    'table': f'`{project}.pyr_--client--_{env}.{table}`',
                    'dest': f'{project}:lake.{table}'
                },
                provide_context=True)
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['vibe_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['vibe_to_lake']['poke_timeout'],
                dag=dag)
            start_task.set_downstream(schedule_df_task)
            schedule_df_task.set_downstream(monitor_df_job_task)
            monitor_df_job_task.set_downstream(finish_task)
        start_task >> finish_task
    return dag
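# A minimal sketch of the module-level wiring the factory above assumes; this
# is an assumption, not part of the source. Shared settings like `airflow_vars`
# are typically read from an Airflow Variable stored as JSON, and the DAG must
# exist at module scope for the scheduler to discover it. The Variable name
# 'airflow_vars' is assumed; '--client--' mirrors the placeholder templating
# already used in the job parameters above.
from airflow.models import Variable

airflow_vars = Variable.get('airflow_vars', deserialize_json=True)

# Register the DAG at import time so Airflow picks it up.
globals()[DAG_ID] = create_dag('--client--')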
def create_dag():
    dag = DAG(DAG_ID,
              default_args=default_args,
              schedule_interval='@hourly',
              catchup=False)
    with dag:
        finish_task = DummyOperator(task_id='finish')
        pusher_task_id = 'schedule_df_wrench_to_lake'
        should_run_task = BranchPythonOperator(task_id='should_run',
                                               python_callable=should_run)
        schedule_df_task = ScheduleDataflowJobOperator(
            task_id=pusher_task_id,
            project=project_id,
            template_name='load_wrench_to_lake',
            job_name='wrench-to-lake',
            job_parameters={},
            provide_context=True)
        monitor_df_job_task = DataflowJobStateSensor(
            task_id='monitor_df_job',
            pusher_task_id=pusher_task_id,
            poke_interval=airflow_vars['dags']['wrench_to_lake']['poke_interval'],
            timeout=airflow_vars['dags']['wrench_to_lake']['poke_timeout'],
            dag=dag)
        move_files_task = PythonOperator(task_id='move_processed_files',
                                         python_callable=move_files)
        (should_run_task >> schedule_df_task >> monitor_df_job_task
         >> move_files_task >> finish_task)
    return dag
def test_returns_job(env):
    with DAG(dag_id='schedule_dataflow_test',
             start_date=datetime.now(),
             schedule_interval=None) as dag:
        task = ScheduleDataflowJobOperator(
            project=env['project'],
            template_name='load_vibe_to_lake',
            job_name='schedule-dataflow-test-{}'.format(int(time.time())),
            job_parameters={
                'client': 'bluesun',
                'table': 'pyr_bluesun_local.tree_user_types',
                'dest': '{}:lake.tree_user_types'.format(env['project'])
            },
            dag=dag,
            task_id='test_task')
    ti = TaskInstance(task=task, execution_date=datetime.now())
    job = task.execute(ti.get_template_context())
    assert job['projectId'] == env['project']
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='0 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        storage = CloudStorage.factory(project_id)
        cdc_imports_bucket = storage.get_bucket(bucket)
        cdc_imports_processed_bucket = storage.get_bucket(processed_bucket)
        for files_startwith, table in table_map.items():
            pusher_task_id = f'schedule_df_gcs_to_lake_{table}'
            continue_if_file_task = BranchPythonOperator(
                task_id=f'continue_if_file_{files_startwith}',
                python_callable=should_continue,
                op_args=[files_startwith, cdc_imports_bucket, table])
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project_id,
                template_name='load_cdc_from_gcs_to_lake',
                job_name=f'gcs-to-lake-{table}',
                job_parameters={
                    'files_startwith': files_startwith,
                    'dest': f'{project_id}:lake.{table}'
                },
                provide_context=True)
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['cdc_from_gcs_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['cdc_from_gcs_to_lake']['poke_timeout'],
                dag=dag)
            move_files_task = PythonOperator(
                task_id=f'move_processed_files_{files_startwith}',
                python_callable=storage.move_files,
                op_args=[
                    files_startwith, cdc_imports_bucket,
                    cdc_imports_processed_bucket
                ])
            (start_task >> continue_if_file_task >> schedule_df_task
             >> monitor_df_job_task >> move_files_task >> finish_task)
    return dag
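# Hypothetical sketch of the `should_continue` callable used by the
# BranchPythonOperator above; the real implementation is not shown in the
# source. A branch callable returns the task_id to follow, so this checks the
# imports bucket for objects with the expected prefix and otherwise routes to
# 'finish'. It assumes the bucket object behaves like a google.cloud.storage
# Bucket and that 'finish' is a reachable branch target in the real DAG.
def should_continue(files_startwith, cdc_imports_bucket, table):
    blobs = list(cdc_imports_bucket.list_blobs(prefix=files_startwith,
                                               max_results=1))
    if blobs:
        return f'schedule_df_gcs_to_lake_{table}'
    return 'finish'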
def _create_df_task(dag):
    return ScheduleDataflowJobOperator(
        project=env['project'],
        template_name='load_vibe_to_lake',
        job_name='schedule-dataflow-test-{}'.format(int(time.time())),
        job_parameters={
            'client': 'bluesun',
            'table': 'pyr_bluesun_local.tree_user_types',
            'dest': '{}:lake.tree_user_types'.format(env['project'])
        },
        pull_parameters=[{
            'key': 'one'
        }, {
            'key': 'two',
            'param_name': 'two-specific-key'
        }, {
            'task_id': 'three',
            'param_name': 'three'
        }],
        dag=dag,
        task_id='test_task')
def create_dag(client):
    dag = DAG(DAG_ID, default_args=default_args, schedule_interval=None)
    with dag:
        tables = []
        with open(f'{settings.DAGS_FOLDER}/table_lists/table-list.json', 'r') as f:
            file_json_content = f.read()
            tables = json.loads(file_json_content)
        should_run_task = BranchPythonOperator(
            task_id='should_run',
            python_callable=should_run)
        start_sql_instance_task = BashOperator(
            task_id='start_sql_instance',
            bash_command=start_sql_cmd)
        pre_delete_database_task = BashOperator(
            task_id=f'pre_delete_{database_name}_database',
            bash_command=delete_db_cmd)
        create_db_task = BashOperator(
            task_id=f'create_{database_name}_database',
            bash_command=create_db_cmd)
        import_db_task = BashOperator(
            task_id=f'import_{database_name}_database',
            bash_command=import_db_cmd)
        delete_db_import_file_task = PythonOperator(
            task_id='delete_db_import_file',
            python_callable=delete_db_import_file)
        post_delete_database_task = BashOperator(
            task_id=f'post_delete_{database_name}_database',
            bash_command=delete_db_cmd)
        stop_sql_instance_task = BashOperator(
            task_id='stop_sql_instance',
            bash_command=stop_sql_cmd)
        finish_task = DummyOperator(task_id='finish')
        try:
            for t in tables:
                pusher_task_id = f'schedule_dataflow_job_for_{t["table"]}'
                schedule_df_task = ScheduleDataflowJobOperator(
                    task_id=pusher_task_id,
                    project=project_id,
                    template_name='load_sql_to_bq',
                    job_name=f'load---client---{t["table"]}-sql-to-bq',
                    job_parameters={
                        'env': env,
                        'client': '--client--',
                        'bq_table': f'{project_id}:{database_name}.{t["table"]}',
                        'table': t["table"],
                        'key_field': t["keyField"]
                    },
                    provide_context=True)
                monitor_df_job_task = DataflowJobStateSensor(
                    task_id=f'monitor_df_job_{t["table"]}',
                    pusher_task_id=pusher_task_id,
                    poke_interval=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_interval'],
                    timeout=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_timeout'])
                import_db_task.set_downstream(schedule_df_task)
                schedule_df_task.set_downstream(monitor_df_job_task)
                monitor_df_job_task.set_downstream(delete_db_import_file_task)
        except Exception as e:
            log.error(e)
        (should_run_task >> start_sql_instance_task >> pre_delete_database_task
         >> create_db_task >> import_db_task >> delete_db_import_file_task
         >> post_delete_database_task >> stop_sql_instance_task >> finish_task)
    return dag
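# Hypothetical sketches of the module-level gcloud commands the BashOperators
# above assume; the real commands are not shown in the source, and the
# instance name and import bucket below are assumptions. The flags themselves
# are standard Cloud SQL CLI usage.
start_sql_cmd = (
    f'gcloud sql instances patch {instance_name} --activation-policy=ALWAYS')
stop_sql_cmd = (
    f'gcloud sql instances patch {instance_name} --activation-policy=NEVER')
create_db_cmd = (
    f'gcloud sql databases create {database_name} --instance={instance_name}')
delete_db_cmd = (
    f'gcloud sql databases delete {database_name} '
    f'--instance={instance_name} --quiet')
import_db_cmd = (
    f'gcloud sql import sql {instance_name} '
    f'gs://{import_bucket}/{database_name}.sql '
    f'--database={database_name} --quiet')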
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)
            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)
            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)
            parse_query_task = PythonOperator(
                task_id=f'parse_query_{table}',
                python_callable=parse_query,
                op_args=[table],
                provide_context=True)
            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)
            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)
            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}',
                env=env,
                table=table)
            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)
        start_task >> finish_task
    return dag
def test_already_running_then_skip(env, setup_teardown, airflow_session):
    def datafile(filename):
        return os.path.join('/workspace/airflow/dags/libs/shared/data', filename)

    # Save these snippets for later in case we need to mock a success. - Stu M. 4/29/19
    # http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    # requestBuilder = RequestMockBuilder(
    #     {'dataflow.projects.templates.launch': (None, '{"job": ""}')}
    # )
    # with pytest.raises(HttpError) as e:
    #     job = task.execute(ti.get_template_context())
    # assert e.resp.status == 409

    http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    errorResponse = httplib2.Response({
        'status': '409',
        'reason': 'Server Error'
    })
    requestBuilder = RequestMockBuilder(
        {'dataflow.projects.templates.launch': (errorResponse, b'')})
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    task = ScheduleDataflowJobOperator(
        project=env['project'],
        template_name='load_vibe_to_lake',
        job_name='schedule-dataflow-test-{}'.format(int(time.time())),
        job_parameters={
            'client': 'bluesun',
            'table': 'pyr_bluesun_local.tree_user_types',
            'dest': '{}:lake.tree_user_types'.format(env['project'])
        },
        dag=dag,
        task_id='schedule_dataflow_operation',
        http=http,
        requestBuilder=requestBuilder)
    middle_task = DummyOperator(task_id='middle_task', dag=dag)
    finish_task = DummyOperator(task_id='finish', dag=dag)
    task >> middle_task >> finish_task
    dag.clear()
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    with airflow_session() as session:
        tis = session.query(TaskInstance).filter(
            TaskInstance.dag_id == dag.dag_id,
            TaskInstance.execution_date == DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'schedule_dataflow_operation':
                assert ti.state == State.SUCCESS
            elif ti.task_id == 'middle_task':
                assert ti.state == State.SKIPPED
            elif ti.task_id == 'finish':
                assert ti.state == State.SKIPPED
def create_dag():
    dag = DAG(DAG_ID,
              catchup=False,
              default_args=default_args,
              schedule_interval='@hourly')
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        for table in get_airflow_vars()['dags'][DAG_ID]['tables']:
            table = table['name']
            parsed_table = gcloud.parse_table_name(table)
            get_checkpoint_task = GetCheckpointOperator(
                task_id='get_checkpoint_{}'.format(table),
                env=env,
                target=table,
                sources=[table])
            continue_if_data_task = BranchPythonOperator(
                task_id='continue_if_data_{}'.format(table),
                python_callable=continue_if_data,
                op_args=[table],
                trigger_rule='all_done',
                provide_context=True)
            clear_gcs_bucket_by_table_task = PythonOperator(
                task_id='clear_gcs_bucket_{}'.format(table),
                python_callable=clear_gcs_bucket_by_table,
                op_args=[env, table])
            parse_query_task = PythonOperator(
                task_id=f'parse_query_{table}',
                python_callable=parse_query,
                op_args=[table],
                provide_context=True)
            dataflow_task = ScheduleDataflowJobOperator(
                task_id=f'schedule_dataflow_{table}',
                project=gcloud.project(env),
                template_name='offload_bq_to_cs',
                job_name=f'bq-to-wrench-{parsed_table}',
                job_parameters={
                    'destination': 'gs://{}/{}/{}'.format(
                        gcs_bucket, table, f'bq-to-wrench-{parsed_table}')
                },
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)
            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_dataflow_{table}',
                pusher_task_id=f'schedule_dataflow_{table}',
                poke_interval=get_airflow_vars()['dags'][DAG_ID]['poke_interval'],
                timeout=get_airflow_vars()['dags'][DAG_ID]['poke_timeout'],
                dag=dag)
            gcs_to_wrench_s3_task = PythonOperator(
                task_id='gcs_to_wrench_s3_{}'.format(table),
                python_callable=gcs_to_wrench_s3,
                op_args=[env, table])
            commit_checkpoint_task = SetCheckpointOperator(
                task_id='commit_checkpoint_{}'.format(table),
                env=env,
                table=table)
            (start_task >> get_checkpoint_task >> continue_if_data_task
             >> clear_gcs_bucket_by_table_task >> parse_query_task
             >> dataflow_task >> monitor_dataflow_task >> gcs_to_wrench_s3_task
             >> commit_checkpoint_task >> finish_task)
    return dag
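# Hypothetical sketch of the `gcs_to_wrench_s3` callable referenced above; the
# real helper is not shown in the source. The destination bucket name
# `wrench_s3_bucket`, the prefix layout, and the use of boto3 are assumptions;
# a production helper would likely stream objects rather than buffer them.
import boto3
from google.cloud import storage


def gcs_to_wrench_s3(env, table):
    gcs = storage.Client()
    s3 = boto3.client('s3')
    # Copy every exported object for this table from the GCS staging bucket
    # into the wrench S3 bucket, preserving object names.
    for blob in gcs.list_blobs(gcs_bucket, prefix=f'{table}/'):
        s3.put_object(Bucket=wrench_s3_bucket,  # assumed destination bucket
                      Key=blob.name,
                      Body=blob.download_as_bytes())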