dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='tsw/sd_paving_imcat_datasd.csv', on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: send file update email to interested parties send_last_file_updated_email = PoseidonEmailFileUpdatedOperator( task_id='send_last_file_updated', to='[email protected],[email protected],[email protected]', subject='IMCAT Streets File Updated', file_url='http://{}/{}'.format(conf['dest_s3_bucket'], 'tsw/sd_paving_imcat_datasd.csv'), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Execution order #: streets_latest_only must run before get_streets_data get_streets_data.set_upstream(streets_latest_only) #: upload_streets_data is dependent on successful run of get_streets_data upload_streets_data.set_upstream(get_streets_data) #: email notification is sent after the data was uploaded to S3 send_last_file_updated_email.set_upstream(upload_streets_data)
#: Fetch the files named in fname_list into dsd_temp_dir
get_code_enf_files = PythonOperator(
    task_id='get_code_enf_files',
    python_callable=dfg.get_files,
    op_kwargs={
        'fname_list': fname_list,
        'target_dir': dsd_temp_dir,
    },
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_code_enf_md = get_seaboard_update_dag(
    'code-enforcement-violations.md', dag)

#: Execution rules
#: dsd_ce_latest_only gates the download task
dsd_ce_latest_only.set_downstream(get_code_enf_files)

for i in fname_list:
    #: Create fme shell command (one bash task per file name)
    build_csv_task = BashOperator(
        task_id='get_' + i,
        bash_command=get_bash_command(i),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Each build task runs after the files have been downloaded
    get_code_enf_files.set_downstream(build_csv_task)
#: Upload prod file to S3
cfs_to_S3 = S3FileTransferOperator(
    task_id='cfs_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_calls_for_service_' + curr_year + '_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_calls_for_service_' + curr_year + '_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cfs_md = get_seaboard_update_dag('police-calls-for-service.md', dag)

#: Execution rules:
#: pd_cfs_latest_only -> get_cfs_data -> process_cfs_data -> cfs_to_S3
#: -> update_pd_cfs_md
pd_cfs_latest_only.set_downstream(get_cfs_data)
get_cfs_data.set_downstream(process_cfs_data)
process_cfs_data.set_downstream(cfs_to_S3)
#: Github update depends on S3 upload success.
cfs_to_S3.set_downstream(update_pd_cfs_md)
return None # load new data to mongodb load_new_data_task = PythonOperator( task_id='load_new_data', python_callable=load_new_data, dag=dag) def extract_type(ds, **kwargs): year, month, day = ds.split('-') # 2016-04-22 c_ds = "%s/%s/%s" % (day, month, year) # 15/12/2014 count = 0 tp = kwargs['tp'] keyword = kwargs['keyword'] for andamento in Andamentos.objects(data=c_ds): texto_lw = andamento.texto.lower() if keyword in texto_lw: andamento.tipo = tp andamento.save() count += 1 return count for tp in PROGRESS_TYPES: extract_tipo_task = PythonOperator( task_id='extract_%s_task' % (tp,), python_callable=extract_type, op_kwargs={'tp': tp, 'keyword': PROGRESS_TYPES[tp]}, dag=dag, provide_context=True) extract_tipo_task.set_upstream(load_new_data_task)
on_success_callback=notify, dag=dag) #: update permits.md file update_permits_md = get_seaboard_update_dag('permits.md', dag) #: update permits.md file update_solar_md = get_seaboard_update_dag('solar-permits.md', dag) #: Execution rules #: dsd_permits_latest_only must run before get_permits_files get_permits_files.set_upstream(dsd_permits_latest_only) #: clean_data tasks are executed after get_approvals_files task clean_data.set_upstream(get_permits_files) #: upload_dsd tasks are executed after clean_data tasks join_bids.set_upstream(clean_data) #: subset_solar tasks are executed after clean_data tasks subset_solar.set_upstream(join_bids) #: upload_dsd tasks are executed after subset_solar tasks upload_dsd_permits.set_upstream(subset_solar) #: upload_dsd tasks are executed after clean_data tasks upload_solar_permits.set_upstream(subset_solar)
sleep_time = context['task_instance'].xcom_pull(key='sleep')['b'] time.sleep(sleep_time) # push by returning return 2 def ab_task(**context): a, b = context['task_instance'].xcom_pull(task_ids=['a_task', 'b_task']) print('{} + {}: {}'.format(a, b, a + b)) return (a + b) dag = DAG('xcom_add', description='Simple tutorial DAG', schedule_interval=None, start_date=datetime(2018, 9, 3), catchup=False) t_src = PythonOperator(task_id='source_task', python_callable=source_task, provide_context=True, dag=dag) t_a = PythonOperator(task_id='a_task', python_callable=a_task, provide_context=True, dag=dag) t_b = PythonOperator(task_id='b_task', python_callable=b_task, provide_context=True, dag=dag) t_ab = PythonOperator(task_id='ab_task', python_callable=ab_task, provide_context=True, dag=dag) t_src.set_downstream(t_a) t_src.set_downstream(t_b) t_ab.set_upstream(t_a) t_ab.set_upstream(t_b)
provide_context=True, python_callable=clear_export_folder, dag=dag ) export_athena_scifi_table = AWSAthenaOperator( task_id="export_athena_scifi_table", #query=export_athena_scifi_table_query, query=export_athena_scifi_table_query2, workgroup = "devday-demo", database=athena_db, sleep_time = 60, output_location='s3://'+s3_dlake+"/"+athena_output+'export_athena_scifi_table' ) export_scifi_tofile = PythonOperator ( task_id='export_scifi_tofile', provide_context=True, python_callable=export_scifi_tofile, dag=dag ) check_athena_export_table.set_upstream(disp_variables) drop_athena_export_table.set_upstream(check_athena_export_table) check_athena_export_table_done.set_upstream(check_athena_export_table) check_athena_export_table_pass.set_upstream(drop_athena_export_table) check_athena_export_table_pass.set_upstream(check_athena_export_table_done) export_athena_scifi_table.set_upstream(clear_export_folder) clear_export_folder.set_upstream(check_athena_export_table_pass) export_scifi_tofile.set_upstream(export_athena_scifi_table)
df = df.astype('float64') df.to_csv('dags/c2k_final.csv') default_args = { 'owner': 'Israel Z', 'start_date': dt.datetime(2018, 5, 9), 'retries': 1, 'retry_delay': dt.timedelta(minutes=5), } with DAG('flow_pandas', default_args=default_args, schedule_interval='*/10 * * * *', ) as dag: download = PythonOperator(task_id='download', python_callable=download) dropn = PythonOperator(task_id='dropn', python_callable=dropn) fill = PythonOperator(task_id='fill', python_callable=fill) cast = PythonOperator(task_id='cast', python_callable=cast) # Dependencies dropn.set_upstream(download) fill.set_upstream(dropn) cast.set_upstream(fill)
dag = DAG(dag_id='sire_docs', default_args=args, start_date=start_date, schedule_interval=schedule) sire_docs_latest_only = LatestOnlyOperator(task_id='sire_docs_latest_only', dag=dag) #: Get sire tables get_doc_tables = PythonOperator( task_id='get_sire_tables', python_callable=get_sire, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Execution rules #: sire_docs_latest_only must run before get_doc_tables get_doc_tables.set_upstream(sire_docs_latest_only) files = [f for f in os.listdir(conf['prod_data_dir'])] for f in files: if f.split('_')[0] == "sire": #: Upload sire prod files to S3 upload_doc_tables = S3FileTransferOperator( task_id='upload_{}'.format(f), source_base_path=conf['prod_data_dir'], source_key=f, dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='city_docs/{}'.format(f), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify,
on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Update portal modified date update_parking_trans_md = get_seaboard_update_dag('parking-meters-transactions.md', dag) #: Execution Rules #: parking_meters_latest_only must run before get_parking_files get_parking_files.set_upstream(parking_meters_latest_only) #: Download Files, build prod file. #: build_prod_file depends on get_parking_files: build_prod_file.set_upstream(get_parking_files) #: Upload Prod File #: upload_prod_file depends on build_prod_file upload_prod_file.set_upstream(build_prod_file) #: Build Aggs #: build_by_month_aggregation depends on build_prod_file: build_by_month_aggregation.set_upstream(build_prod_file) #: build_by_day_aggregation depends on build_prod_file: build_by_day_aggregation.set_upstream(build_prod_file) #: Upload Aggs
on_success_callback=notify, dag=dag) #: Upload prod cip_datasd.csv file to S3 upload_cip_data = S3FileTransferOperator( task_id='upload_cip_data', source_base_path=conf['prod_data_dir'], source_key='cip_{0}_datasd.csv'.format(fiscal_yr), dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='cip/' + 'cip_{0}_datasd.csv'.format(fiscal_yr), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Update portal modified date update_cip_md = get_seaboard_update_dag('cip.md', dag) #: Execution order #: cip_latest_only must run before get_cip_data get_cip_data.set_upstream(cip_latest_only) #: upload_cip_data is dependent on successful run of get_cip_data upload_cip_data.set_upstream(get_cip_data) #: upload_cip_data must succeed before updating github update_cip_md.set_upstream(upload_cip_data)
# first task: run ``date``
date_operator = BashOperator(
    task_id='date_task',
    bash_command='date',
    dag=dag)

# -----------------------------------------------------------------------------
# second operator: run ``sleep 5``

sleep_operator = BashOperator(
    task_id='sleep_task',
    depends_on_past=False,
    bash_command='sleep 5',
    dag=dag)

# -----------------------------------------------------------------------------
# third operator: Python callable returning a greeting


def print_hello():
    """Return the greeting string used by ``hello_task``."""
    return 'Hello world!'


hello_operator = PythonOperator(
    task_id='hello_task',
    python_callable=print_hello,
    dag=dag)

# -----------------------------------------------------------------------------
# dependencies: date_task fans out to sleep_task and hello_task

date_operator.set_downstream(sleep_operator)
date_operator.set_downstream(hello_operator)
source_base_path=conf['prod_data_dir'], source_key='stormwater_violations_merged.geojson', dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='tsw_int/stormwater_violations_merged.geojson', on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Execution rules # Get VPM violations runs after latest only get_vpm_violations.set_upstream(violations_latest_only) # Get salesforce violations runs after latest only get_sf_violations.set_upstream(violations_latest_only) # Get pts violations runs after latest only get_pts_violations.set_upstream(violations_latest_only) # SW Violations merge runs after get_pts and get_sf combine_sw_violations.set_upstream(get_sf_violations) combine_sw_violations.set_upstream(get_pts_violations) combine_sw_violations.set_upstream(get_vpm_violations) # Upload of CSV happens after combine violations_csv_to_s3.set_upstream(combine_sw_violations) violations_geojson_to_s3.set_upstream(combine_sw_violations) violations_csv_null_geos_to_s3.set_upstream(combine_sw_violations)
get_task = PythonOperator( task_id='get_' + machine_service_name, python_callable=get_requests_service_name, op_kwargs={ 'service_name': service_name, 'machine_service_name': machine_service_name }, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) service_tasks.append(get_task) #: join_council_districts must run before get_task get_task.set_upstream(create_prod_files) if i == 'pothole': #: get_task must run before sonar potholes get_task.set_downstream(create_potholes_sonar) filename = conf['prod_data_dir'] + "/get_it_done_*.csv" files = [os.path.basename(x) for x in glob.glob(filename)] for index, file_ in enumerate(files): file_name = file_.split('.')[0] name_parts = file_name.split('_') task_name = '_'.join(name_parts[3:-2]) md_name = '-'.join(name_parts[3:-2]) #: Upload prod gid file to S3
#: Upload prod file to S3
collisions_to_S3 = S3FileTransferOperator(
    task_id='collisions_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_collisions_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_collisions_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cls_md = get_seaboard_update_dag('police-collisions.md', dag)

#: Execution rules:
#: pd_col_latest_only -> get_collisions_data -> process_collisions_data
#: -> collisions_to_S3 -> update_pd_cls_md
pd_col_latest_only.set_downstream(get_collisions_data)
get_collisions_data.set_downstream(process_collisions_data)
process_collisions_data.set_downstream(collisions_to_S3)
#: Github update depends on S3 upload success.
collisions_to_S3.set_downstream(update_pd_cls_md)
dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='sde/tree_canopy_datasd.pbf', on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Update portal modified date update_gis_md = get_seaboard_update_dag('tree-canopy-2014.md', dag) #: Execution order #: Latest only operator must run before getting tree canopy data get_shapefiles.set_upstream(treecan_latest_only) #: get_shapefiles must run before converting to geojson shp_to_geojson.set_upstream(get_shapefiles) #: to_geojson must run before converting to geobuf geojson_to_geobuf.set_upstream(shp_to_geojson) #: to_geobuf must run before zipping geobuf geobuf_zip.set_upstream(geojson_to_geobuf) #: get_shapefile must run before zipping shapefile shape_zip.set_upstream(get_shapefiles) #: zipping shapefile must run before uploading upload_shp_file.set_upstream(shape_zip)
def push_by_returning(**kwargs):
    """Push an XCom without naming a target, simply by returning ``value_2``."""
    return value_2


def puller(**kwargs):
    """Pull the XComs of both push tasks and assert they match the pushed values."""
    task_instance = kwargs['ti']

    # value_1 comes from the 'push' task
    pulled_1 = task_instance.xcom_pull(key=None, task_ids='push')
    assert pulled_1 == value_1

    # value_2 comes from the 'push_by_returning' task
    pulled_2 = task_instance.xcom_pull(task_ids='push_by_returning')
    assert pulled_2 == value_2

    # a list of task ids yields both values in one call
    pulled_1, pulled_2 = task_instance.xcom_pull(
        key=None, task_ids=['push', 'push_by_returning'])
    assert (pulled_1, pulled_2) == (value_1, value_2)


push1 = PythonOperator(
    task_id='push',
    dag=dag,
    python_callable=push)
push2 = PythonOperator(
    task_id='push_by_returning',
    dag=dag,
    python_callable=push_by_returning)
pull = PythonOperator(
    task_id='puller',
    dag=dag,
    python_callable=puller)

# the puller runs after both pushers
push1.set_downstream(pull)
push2.set_downstream(pull)
) email_task = PythonOperator( task_id='email_currency', python_callable=email_currency, dag=dag ) yesterday_task = PythonOperator( task_id='yesterday_currency', python_callable=get_yesterdays_data, op_kwargs={ "engine": PSQL_CONN, }, dag=dag ) yesterday_upsert = PythonOperator( task_id='yesterday_upsert', python_callable=upsert_yesterdays_data, op_kwargs={ "engine": PSQL_CONN, }, dag=dag ) pull_task.set_downstream(run_task) run_task.set_downstream(email_task) yesterday_upsert.set_downstream(yesterday_task) yesterday_upsert.set_upstream(run_task) yesterday_task.set_upstream(run_task) yesterday_task.set_downstream(email_task)
task_id = get_enpdpoints_task_id, python_callable = get_endpoint_with_dates, op_args = [SAVE_PATH, BASE_URL, API_KEYS], templates_dict = ep_template ) t_branch = BranchPythonOperator( task_id = branch_task_id, python_callable = row_count_branch, op_args = [get_enpdpoints_task_id, file_to_gcs_task_id, zero_branch_task_id], trigger_rule = "all_done" ) t_gcs = FileToGoogleCloudStorageOperator( task_id = file_to_gcs_task_id, google_cloud_storage_conn_id = 'gcs_silo', bucket = "deanslist", src = "{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_path' )}}", dst = endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}", dag = dag ) t_zero_row = DummyOperator( task_id =zero_branch_task_id ) t2.set_upstream(t1) t2.set_downstream(t_branch) t_branch.set_downstream(t_gcs) t_branch.set_downstream(t_zero_row)
'retries': 5, 'retry_delay': timedelta(minutes=5) } dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once') t1 = BashOperator(task_id='bash_test', bash_command='echo "hello, it should work"', dag=dag) sensor = S3KeySensor(task_id='check_s3_for_file_in_s3', bucket_key='logs*', wildcard_match=True, bucket_name='airflow-logs-ben', aws_conn_id='my_conn_S3', timeout=18 * 60 * 60, poke_interval=120, dag=dag) pythonop = PythonOperator(start_date=datetime(2016, 11, 1), python_callable=check_it, task_id="my_python_check") pythonop_write = PythonOperator(start_date=datetime(2016, 11, 1), python_callable=write_it, op_args=["me"], task_id="my_python_write") t1.set_upstream(sensor) pythonop.set_upstream(sensor) pythonop_write.set_upstream(sensor)
# 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } delta = timedelta(seconds=3) dag = DAG('test_delete_dag', default_args=default_args, schedule_interval=delta) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag) def py_callable(*args, **kwargs): print "args = " print args print "kwargs = " print kwargs t3 = PythonOperator(task_id='py_callable', python_callable=py_callable, op_args=['dogs'], op_kwargs={'cats': 20}, provide_context=True, dag=dag) t3.set_upstream(t1)
df_R, on=['country_region', 'province_state', 'event_date']) df = pd.merge(df, df_D, on=['country_region', 'province_state', 'event_date']) df['mortality_rate'] = df['d_cases'] / df['c_cases'] df['recovery_rate'] = df['r_cases'] / df['c_cases'] #df_final = df[COLUMNS_VIEW] df_final = df with db_connection.begin() as transaction: transaction.execute("DELETE FROM covid.cases_data WHERE 1=1") df_final.to_sql("cases_data", con=transaction, schema="covid", if_exists="append", index=False) integration_procces = PythonOperator(dag=dag, task_id="integration_procces", provide_context=True, python_callable=integration_procces) file_sensor_task_C >> transform_process_C >> insert_process_C file_sensor_task_R >> transform_process_R >> insert_process_R file_sensor_task_D >> transform_process_D >> insert_process_D #[insert_process_C,insert_process_R,insert_process_D] >> integration_procces integration_procces.set_upstream(insert_process_C) integration_procces.set_upstream(insert_process_R) integration_procces.set_upstream(insert_process_D)
# pushes an XCom without a specific target, just by returning it return value_2 def puller(**kwargs): ti = kwargs['ti'] # get value_1 v1 = ti.xcom_pull(key=None, task_ids='push') assert v1 == value_1 # get value_2 v2 = ti.xcom_pull(task_ids='push_by_returning') assert v2 == value_2 # get both value_1 and value_2 v1, v2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning']) assert (v1, v2) == (value_1, value_2) push1 = PythonOperator( task_id='push', dag=dag, python_callable=push) push2 = PythonOperator( task_id='push_by_returning', dag=dag, python_callable=push_by_returning) pull = PythonOperator( task_id='puller', dag=dag, python_callable=puller) pull.set_upstream([push1, push2])
run_this = PythonOperator(
    task_id='Start',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

#1
# execute_impala_by_sql_file('forecast_store_code_scope_sprint4',\
# './sqls/1.forecast_store_code_scope_sprint4.sql')
step1 = PythonOperator(
    task_id="step1",
    python_callable=execute_impala_by_sql_file,
    op_kwargs={
        'table_name': "forecast_store_code_scope_sprint4",
        'file_path': f'{config["parent_path"]}/data_preperation/data_aggregation/regular_item/1.forecast_store_code_scope_sprint4.sql',
    },
    dag=dag)
run_this.set_downstream(step1)

#2
# execute_impala_by_sql_file('forecast_itemid_list_threebrands_sprint4',\
# './sqls/2.forecast_itemid_list_threebrands_sprint4.sql')
step2 = PythonOperator(
    task_id="step2",
    python_callable=execute_impala_by_sql_file,
    provide_context=True,
    op_kwargs={
        'table_name': "forecast_itemid_list_threebrands_sprint4",
        'file_path': f'{config["parent_path"]}/data_preperation/data_aggregation/regular_item/2.forecast_itemid_list_threebrands_sprint4.sql',
        'set_timeperiod': True,
    },
    dag=dag)
step1.set_downstream(step2)

#3
# execute_impala_by_sql_file('forecast_item_id_family_codes_sprint4',\
check_updates_with_judges_task = PythonOperator(
    task_id='check_updates_with_judges',
    python_callable=check_updates_with_judges,
    dag=dag)


def extract_name():
    """Placeholder: currently does nothing and returns ``None``."""
    # TODO: implement extraction of the judge's name from the text
    # http://blog.yhat.com/posts/named-entities-in-law-and-order-using-nlp.html
    return None


def check_name():
    """Placeholder: currently does nothing and returns ``None``."""
    # TODO: verify the extracted name
    # Validate against a database of judges' names (transparency portal)
    return None


extract_name_task = PythonOperator(
    task_id='extract_name_task',
    python_callable=extract_name,
    dag=dag)

check_name_task = PythonOperator(
    task_id='check_name_task',
    python_callable=check_name,
    dag=dag)

# check_updates_with_judges -> extract_name_task -> check_name_task
check_updates_with_judges_task.set_downstream(extract_name_task)
extract_name_task.set_downstream(check_name_task)
} dag = DAG(**dag_params) clean = PythonOperator(task_id='clean', python_callable=executor.clean, dag=dag) check_connect = PythonOperator(task_id='check_connect', python_callable=executor.check_connect, dag=dag) backup_docs = PythonOperator(task_id='backup_docs', python_callable=executor.backup_docs, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) backup_trans = PythonOperator(task_id='backup_trans', python_callable=executor.backup_trans, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) backup_performance = PythonOperator(task_id='backup_performance', python_callable=executor.backup_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) backup_fields = PythonOperator(task_id='backup_fields', python_callable=executor.backup_field, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) dim_field = PythonOperator(task_id='dim_field', python_callable=executor.dim_field, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) fact_document = PythonOperator(task_id='fact_document', python_callable=executor.fact_document, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) fact_performance = PythonOperator(task_id='fact_performance', python_callable=executor.fact_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) fact_data_extract = PythonOperator(task_id='fact_data_extract', python_callable=executor.fact_data_extract, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS) report = PythonOperator(task_id='report', python_callable=executor.report, dag=dag, trigger_rule=TriggerRule.ALL_DONE) clean >> check_connect >> [backup_docs, backup_trans, backup_performance, backup_fields] dim_field.set_upstream(backup_fields) fact_document.set_upstream([backup_docs, backup_trans]) fact_performance.set_upstream([backup_performance, fact_document]) fact_data_extract.set_upstream([fact_document, fact_performance]) [dim_field, fact_document, fact_performance, fact_data_extract] >> report
def my_sleeping_function(random_base):
    """Sleep for ``random_base`` seconds; runs within the DAG execution."""
    time.sleep(random_base)


def print_context(ds, **kwargs):
    """Pretty-print the keyword context, print ``ds`` and return a log message."""
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

# Generate 10 sleeping tasks, sleeping 0.0 to 0.9 seconds respectively,
# each running after print_the_context.
for i in range(10):
    task = PythonOperator(
        task_id='sleep_for_' + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i) / 10},
        dag=dag)

    run_this.set_downstream(task)
task_id='dsd_approvals_latest_only', dag=dag) #: Get most recent weekly permit approvals reports get_approvals_files = PythonOperator( task_id='get_approvals_files', python_callable=dfg.get_files, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, op_kwargs={'fname_list': fnames, 'target_dir': dsd_temp_dir}, dag=dag) #: dsd_approvals_latest_only must run before get_approvals_files get_approvals_files.set_upstream(dsd_approvals_latest_only) #: update github modified date (solar permits) update_solar_md = get_seaboard_update_dag('solar-permits.md', dag) for key in app.approval_dict: #: Consolidate weekly permitting data by scraping OpenDSD API scrape_dsd = PythonOperator( task_id='scrape_dsd_' + key, python_callable=app.scrape_dsd, op_kwargs={'key': key}, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag)
#: Uploads the generated production file
upload_traffic_counts = S3FileTransferOperator(
    task_id='upload_traffic_counts',
    source_base_path=conf['prod_data_dir'],
    source_key='traffic_counts_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='traffic_counts/traffic_counts_datasd.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_traffic_md = get_seaboard_update_dag('traffic-volumes.md', dag)

#: Execution Rules
#: tc_latest_only -> get_traffic_counts -> clean_traffic_counts
#: -> build_traffic_counts -> upload_traffic_counts -> update_traffic_md
tc_latest_only.set_downstream(get_traffic_counts)
get_traffic_counts.set_downstream(clean_traffic_counts)
clean_traffic_counts.set_downstream(build_traffic_counts)
build_traffic_counts.set_downstream(upload_traffic_counts)
#: Update .md file after S3 upload
upload_traffic_counts.set_downstream(update_traffic_md)
source_key='latest_indicator_bac_tests_datasd.csv', dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_key='water_testing/latest_indicator_bac_tests_datasd.csv', on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Update portal modified date update_water_md = get_seaboard_update_dag( 'monitoring-of-indicator-bacteria-in-drinking-water.md', dag) #: Execution Rules #: water_latest_only must run before get_indicator_bac_tests get_indicator_bac_tests.set_upstream(wtr_latest_only) #: Upload indicator bac tests after it has successfully run upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests) #: get_last_bac_tests is dependent on get_indicator_bac_tests get_latest_bac_tests.set_upstream(get_indicator_bac_tests) #: Upload latest indicator bac tests after the file has been generated upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests) #: update .md file after S3 upload update_water_md.set_upstream(upload_latest_indicator_bac_tests)
# Convert input CSV files: one transform task per table, each placed
# between ``start`` and ``end_transform``
for table in SECMAR_TABLES + ["operations_valides"]:
    t = PythonOperator(
        task_id="transform_" + table,
        python_callable=secmar_transform,
        provide_context=True,
        dag=dag,
        pool="transform",
        op_kwargs={
            "in_path": in_path(table),
            "out_path": out_path(table),
            "transformer": secmar_transformer(table),
        },
    )
    start.set_downstream(t)
    t.set_downstream(end_transform)

# Tables are created once every transform has finished
create_tables = PythonOperator(
    task_id="create_tables",
    python_callable=create_tables_fn,
    provide_context=True,
    dag=dag,
)
end_transform.set_downstream(create_tables)

# Import CSV files into PostgreSQL
embulk_operations = embulk_import(dag, "operations")
create_tables.set_downstream(embulk_operations)
embulk_operations.set_downstream(end_import)
"flood-monitoring/archive/" + filename, filename) except botocore.exceptions.ClientError as e: if e.response['Error']['Code'] == "404": print("object doesn't exist") else: raise return filename def insertDailyDump(cassandra, credentials, **kwargs): yesterday_ds = kwargs['yesterday_ds'] filename = downloadDatafile(yesterday_ds, credentials) putInCassandra(filename, cassandra) os.remove(filename) connection = BaseHook.get_connection("s3_conn") extra = connection.extra parsed_extra = json.loads(extra) cassandra = BaseHook.get_connection("cassandra_connection") putIn = PythonOperator(task_id='put_in_cassandra', python_callable=insertDailyDump, provide_context=True, op_kwargs={ 'cassandra': cassandra, 'credentials': parsed_extra }, dag=dag) putIn.set_upstream(s3ready)
with open(output_path, "w") as fp: fp.write(input_value) fp.write("\n\n") fp.write(str(datetime.now().strftime("%Y-%m-%dT%H:%M:%S"))) return "success" with DAG(dag_id="dbnd_operators", default_args=default_args) as dag_operators: # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = my_task(2) t2, t3 = my_multiple_outputs(t1) tp = PythonOperator( task_id="some_python_function", python_callable=some_python_function, op_kwargs={ "input_path": t3, "output_path": "/tmp/output.txt" }, ) tp.set_upstream(t3.op) t1_op = t1.op if __name__ == "__main__": ti = TaskInstance(t1_op, days_ago(0)) ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True) # # # # dag_operators.clear() # dag_operators.run()
# Task JOIN
def join_data():
    """Concatenate the intermediate CSVs column-wise into enade_tratado.csv.

    Reads the five CSV files from ``data_path``, concatenates them along
    the column axis, writes the result without the index and prints it.
    """
    source_names = (
        'enade_filtro.csv',
        'idadecent.csv',
        'idadequadrado.csv',
        'estcivil.csv',
        'cor.csv',
    )
    frames = [pd.read_csv(data_path + name) for name in source_names]
    # concatenate by column
    final = pd.concat(frames, axis=1)
    final.to_csv(data_path + 'enade_tratado.csv', index=False)
    print(final)


# PythonOperator for join_data
task_join = PythonOperator(
    task_id='join_data',
    python_callable=join_data,
    dag=dag)

# Define the execution chain
start_processing >> get_data >> unzip_file >> task_aplica_filtro
task_aplica_filtro >> [task_idade_cent, task_est_civil, task_cor]
task_idade_cent >> task_idade_quad
task_join.set_upstream([task_est_civil, task_cor, task_idade_quad])
pyop_unzip_file = PythonOperator(
    task_id='unzip_file',
    python_callable=unzip_file)

pyop_filter_data = PythonOperator(
    task_id='filter_data',
    python_callable=filter_data)

pyop_mean_normalize_age = PythonOperator(
    task_id='mean_normalize_age',
    python_callable=get_mean_normalized_age)

pyop_squared_mean_normalize_age = PythonOperator(
    task_id='squared_mean_normalize_age',
    python_callable=get_squared_mean_normalized_age)

pyop_marital_status = PythonOperator(
    task_id='marital_status',
    python_callable=get_marital_status)

pyop_skin_color = PythonOperator(
    task_id='skin_color',
    python_callable=get_skin_color)

pyop_join_data = PythonOperator(
    task_id='join_data',
    python_callable=join_data)

# Linear preparation chain
baop_start_process >> baop_get_data >> pyop_unzip_file >> pyop_filter_data

# Three feature branches fan out from the filtered data
pyop_filter_data >> pyop_mean_normalize_age >> pyop_squared_mean_normalize_age
pyop_filter_data >> [pyop_marital_status, pyop_skin_color]

# The join waits for the end of every branch
pyop_squared_mean_normalize_age >> pyop_join_data
pyop_marital_status >> pyop_join_data
pyop_skin_color >> pyop_join_data
on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Upload prod art file to S3 upload_public_art = S3FileTransferOperator( task_id='upload_public_art', source_base_path=conf['prod_data_dir'], source_key='public_art_locations_datasd.csv', dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='public_art/public_art_locations_datasd.csv', on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) #: Update portal modified date update_public_art_md = get_seaboard_update_dag('public-art.md', dag) #: Execution rules #: public_art_latest_only must run before get_public_art get_public_art.set_upstream(public_art_latest_only) #: get_public_art must run before file upload upload_public_art.set_upstream(get_public_art) #: upload_gid_requests must succeed before updating github update_public_art_md.set_upstream(upload_public_art)
print("All target DAGs are finished")


args = {
    'start_date': days_ago(1),
    'owner': 'airflow',
}

dag = DAG(
    dag_id='trigger_with_multi_dagrun_sensor',
    max_active_runs=1,
    schedule_interval='@hourly',
    default_args=args,
)

# Trigger runs of the 'common_target' DAG using generate_dag_run
gen_target_dag_run = TriggerMultiDagRunOperator(
    task_id='gen_target_dag_run',
    dag=dag,
    trigger_dag_id='common_target',
    python_callable=generate_dag_run,
)

# Wait until there is no running instance of target DAG
wait_target_dag = MultiDagRunSensor(task_id='wait_target_dag', dag=dag)
gen_target_dag_run.set_downstream(wait_target_dag)

# Final handler, executed once the sensor has finished
after_dags_handler_op = PythonOperator(
    task_id='after_dags_handler',
    python_callable=after_dags_handler,
    dag=dag,
)
wait_target_dag.set_downstream(after_dags_handler_op)
python_callable=update_json_date, provide_context=True, op_kwargs={'ds_fname': 'indicator_bacteria_monitoring'}, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Update portal modified date update_water_md = get_seaboard_update_dag( 'monitoring-of-indicator-bacteria-in-drinking-water.md', dag) #: Execution Rules #: water_latest_only must run before get_indicator_bac_tests get_indicator_bac_tests.set_upstream(wtr_latest_only) #: Upload indicator bac tests after it has successfully run upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests) #: get_last_bac_tests is dependent on get_indicator_bac_tests get_latest_bac_tests.set_upstream(get_indicator_bac_tests) #: Upload latest indicator bac tests after the file has been generated upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests) #: update .md file after S3 upload update_water_md.set_upstream(upload_latest_indicator_bac_tests) #: upload data must succeed before updating json update_json_date.set_upstream(upload_latest_indicator_bac_tests)
provide_context=True, python_callable=load_rea_config, dag=dag, ) expression_filter = PythonOperator( task_id='exp_filter', provide_context=True, python_callable=base_func['expression_filter'], dag=dag, params=PARAM_OBJ.params(base_func['expression_filter']), ) expression_filter_test = PythonOperator( task_id='exp_filter', provide_context=True, python_callable=expression_filter_test, dag=dag, params=PARAM_OBJ.params(base_func['expression_filter']), ) # end_task = PythonOperator( # task_id='dag_end', # provide_context=True, # python_callable=dag_end, # dag=dag # ) # end_task.set_upstream(load_config) expression_filter_test.set_upstream(load_config)
token = response.headers['X-ARC-Token'] headers = {'X-ARC-Token': token} payload = {'uuid': '##UUID##', 'level': level, 'message': msg} response = requests.post(API_HOST+'/api/v1/podevent', headers=headers, json=payload) def preamble(ds, **kwargs): print('PREAMBLE ------------------------------------------------------------------------') create_podevent('Starting ##PHASE## workflow for POD ##UUID##') def postamble(ds, **kwargs): print('POSTAMBLE ------------------------------------------------------------------------') create_podevent('Finishing ##PHASE## workflow for POD ##UUID##') create_podevent('State changed to: ACTIVE', level='STATUS') def failure(ds, **kwargs): print('POSTAMBLE ------------------------------------------------------------------------') create_podevent('Finishing ##PHASE## workflow for POD ##UUID##, Failed') create_podevent('State changed to: FAILED', level='STATUS') t1 = PythonOperator(task_id='preamble', provide_context=True, python_callable=preamble, dag=dag) t2 = PythonOperator(task_id='maintask', provide_context=True, python_callable=##WFNAME##.start, dag=dag) t3 = PythonOperator(task_id='postamble', provide_context=True, python_callable=postamble, dag=dag) t4 = PythonOperator(task_id='failure', provide_context=True, python_callable=failure, dag=dag, trigger_rule='all_failed') t2.set_upstream(t1) t3.set_upstream(t2) t4.set_upstream(t2)
train_config = training_config(estimator=byoc_est, inputs=inputs)


# step - trigger CDK to deploy model as ECS service using Airflow Python Operator
def dkn_model_deploy(data, *extra_args, **context):
    """Placeholder for the model deployment step; only prints a message.

    :param data: first positional argument supplied through ``op_args``.
    :param extra_args: any further positional ``op_args``. The operator below
        passes two values, so the callable must accept more than one
        positional argument or the task fails with a ``TypeError``.
    :param context: keyword arguments supplied by Airflow
        (``provide_context=True``).
    """
    print("mock for dkn deployment")


default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'provide_context': True
}

dag = DAG('tensorflow_example', default_args=default_args,
          schedule_interval='@once')

# Train first and wait for the training job to complete
train_op = SageMakerTrainingOperator(
    task_id='tf_training',
    config=train_config,
    wait_for_completion=True,
    dag=dag)

# Deployment step; both op_args values are forwarded positionally
deploy_op = PythonOperator(
    task_id='model_deploy',
    python_callable=dkn_model_deploy,
    op_args=['gw1', 'gw2'],
    provide_context=True,
    dag=dag)

deploy_op.set_upstream(train_op)
channel=slack_channel, username='******', text='Cluster has been *restarted!*\n' 'It\'s all fine move forward with your ETLs and Crawlers!\n' 'Message datetime: {{params.curr_date}}', params={'curr_date': str(datetime.now(pytz.timezone('America/Sao_Paulo')))}, dag=dag ) run_etl_crawler_cluster_up = SubDagOperator( subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_up', dag.schedule_interval), task_id='crawler_dag_cluster_up', dag=dag, ) run_etl_crawler_cluster_restarted = SubDagOperator( subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_restarted', dag.schedule_interval), task_id='crawler_dag_cluster_restarted', dag=dag, ) branch1.set_upstream(check_cluster) send_slack_cluster_ok.set_upstream(branch1) send_slack_cluster_start.set_upstream(branch1) start_cluster.set_upstream(send_slack_cluster_start) branch2.set_upstream(start_cluster) send_slack_cluster_down.set_upstream(branch2) send_slack_cluster_restarted_ok.set_upstream(branch2) run_etl_crawler_cluster_up.set_upstream(send_slack_cluster_ok) run_etl_crawler_cluster_restarted.set_upstream(send_slack_cluster_restarted_ok)
on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Upload prod fire_department_SD.csv file to S3 upload_fd_data = S3FileTransferOperator( task_id='upload_fd_data', source_base_path=conf['prod_data_dir'], source_key='/fd_problems_{}_datasd.csv'.format(cur_yr), dest_s3_conn_id=conf['default_s3_conn_id'], dest_s3_bucket=conf['dest_s3_bucket'], dest_s3_key='fd_cad/' + 'fd_problems_{}_datasd.csv'.format(cur_yr), on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, replace=True, dag=dag) update_fire_incidents_problems = get_seaboard_update_dag('fire-incidents-problems.md', dag) #: Execution order #: fd_latest_only must run before get_fd_data get_fd_data.set_upstream(fd_latest_only) #: upload_fd_data is dependent on successful run of get_fd_data upload_fd_data.set_upstream(get_fd_data) #: upload_fd_data must succeed before updating github update_fire_incidents_problems.set_upstream(upload_fd_data)
'retries': 0, } dag = DAG(dag_id='credit_card_processor', default_args=args, schedule_interval=None, dagrun_timeout=timedelta(minutes=15)) create_dirs_task = PythonOperator(task_id='create_dirs', python_callable=create_dirs, dag=dag) download_statement_task = PythonOperator(task_id='download_statement', python_callable=download_statement, dag=dag) download_statement_task.set_upstream(create_dirs_task) wait_for_statement_task = PythonOperator(task_id='wait_for_statement', python_callable=wait_for_statement, dag=dag) wait_for_statement_task.set_upstream(download_statement_task) open_tabula_task = PythonOperator(task_id='open_tabula', python_callable=open_tabula, dag=dag) open_tabula_task.set_upstream(wait_for_statement_task) wait_for_csv_task = PythonOperator(task_id='wait_for_csv', python_callable=wait_for_csv, dag=dag) wait_for_csv_task.set_upstream(open_tabula_task)
#: Create subsets
create_subsets = PythonOperator(
    task_id='create_subsets',
    python_callable=make_prod_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_ttcs_md = get_seaboard_update_dag('business-listings.md', dag)

#: Execution Rules

#: ttcs_latest_only must run before get_active_businesses
ttcs_latest_only.set_downstream(get_active_businesses)

#: Data cleaning occurs after the active businesses are retrieved
get_active_businesses.set_downstream(clean_data)

#: Geocoding occurs after data cleaning
clean_data.set_downstream(geocode_data)

#: Spatial join occurs after geocoding
geocode_data.set_downstream(join_bids)

#: Subsetting occurs after the spatial join
join_bids.set_downstream(create_subsets)

#: Base names of the subset files currently present in the prod data dir
subset_paths = glob.glob(conf['prod_data_dir']+'/sd_businesses_*.csv')
subset_names = [os.path.basename(path) for path in subset_paths]

for index, subset in enumerate(subset_names):
    #: Drop the 14-character 'sd_businesses_' prefix and the last 11
    #: characters (presumably a '_datasd.csv' suffix - confirm)
    fname = subset[14:-11]
replace=True, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: update permits.md file update_permits_md = get_seaboard_update_dag('permits-dsd.md', dag) #: Execution rules #: dsd_permits_latest_only must run before get_permits_files get_permits_files.set_upstream(dsd_permits_latest_only) #: clean_data tasks are executed after get_approvals_files task clean_data.set_upstream(get_permits_files) #: upload_dsd tasks are executed after clean_data tasks join_bids.set_upstream(clean_data) #: upload_dsd tasks are executed after join bids tasks upload_dsd_permits.set_upstream(join_bids) #: github updates are executed after S3 upload tasks update_permits_md.set_upstream(upload_dsd_permits)
fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag,
)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can get rid of or cherry pick columns
# and different parts of the text
# --------------------------------------------------------------------------------

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag,
)

fetch_tweets.set_downstream(clean_tweets)

# --------------------------------------------------------------------------------
# In this section you can use a script to analyze the twitter data. Could simply
# be a sentiment analysis through algorithms like bag of words or something more
# complicated. You can also take a look at Web Services to do such tasks
# --------------------------------------------------------------------------------

analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    python_callable=analyzetweets,
    dag=dag,
)

clean_tweets.set_downstream(analyze_tweets)

# --------------------------------------------------------------------------------
def my_sleeping_function(random_base):
    """Sleep for ``random_base`` seconds; runs within the DAG execution."""
    time.sleep(random_base)


def print_context(ds, **kwargs):
    """Pretty-print the keyword arguments, print ``ds`` and return a fixed message."""
    pprint(kwargs)
    print(ds)
    return "Whatever you return gets printed in the logs"


run_this = PythonOperator(
    task_id="print_the_context",
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

# Generate 5 sleeping tasks, sleeping from 0 to 0.4 seconds respectively
for i in range(5):
    task = PythonOperator(
        task_id="sleep_for_{}".format(i),
        python_callable=my_sleeping_function,
        op_kwargs={"random_base": i / 10.0},
        dag=dag,
    )
    # Every sleeper waits for the context-printing task
    run_this.set_downstream(task)
'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG('chained_job', schedule_interval='@once', default_args=default_args) producer = PythonOperator( task_id='run_job_producer', python_callable=run_job, op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'), retries=1, dag=dag ) consumer = PythonOperator( task_id='run_job_consumer', python_callable=run_job, op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'), retries=1, dag=dag ) consumer.set_upstream(producer)
grouped_partition = grouped_partition.reset_index()
push_data(grouped_partition, kwargs['dest_file'])
logging.info('ETL JOBLIFT SUCCESSFULLY RAN.')


# NOTE(review): start_date is derived from datetime.now(), so it changes every
# time this file is parsed; Airflow recommends a fixed start_date - confirm.
dag = DAG(
    'joblift_cpc_ETL',
    description='Simple tutorial DAG',
    start_date=datetime.now() - timedelta(days=4),
    schedule_interval='0 0 * * *',
)

# Read the CPC files
load_cpc_data = PythonOperator(
    task_id='load_cpc_files_data',
    python_callable=read_files,
    op_kwargs={
        'path_folder': PATH_CPC,
        'dest_file': FOLDER_PATH_DESTINATION_CPC,
    },
    dag=dag,
)

# Fetch exchange rates between START_DATE and END_DATE
load_rate_data = PythonOperator(
    task_id='load_exchange_rate_api_data',
    python_callable=get_exchange_rate,
    op_kwargs={
        'url': URL,
        'start_date': START_DATE,
        'end_date': END_DATE,
        'folder_dest_file': FOLDER_PATH_DESTINATION_RATE,
    },
    dag=dag,
)

# Combine both inputs and push the final file
Transform_push = PythonOperator(
    task_id='transform_data',
    python_callable=transform_data,
    op_kwargs={
        'dfp1': FOLDER_PATH_DESTINATION_CPC,
        'dfp2': FOLDER_PATH_DESTINATION_RATE,
        'dest_file': FOLDER_PATH_DESTINATION_FINAL,
    },
    dag=dag,
)

# The transform waits for both loaders
Transform_push.set_upstream([load_rate_data, load_cpc_data])
#: Upload prod special events file to S3
upload_special_events = S3FileTransferOperator(
    task_id='upload_special_events',
    source_base_path=conf['prod_data_dir'],
    source_key='special_events_list_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='special_events/special_events_list_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_special_events_md = get_seaboard_update_dag('special-events.md', dag)

#: Execution rules

#: se_latest_only must run before get_special_events
se_latest_only.set_downstream(get_special_events)

#: process_special_events depends on get_special_events
get_special_events.set_downstream(process_special_events)

#: upload_special_events depends on process_special_events
process_special_events.set_downstream(upload_special_events)

#: The github modified date is updated after the S3 upload
upload_special_events.set_downstream(update_special_events_md)
def escreve_dw():
    """Append the rows of ``enade_tratado.csv`` to the ``tratado`` MySQL table."""
    treated = pd.read_csv(data_path + 'enade_tratado.csv')
    # NOTE(review): the credentials are hard-coded in the connection URL -
    # confirm whether they should come from configuration instead.
    dw_engine = sqlalchemy.create_engine(
        'mysql+pymysql://root:root@localhost/enade')
    treated.to_sql('tratado', con=dw_engine, index=False, if_exists='append')


task_escreve_dw = PythonOperator(
    task_id='escreve_dw',
    python_callable=escreve_dw,
    dag=dag,
)

get_data >> unzip_data >> task_aplica_filtro
task_aplica_filtro >> [
    task_idade_cent,
    task_est_civil,
    task_cor,
    task_escopai,
    task_escomae,
    task_renda,
]

# task_idade_quad has to come after task_idade_cent
# (same as task_idade_quad.set_upstream(task_idade_cent))
task_idade_cent.set_downstream(task_idade_quad)

task_join.set_upstream([
    task_est_civil,
    task_cor,
    task_escopai,
    task_escomae,
    task_renda,
    task_idade_quad,
])

# task_escreve_dw executes after task_join
task_join.set_downstream(task_escreve_dw)

# Bringing the containers up in Docker with 2 workers:
# docker-compose up -d --scale worker=2
script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql", notfiy=True, tags=['tag1', 'tag2'], # If the script at s3 location has any qubole specific macros to be replaced # macros='[{"date": "{{ ds }}"}, {"name" : "abc"}]', trigger_rule="all_done", dag=dag) t3 = PythonOperator( task_id='compare_result', provide_context=True, python_callable=compare_result, trigger_rule="all_done", dag=dag) t3.set_upstream(t1) t3.set_upstream(t2) options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd'] branching = BranchPythonOperator( task_id='branching', python_callable=lambda: random.choice(options), dag=dag) branching.set_upstream(t3) join = DummyOperator( task_id='join', trigger_rule='one_success', dag=dag )
def create_evaluate_ops(task_prefix,  # pylint:disable=too-many-arguments
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates Operators needed for model evaluation and returns.

    It gets prediction over inputs via Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarize and validate
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> must start with a letter and contain only alphanumeric
    characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances.
      The key/value of the dictionary matches to what's given by
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count' to represent the
      total number of instances received for evaluation.
      The function would raise an exception to mark the task as failed, in a
      case the validation result is not okay to proceed (i.e. to set the trained
      version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: str

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: str

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list[str]

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: str

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list[str]

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: str

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: str

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: str

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: str

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: str

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: str

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.models.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(MLEngineBatchPredictionOperator, DataFlowPythonOperator,
                  PythonOperator)
    :raises AirflowException: if task_prefix is malformed, or if metric_fn or
        validate_fn is not callable.
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    # Arguments that were not given explicitly fall back to the DAG defaults.
    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    # b64encode() returns bytes on Python 3; decode it so the option value is
    # plain text rather than the "b'...'" representation of a bytes object.
    metric_fn_encoded = base64.b64encode(
        dill.dumps(metric_fn, recurse=True)).decode()
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.gcp.utils.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        py_interpreter='python2',
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        """Download prediction.summary.json from GCS and run validate_fn on it."""
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError(
                "Wrong format prediction_path: {}".format(prediction_path))
        summary = os.path.join(obj.strip("/"),
                               "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
#: Email new committees
send_committee_report = PoseidonEmailWithPythonOperator(
    task_id='send_committee_report',
    to='*****@*****.**',
    subject='Campaign committees update',
    template_id='tem_7xCrDCTyvjMGS9VpBM8rRmwD',
    dispatch_type='sonar_dispatch',
    python_callable=send_comm_report,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules

#: campaign_fin_latest_only must run before schedule_460A
campaign_fin_latest_only.set_downstream(schedule_460A)

#: schedule_460A must run before schedule_460B1
schedule_460A.set_downstream(schedule_460B1)

#: schedule_460B1 must run before schedule_460C
schedule_460B1.set_downstream(schedule_460C)

#: schedule_460C must run before schedule_460D
schedule_460C.set_downstream(schedule_460D)

#: schedule_460D must run before schedule_sum
schedule_460D.set_downstream(schedule_sum)

#: schedule_sum must run before schedule_497
schedule_sum.set_downstream(schedule_497)

#: schedule_497 must run before schedule_496
schedule_497.set_downstream(schedule_496)

#: schedule_496 must run before combine_schedules
schedule_496.set_downstream(combine_schedules)

#: combine_schedules must run before file upload
suffixes=('Prior', 'Current')).dropna() transformed_data = pd.concat([ data, data.loc[:, data.columns. difference(['Date', 'VolumePrior', 'VolumeCurrent'])].div( data.OpenCurrent, axis=0).round(decimals=3).add_suffix('Percent') ], axis=1) transformed_data.to_csv(f'/tmp/work/{symbol}.csv') print("Data Retrieved!") # dag args = {"owner": "Scrape test", "start_date": airflow.utils.dates.days_ago(2)} dag = DAG(dag_id="scrape_test", default_args=args, schedule_interval=None) # tasks BAC_Task = PythonOperator(task_id="pull_BAC_data", python_callable=pull_BAC_data, dag=dag) AAN_Task = PythonOperator(task_id="pull_AAN_data", python_callable=pull_AAN_data, dag=dag) # dependencies BAC_Task.set_upstream(AAN_Task)
on_retry_callback=notify, on_success_callback=notify, dag=dag) make_op_act = PythonOperator( task_id='create_operating_act', python_callable=create_operating_act, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Execution Rules #: budget_latest_only must run before get_accounts get_accounts.set_upstream(budget_latest_only) #: get_accounts must run before get_refs get_refs.set_upstream(get_accounts) #: get_refs must run before get_capital_ptd get_capital_ptd.set_upstream(get_refs) #: get_refs must run before get_capital_fy get_capital_fy.set_upstream(get_refs) #: get_refs must run before get_operating get_operating.set_upstream(get_refs) #: get_refs must run before get_cip_ptd_act get_cip_ptd_act.set_upstream(get_refs) #: get_refs must run before get_cip_fy_act get_cip_fy_act.set_upstream(get_refs) #: get_refs must run before get_op_act get_op_act.set_upstream(get_refs)
# Industry classification tasks
industry_task = PythonOperator(
    task_id='update_uqer_industry_info',
    provide_context=True,
    python_callable=update_uqer_industry_info,
    dag=dag,
)

sw1_adj_industry_task = PythonOperator(
    task_id='update_sw1_adj_industry',
    provide_context=True,
    python_callable=update_sw1_adj_industry,
    dag=dag,
)

dx_industry_task = PythonOperator(
    task_id='update_dx_industry',
    provide_context=True,
    python_callable=update_dx_industry,
    dag=dag,
)

# market -> industry -> (sw1 adjusted, dx)
market_task.set_downstream(industry_task)
industry_task.set_downstream(sw1_adj_industry_task)
industry_task.set_downstream(dx_industry_task)

# Categories are refreshed after the sw1 adjusted industry update
categories_task = PythonOperator(
    task_id='update_categories',
    provide_context=True,
    python_callable=update_category,
    dag=dag,
)
sw1_adj_industry_task.set_downstream(categories_task)

index_task = PythonOperator(
    task_id='update_uqer_index_components',
    provide_context=True,
    python_callable=update_uqer_index_components,
    dag=dag,
)
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates Operators needed for model evaluation and returns.

    It gets prediction over inputs via Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarize and validate
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances.
      The key/value of the dictionary matches to what's given by
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count' to represent the
      total number of instances received for evaluation.
      The function would raise an exception to mark the task as failed, in a
      case the validation result is not okay to proceed (i.e. to set the trained
      version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: string

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: string

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list of strings

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: string

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list of strings

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: string

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: string

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: string

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: string

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: string

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: string

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(MLEngineBatchPredictionOperator, DataFlowPythonOperator,
                  PythonOperator)

    :raises AirflowException: if task_prefix contains characters other than
        alphanumerics and hyphens, or if metric_fn / validate_fn is not
        callable.
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix + ")")

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    # Explicit arguments win; anything left unset falls back to the DAG's
    # default_args.
    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    # b64encode() returns bytes on Python 3; decode to text so the value is
    # not rendered as "b'...'" when it is turned into a Dataflow option.
    metric_fn_encoded = base64.b64encode(
        dill.dumps(metric_fn, recurse=True)).decode("ascii")
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        """Download the prediction summary from GCS and run validate_fn on it."""
        # Read the path from templates_dict rather than the enclosing scope,
        # since the operator below passes prediction_path through it.
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError(
                "Wrong format prediction_path: {}".format(prediction_path))
        summary_object = os.path.join(obj.strip("/"),
                                      "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary_object))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
documentum_docs_latest_only = LatestOnlyOperator(task_id='documentum_24_docs_latest_only', dag=dag) #: Get documentum tables get_doc_tables = PythonOperator( task_id='get_documentum_tables', python_callable=get_documentum, op_kwargs={'mode': schedule_mode}, on_failure_callback=notify, on_retry_callback=notify, on_success_callback=notify, dag=dag) #: Execution rules #: documentum_docs_latest_only must run before get_doc_tables get_doc_tables.set_upstream(documentum_docs_latest_only) files = [f for f in os.listdir(conf['prod_data_dir'])] tables_other = dn.table_name(schedule_mode) for f in files: file_name = f.split('.')[0] name_parts = file_name.split('_') if name_parts[0] == "documentum": file_check = '_'.join(name_parts[1:]).upper() if file_check in tables_other: #: Upload onbase prod files to S3 upload_doc_tables = S3FileTransferOperator( task_id='upload_' + file_name, source_base_path=conf['prod_data_dir'], source_key='{}.csv'.format(file_name), dest_s3_conn_id=conf['default_s3_conn_id'],