def test_python_callable_keyword_arguments_are_templatized(self):
    """Test PythonOperator op_kwargs are templatized"""
    recorded_calls = []

    task = PythonOperator(
        task_id='python_operator',
        # a Mock instance cannot be used as a callable function or test fails with a
        # TypeError: Object of type Mock is not JSON serializable
        python_callable=build_recording_function(recorded_calls),
        op_kwargs={
            'an_int': 4,
            'a_date': date(2019, 1, 1),
            'a_templated_string': "dag {{dag.dag_id}} ran on {{ds}}."
        },
        dag=self.dag)

    self.dag.create_dagrun(
        run_id='manual__' + DEFAULT_DATE.isoformat(),
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        state=State.RUNNING
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    self.assertEqual(1, len(recorded_calls))
    self._assertCallsEqual(
        recorded_calls[0],
        Call(an_int=4,
             a_date=date(2019, 1, 1),
             a_templated_string="dag {} ran on {}.".format(
                 self.dag.dag_id, DEFAULT_DATE.date().isoformat()))
    )
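# The build_recording_function and Call helpers used above are defined elsewhere in
# the test module; the following is a hypothetical sketch of what they might look
# like, assuming Call simply records the arguments of each invocation.
class Call:
    def __init__(self, *args, **kwargs):
        # record exactly what the callable was invoked with
        self.args = args
        self.kwargs = kwargs


def build_recording_function(calls_collection):
    # Returns a plain function (rather than a Mock, which is not JSON serializable)
    # that appends every invocation to calls_collection for later assertions.
    def recording_function(*args, **kwargs):
        calls_collection.append(Call(*args, **kwargs))
    return recording_function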
def test_python_operator_run(self):
    """Tests that the python callable is invoked on task run."""
    task = PythonOperator(
        python_callable=self.do_run,
        task_id='python_operator',
        dag=self.dag)
    self.assertFalse(self.is_run())
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    self.assertTrue(self.is_run())
def test_echo_env_variables(self):
    """
    Test that env variables are exported correctly to the
    python callback in the task.
    """
    self.dag.create_dagrun(
        run_id='manual__' + DEFAULT_DATE.isoformat(),
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        state=State.RUNNING,
        external_trigger=False,
    )
    t = PythonOperator(task_id='hive_in_python_op',
                       dag=self.dag,
                       python_callable=self._env_var_check_callback)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
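# _env_var_check_callback is not shown in this excerpt; a plausible sketch, assuming
# it asserts on the standard AIRFLOW_CTX_* variables that the operator exports to the
# environment before invoking the callable (the exact assertions are a guess, and
# `import os` is assumed at module level).
def _env_var_check_callback(self):
    assert os.environ['AIRFLOW_CTX_DAG_ID'] == self.dag.dag_id
    assert os.environ['AIRFLOW_CTX_TASK_ID'] == 'hive_in_python_op'
    assert os.environ['AIRFLOW_CTX_EXECUTION_DATE'] == DEFAULT_DATE.isoformat()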
# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['sire']
start_date = general.start_date['sire']

#: Dag spec
dag = DAG(dag_id='sire_docs',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

sire_docs_latest_only = LatestOnlyOperator(task_id='sire_docs_latest_only', dag=dag)

#: Get sire tables
get_doc_tables = PythonOperator(
    task_id='get_sire_tables',
    python_callable=get_sire,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
#: sire_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(sire_docs_latest_only)

files = [f for f in os.listdir(conf['prod_data_dir'])]
for f in files:
    if f.split('_')[0] == "sire":
        #: Upload sire prod files to S3
        upload_doc_tables = S3FileTransferOperator(
            task_id='upload_{}'.format(f),
            source_base_path=conf['prod_data_dir'],
def fun_task_timeout_monitor(ds, dag, **op_kwargs):
    dag_ids = dag.dag_id
    msg = [{
        "dag": dag,
        "db": "opay_dw",
        "table": "{dag_name}".format(dag_name=dag_ids),
        "partition": "country_code=NG/dt={pt}".format(pt=ds),
        "timeout": "3000"
    }]
    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(task_id='task_timeout_monitor',
                                      python_callable=fun_task_timeout_monitor,
                                      provide_context=True,
                                      dag=dag)

##----------------------------------------- Variables ---------------------------------------##
db_name = "opay_dw"
table_name = "app_opay_transaction_consume_scenario_sum_m"
hdfs_path = "oss://opay-datalake/opay/opay_dw/" + table_name


##---- hive operator ---##
def app_opay_transaction_consume_scenario_sum_m_sql_task(ds):
    HQL = '''
    set mapred.max.split.size=1000000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.parallel=true; --default false
#: Get CFS data from FTP and save to temp folder
get_cfs_data = BashOperator(
    task_id='get_cfs_data',
    bash_command=get_cfs_data(),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process CFS data and save result to prod folder
process_cfs_data = PythonOperator(
    task_id='process_cfs_data',
    python_callable=process_cfs_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod file to S3
cfs_to_S3 = S3FileTransferOperator(
    task_id='cfs_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_calls_for_service_' + curr_year + '_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_calls_for_service_' + curr_year + '_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag_id = "just_say_hello"

with DAG(dag_id=dag_id, start_date=datetime(2018, 11, 14), schedule_interval=None) as dag:

    def say_hello():
        print("Hello Airflow!")

    PythonOperator(task_id="say_hello", python_callable=say_hello)
    'start_date': today - timedelta(days=2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('chained_job', schedule_interval='@once', default_args=default_args)

producer = PythonOperator(
    task_id='run_job_producer',
    python_callable=run_job,
    op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'),
    retries=1,
    dag=dag
)

consumer = PythonOperator(
    task_id='run_job_consumer',
    python_callable=run_job,
    op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'),
    retries=1,
    dag=dag
)

consumer.set_upstream(producer)
dag = DAG(dag_id='dsd_code_enforcement',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['dsd_code_enforcement'])

#: Latest Only Operator for dsd code enforcement
dsd_ce_latest_only = LatestOnlyOperator(
    task_id='dsd_code_enf_latest_only', dag=dag)

#: Download code enforcement files and unzip them.
get_code_enf_files = PythonOperator(
    task_id='get_code_enf_files',
    python_callable=dfg.get_files,
    op_kwargs={'fname_list': fname_list,
               'target_dir': dsd_temp_dir},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_code_enf_md = get_seaboard_update_dag('code-enforcement-violations.md', dag)

#: Execution rules
#: dsd_code_enf_latest_only must run before get_code_enf_files
get_code_enf_files.set_upstream(dsd_ce_latest_only)

for i in fname_list:
    #: Create fme shell command
    return None


# load new data to mongodb
load_new_data_task = PythonOperator(
    task_id='load_new_data',
    python_callable=load_new_data,
    dag=dag)


def extract_type(ds, **kwargs):
    year, month, day = ds.split('-')  # 2016-04-22
    c_ds = "%s/%s/%s" % (day, month, year)  # 15/12/2014
    count = 0
    tp = kwargs['tp']
    keyword = kwargs['keyword']
    for andamento in Andamentos.objects(data=c_ds):
        texto_lw = andamento.texto.lower()
        if keyword in texto_lw:
            andamento.tipo = tp
            andamento.save()
            count += 1
    return count


for tp in PROGRESS_TYPES:
    extract_tipo_task = PythonOperator(
        task_id='extract_%s_task' % (tp,),
        python_callable=extract_type,
        op_kwargs={'tp': tp, 'keyword': PROGRESS_TYPES[tp]},
        dag=dag,
        provide_context=True)
    extract_tipo_task.set_upstream(load_new_data_task)
##----------------------------------------- Task timeout monitoring ---------------------------------------##

def fun_task_timeout_monitor(ds, dag, **op_kwargs):
    dag_ids = dag.dag_id

    msg = [
        {"dag": dag,
         "db": "oride_dw",
         "table": "{dag_name}".format(dag_name=dag_ids),
         "partition": "country_code=NG/dt={pt}".format(pt=ds),
         "timeout": "800"}
    ]

    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(
    task_id='task_timeout_monitor',
    python_callable=fun_task_timeout_monitor,
    provide_context=True,
    dag=dag
)

##----------------------------------------- Script ---------------------------------------##

def dwd_oride_assets_sku_df_sql_task(ds):
    HQL = '''
    set hive.exec.parallel=true;
    set hive.exec.dynamic.partition.mode=nonstrict;

    INSERT overwrite TABLE oride_dw.{table} partition(country_code,dt)
                                     total_data['申万一级行业'].values, er):
        detail_info[str(code)] = {
            'weight': w,
            'industry': ind,
            'zz500': bm_w,
            'er': r
        }

    portfolio_dict = {'Date': prev_date, 'portfolio': detail_info}
    portfolio_collection.delete_many({'Date': prev_date})
    portfolio_collection.insert_one(portfolio_dict)
    portfolio.to_csv(
        '~/mnt/sharespace/personal/licheng/portfolio/zz500_mutual_fund/{0}.csv'
        .format(prev_date.strftime('%Y-%m-%d')),
        encoding='gbk')
    return 0


run_this1 = PythonOperator(task_id='update_daily_portfolio_mutual_fund',
                           provide_context=True,
                           python_callable=update_daily_portfolio_mutual_fund,
                           dag=dag)

if __name__ == '__main__':
    update_daily_portfolio_mutual_fund(None,
                                       next_execution_date=dt.datetime(2017, 6, 14))
    start_date=start_date,
    schedule_interval=general.schedule['indicator_bacteria_tests'])

#: Latest Only Operator for traffic_counts
wtr_latest_only = LatestOnlyOperator(task_id='water_latest_only', dag=dag)

# TODO - teach me how to be yearly
# Pull out all indicator bac tests.
get_indicator_bac_tests = PythonOperator(
    task_id='get_indicator_bac_tests',
    python_callable=get_indicator_bacteria_tests,
    op_kwargs={
        'date_start': '01-JUN-2014',
        'date_end': (datetime.now() + timedelta(days=5)).strftime('%d-%b-%Y')
    },
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

# Get last bacteria tests for any given point.
get_latest_bac_tests = PythonOperator(
    task_id='get_latest_bac_tests',
    python_callable=get_latest_bac_tests,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)
    # schedule_interval defines the frequency you want to run a dag.
    # In this case every day at 5pm (cron notation)
    schedule_interval='0 17 * * *'
)


# function called by the PythonOperator when `python_task` is executed
def print_hello():
    print('Hello world :)')


# binds the tasks to the dag using the `with dag` context manager
with dag:
    # task `dummy_task_id` is an instance of DummyOperator.
    # An Operator creates objects that become nodes in the dag.
    dummy_task = DummyOperator(task_id='dummy_task_id',
                               retries=5)

    # task `hello_task_id` is an instance of PythonOperator.
    # PythonOperator executes a Python callable, in this case the
    # `print_hello` function
    python_task = PythonOperator(task_id='hello_task_id',
                                 python_callable=print_hello)

    # Define task dependencies (using the bitshift operator `>>`)
    # Task execution order: the first to be executed is dummy_task.
    # If dummy_task succeeds, then `python_task` will be triggered
    dummy_task >> python_task
def pull_function(**kwargs):
    ti = kwargs['ti']
    pulled_message = ti.xcom_pull(key='message', task_ids='new_push_task')
    print("Pulled Message: '%s'" % pulled_message)


def new_push_function(**kwargs):
    message = 'This is the NEW pushed message.'
    ti = kwargs['ti']
    ti.xcom_push(key="message", value=message)


t1 = PythonOperator(task_id='push_task', python_callable=push_function,
                    provide_context=True, dag=DAG)

t2 = PythonOperator(task_id='pull_task', python_callable=pull_function,
                    provide_context=True, dag=DAG)

t3 = PythonOperator(task_id='new_push_task', python_callable=new_push_function,
                    provide_context=True, dag=DAG)

t1 >> t3 >> t2
import datetime as dt

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'me',
    'start_date': dt.datetime(2020, 5, 1),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}


def print_world():
    print('world')


with DAG('airflow_tutorial_v01',
         default_args=default_args,
         schedule_interval=None,
         ) as dag:

    print_world = PythonOperator(task_id='print_world',
                                 python_callable=print_world)

    print_world
    if anything_new:
        return 'yes_generate_notification'
    else:
        return 'no_do_nothing'


def generate_message(**context):
    _, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info')
    print("Generate the message to send to Slack and save it to a file")


with DAG('comic_app_v2', default_args=default_args, schedule_interval='@daily') as dag:

    get_read_history = PythonOperator(
        task_id='get_read_history',
        python_callable=process_metadata,
        op_args=['read']
    )

    check_comic_info = PythonOperator(
        task_id='check_comic_info',
        python_callable=check_comic_info,
        provide_context=True
    )

    decide_what_to_do = BranchPythonOperator(
        task_id='new_comic_available',
        python_callable=decide_what_to_do,
        provide_context=True
    )
        if errorMessage:
            logging.error('DAP Processor Failure. See Exception')
            raise Exception(errorMessage)
    finally:
        print('Exit Status: {}'.format(exitStatus))
        print('StdOut: {}'.format(stdOutput))
        print('StdErr: {}'.format(errorMessage))
        if sshConn:
            sshConn.close()


# run_DAP_Processor = SSHOperator(
#     task_id='Run_DAP_Processor',
#     provide_context=True,
#     ssh_conn_id='DAP_App_Server',
#     # command = 'E:\\Airflow_Test\\DAP\DAPConsoleProcessor.exe -config "E:\\Airflow_Test\\DAP\\Configuration\\DAPTCPBrands.xml" -jobname "DAPTCPBrands.xml"',
#     command='whoami',
#     timeout=3600,
#     do_xcom_push=True,
#     get_pty=True,
#     dag=dag
# )

runDAPTask = PythonOperator(task_id='Run_DAP_Processor',
                            provide_context=True,
                            python_callable=runDAP,
                            dag=dag)
schedule = general.schedule
start_date = general.start_date['pd_col']

dag = DAG(
    dag_id='pd_col',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule['pd_col'])

#: Latest Only Operator for pd_col
pd_col_latest_only = LatestOnlyOperator(
    task_id='pd_col_latest_only', dag=dag)

#: Get collisions data from FTP and save to temp folder
get_collisions_data = PythonOperator(
    task_id='get_collisions_data',
    python_callable=get_collisions_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process collisions data and save result to prod folder
process_collisions_data = PythonOperator(
    task_id='process_collisions_data',
    python_callable=process_collisions_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod file to S3
collisions_to_S3 = S3FileTransferOperator(
dag = DAG('lesson2.exercise3',
          start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
          end_date=datetime.datetime(2019, 1, 1, 0, 0, 0, 0),
          schedule_interval='@monthly',
          max_active_runs=1)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    # TODO: ensure that we provide context to our Python Operator
    provide_context=True,
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = PythonOperator(
    task_id='load_stations_from_s3_to_redshift',
    dag=dag,
    python_callable=load_station_data_to_redshift,
    'value': 'airflow'
}]


def print_stuff():
    print("stuff!")


def use_zip_binary():
    rc = os.system("zip")
    assert rc == 0


# You don't have to use any special KubernetesExecutor configuration if you don't want to
start_task = PythonOperator(
    task_id="start_task",
    python_callable=print_stuff,
    dag=dag
)

# But you can if you want to
one_task = PythonOperator(
    task_id="one_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci:latest"}}
)

# Use the zip binary, which is only found in this special docker image
two_task = PythonOperator(
    task_id="two_task",
    python_callable=use_zip_binary,
    dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}}
)

# Limit resources on this operator/task with node affinity & tolerations
hr = moment.hour

bucket_key_template = f'{source}/{year}/{month}/{day}/ypsource.json'

get_new_json = S3KeySensor(task_id="get_new_json",
                           poke_interval=60 * 2,
                           timeout=60 * 60 * 3,
                           bucket_key=bucket_key_template,
                           bucket_name=b_name,
                           wildcard_match=False,
                           aws_conn_id="s3_task",
                           dag=dag)

# get_from_S3 = PythonOperator(
#     task_id='get_from_S3',
#     python_callable=get_file_from_s3,
#     dag=dag
# )

upload_to_S3_task = PythonOperator(
    task_id='upload_file_to_S3',
    python_callable=upload_file_to_S3_with_hook,
    params={
        'filename': '/home/akorede/Documents/mycsv.csv',
        'key': 'mycsv.csv',
        'bucket_name': 'ypsource-bucket',
    },
    provide_context=True,
    dag=dag)

# Use arrows to set dependencies between tasks
upload_to_S3_task.set_upstream(get_new_json)
# output files generated by this task and naming convention
# is direction(from or to)_twitterHandle_date.csv
# --------------------------------------------------------------------------------

fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can get rid of or cherry pick columns
# and different parts of the text
# --------------------------------------------------------------------------------

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag)

clean_tweets.set_upstream(fetch_tweets)

# --------------------------------------------------------------------------------
# In this section you can use a script to analyze the twitter data. Could simply
# be a sentiment analysis through algorithms like bag of words or something more
# complicated. You can also take a look at Web Services to do such tasks
# --------------------------------------------------------------------------------

analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    python_callable=analyzetweets,
    dag=dag)
spark_submit_task = SparkSubmitOperator(
    task_id='spark_submit_job',
    conn_id='spark_default',
    java_class='com.scaledata.softbug.datasources.apache.AccessParser',
    application=EXECUTABLE_PATH,
    # application_args=[' '.join(['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()])],
    # application_args=['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()],
    application_args=["{{ti.xcom_pull(task_ids='push_xcom')}}"],
    total_executor_cores='1',
    executor_cores='1',
    executor_memory='2g',
    num_executors='2',
    name='spark-airflow-phoenix',
    verbose=True,
    driver_memory='1g',
    xcom_push='true',
    conf=config,
    dag=dag,
)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

push_xcom_task = PythonOperator(task_id='push_xcom', python_callable=push_xcom, dag=dag)

pull_xcom_task = PythonOperator(task_id='pull_xcom', python_callable=pull_xcom,
                                templates_dict={'_application_args': PARAMS}, dag=dag)

dummy_operator >> push_xcom_task >> pull_xcom_task >> spark_submit_task
    print("annotated!")


def test_volume_mount():
    with open('/foo/volume_mount_test.txt', 'w') as foo:
        foo.write('Hello')

    rc = os.system("cat /foo/volume_mount_test.txt")
    assert rc == 0


# You can use annotations on your kubernetes pods!
start_task = PythonOperator(
    task_id="start_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "annotations": {"test": "annotation"}
        }
    }
)

# You can mount volume or secret to the worker pod
second_task = PythonOperator(
    task_id="four_task",
    python_callable=test_volume_mount,
    dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "volumes": [
                {
                    "name": "test-volume",
                    "hostPath": {"path": "/tmp/"},
                },
            ],
dag.doc_md = __doc__


def response_check(response):
    """
    Dumps the http response and returns True when the http call status is
    200/success
    """
    print(response)
    print(response.text)
    return response.status_code == 200


t2 = SimpleHttpOperator(task_id='heroku_coin',
                        http_conn_id='heroku_conn',
                        method='GET',
                        endpoint='',
                        headers={"Content-Type": "application/json"},
                        xcom_push=True,
                        response_check=response_check,
                        dag=dag)


def print_hello():
    return 'Hello world!'


hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

t2.set_upstream(hello_operator)
def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping task, sleeping from 0 to 9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_' + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i) / 10},
        dag=dag)

    task.set_upstream(run_this)
    session = settings.Session()
    for x in OBJECTS_TO_EXPORT:
        result = session.execute(text(x[0]))
        stream_to_S3_fn(result, x[1])
    session.close()

    return "OK"


with DAG(dag_id=dag_id, schedule_interval=None, catchup=False, start_date=days_ago(1)) as dag:

    back_up_activedags_t = PythonOperator(task_id="back_up_activedags", python_callable=back_up_activedags)
    pause_dags_t = PythonOperator(task_id="pause_dags", python_callable=pause_dags)
    export_active_dags_t = PythonOperator(task_id="export_active_dags", python_callable=export_active_dags)
    export_variable_t = PythonOperator(task_id="export_variable", python_callable=export_variable)
    export_data_t = PythonOperator(task_id="export_data", python_callable=export_data, provide_context=True)

    # back up all active dags; pause the dags; export all the tables in OBJECTS_TO_EXPORT;
    # export the active dags so they can be turned on in the new environment;
    # export variables.
    back_up_activedags_t >> pause_dags_t >> export_data_t
    pause_dags_t >> export_active_dags_t
    dag_id='traffic_counts',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)

#: Latest Only Operator for traffic_counts
tc_latest_only = LatestOnlyOperator(
    task_id='traffic_counts_latest_only', dag=dag)

#: Downloads traffic counts xlsx from share
get_traffic_counts = PythonOperator(
    task_id='get_traffic_counts',
    python_callable=get_traffic_counts,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Cleans the downloaded XLSX file, converts it to CSV data.
clean_traffic_counts = PythonOperator(
    task_id='clean_traffic_counts',
    python_callable=clean_traffic_counts,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag
)

#: Builds the prod file
    server.login(username, password)
    server.sendmail(emailfrom, emailto, msg.as_string())
    server.quit()


with DAG('etl_us_marketing_daily_email',
         default_args=default_args,
         schedule_interval='@daily',
         concurrency=10,
         max_active_runs=1) as dag:

    hold_on = TimeSensor(task_id='hold_on',
                         target_time=time(hour=12, minute=30),
                         dag=dag)

    send_email = PythonOperator(
        task_id='generate_and_send_email',
        python_callable=generate_HTML_and_send_email,
        op_args=[],
        # The following is required to pass macros to the PythonOperator
        # See https://stackoverflow.com/a/45870153
        provide_context=True,
        retries=0,
        dag=dag)

    remove_tmp_file = BashOperator(task_id='remove_fb_tmp_file',
                                   bash_command='rm -f {}'.format(name),
                                   dag=dag)

    hold_on >> send_email >> remove_tmp_file
    df = df.astype('float64')
    df.to_csv('dags/c2k_final.csv')


default_args = {
    'owner': 'Israel Z',
    'start_date': dt.datetime(2018, 5, 9),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

with DAG('flow_pandas',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         ) as dag:

    download = PythonOperator(task_id='download', python_callable=download)
    dropn = PythonOperator(task_id='dropn', python_callable=dropn)
    fill = PythonOperator(task_id='fill', python_callable=fill)
    cast = PythonOperator(task_id='cast', python_callable=cast)

    # Dependencies
    dropn.set_upstream(download)
    fill.set_upstream(dropn)
    cast.set_upstream(fill)
# Use cron to define exact time. Eg. 8:15am would be "15 08 * * *"
schedule_interval = "@daily"

scriptpath = './singer_data/'
# scriptpath = '/media/navneetsajwan/DAC4BFAEC4BF8AF15/Learn-Apache-Airflow-in-easy-way--main/project/singer_data/'

# Define DAG: Set ID and assign default args and schedule interval
dag = DAG(
    'dag_3',
    default_args=default_args,
    schedule_interval=schedule_interval
)

#===============================================================================
# extract data from mysql and store into csv file
t1 = PythonOperator(
    task_id='tap_mysql_target_csv',
    python_callable=tap_mysql_target_csv,
    dag=dag,
)

#================================================================================
# create today dir if not exists
t2 = PythonOperator(
    task_id='today_dir',
    python_callable=today_dir,
    dag=dag,
)

#================================================================================
# move extracted csv file into today dir folder
t3 = PythonOperator(
    task_id='move_files',
DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(
    start_date=DEFAULT_DATE,
    owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()
    return


# DAG tests that tasks ignore all dependencies
dag1 = DAG(dag_id='test_run_ignores_all_dependencies',
           default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(
    task_id='test_run_dependency_task',
    python_callable=fail,
    dag=dag1,)
dag1_task2 = PythonOperator(
    task_id='test_run_dependent_task',
    python_callable=success,
    provide_context=True,
    dag=dag1,)
dag1_task1.set_downstream(dag1_task2)
import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

args = {"start_date": airflow.utils.dates.days_ago(2), "owner": "royi"}

dag = DAG(dag_id="plugin_other_dag", default_args=args, schedule_interval=None)


def task1():
    print("Hello World!")
    return "Hello World!"


def task2():
    print("Shalom Olam!")
    return "Shalom Olam!"


def task3():
    print("Ola Mundo!")
    return "Ola Mundo!"


op1 = PythonOperator(task_id="task1", python_callable=task1, dag=dag)
op2 = PythonOperator(task_id="task2", python_callable=task2, dag=dag)
op3 = PythonOperator(task_id="task3", python_callable=task3, dag=dag)

op1 >> op2 >> op3
#: Dag spec for dsd approvals
dag = DAG(dag_id='dsd_approvals',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for dsd approvals.
dsd_approvals_latest_only = LatestOnlyOperator(
    task_id='dsd_approvals_latest_only', dag=dag)

#: Get most recent weekly permit approvals reports
get_approvals_files = PythonOperator(
    task_id='get_approvals_files',
    python_callable=dfg.get_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    op_kwargs={'fname_list': fnames,
               'target_dir': dsd_temp_dir},
    dag=dag)

#: dsd_approvals_latest_only must run before get_approvals_files
get_approvals_files.set_upstream(dsd_approvals_latest_only)

#: update github modified date (solar permits)
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)

for key in app.approval_dict:
    #: Consolidate weekly permitting data by scraping OpenDSD API
            csvwriter.writerow(csv_header)
            for row in campaign_data:
                logging.info(row)
                csvwriter.writerow(row)
        except:
            pass
    except ValueError:
        pass


download_data = PythonOperator(
    task_id='task_download_rtbiq_impressions_data',
    provide_context=True,
    python_callable=download_rtbiq_impressions_data
)

finish_download = DummyOperator(task_id='task_finish_download')
finish_move_to_client_gcs = DummyOperator(task_id='task_finish_move_to_client_gcs')
finish_upload_to_bq = DummyOperator(task_id='task_finish_upload_to_bq')
complete = DummyOperator(task_id='task_complete')

execution_date = '{{ ds_nodash }}'

move_to_client_gcs = BashOperator(
    task_id='task_move_to_client_gcs',
    bash_command='gsutil -m mv '
                 + 'gs://' + config['gcp_composer_gcs_bucket'] + '/data/rtbiq_data/{0}*'.format(execution_date)
                 + ' '
                 + 'gs://' + config['gcp_client_gcs_bucket'] + '/data/rtbiq_data/'
# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['public_art']
start_date = general.start_date['public_art']

#: Dag spec
dag = DAG(dag_id='public_art',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

public_art_latest_only = LatestOnlyOperator(task_id='public_art_latest_only', dag=dag)

#: Get public art from NetX, process, output prod file
get_public_art = PythonOperator(
    task_id='get_public_art',
    python_callable=get_public_art,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod art file to S3
upload_public_art = S3FileTransferOperator(
    task_id='upload_public_art',
    source_base_path=conf['prod_data_dir'],
    source_key='public_art_locations_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='public_art/public_art_locations_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
"eamil_on_retry": True, "retries": 0, "retry_delay": timedelta(minutes=5) } dag = DAG( "project_pipeline", description="Building the entire project", # train every first day of the month schedule_interval="@monthly", default_args=default_args, catchup=False) with dag: task_1_create_base_features = PythonOperator( task_id="generate_base_features", python_callable=Generate_base_features) task_2_create_historical_features = PythonOperator( task_id="generate_historic_features", python_callable=Generate_historical_features) task_3_create_advanced_features = PythonOperator( task_id="generate_advanced_features", python_callable=Generate_advanced_features) task_4_select_features = PythonOperator(task_id="feature_selection", python_callable=Feature_selection) task_5_train_lgb_model = PythonOperator(task_id="train_lgb_model", python_callable=Train_LGB_Model)
schedule = general.schedule['fd_incidents']
start_date = general.start_date['fd_incidents']
cur_yr = general.get_year()

#: Dag spec
dag = DAG(dag_id='fd_problem_nature',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for fd
fd_latest_only = LatestOnlyOperator(task_id='fd_latest_only', dag=dag)

#: Get fire_department data from DB
get_fd_data = PythonOperator(
    task_id='get_fd_data',
    python_callable=get_fd_data,
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod fire_department_SD.csv file to S3
upload_fd_data = S3FileTransferOperator(
    task_id='upload_fd_data',
    source_base_path=conf['prod_data_dir'],
    source_key='/fd_problems_{}_datasd.csv'.format(cur_yr),
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='fd_cad/' + 'fd_problems_{}_datasd.csv'.format(cur_yr),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    return f"Data sending completed"


with DAG(
    dag_id=f'populate_{Variable.get("postal", 2650)}_estates',
    description=f"Populate estates to {TABLE_NAME}",
    default_args=args,
    # Start 10 minutes ago
    # days_ago(2)
    start_date=datetime.now(),
    schedule_interval="30 23 * * 1-5",
) as dag:

    push_bolig_postal = PythonOperator(
        task_id=f'load_{Variable.get("postal", 2650)}_bolig_data',
        python_callable=get_bolig,
        op_args=[
            Variable.get("postal", 2650),
        ],
        dag=dag,
        provide_context=True,
    )

    process_completed = PythonOperator(
        task_id="mining_completed",
        dag=dag,
        python_callable=process_completed,
        provide_context=True,
    )

    push_bolig_postal >> process_completed
start_date = general.start_date['ttcs']

#: Dag definition
dag = DAG(dag_id='ttcs',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['ttcs'])

#: Latest Only Operator for ttcs
ttcs_latest_only = LatestOnlyOperator(
    task_id='ttcs_latest_only', dag=dag)

#: Get active businesses and save as .csv to temp folder
get_active_businesses = PythonOperator(
    task_id='get_active_businesses',
    python_callable=get_active_businesses,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process temp data and save as .csv to prod folder
clean_data = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Geocode new entries and update production file
geocode_data = PythonOperator(
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2019, 1, 1, 0, 0, 0, 0),
    schedule_interval="@monthly",
    max_active_runs=1,
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
)

copy_trips_task = PythonOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
)

check_trips = PythonOperator(
    task_id="check_trips_data",
    dag=dag,
    python_callable=check_greater_than_zero,
    provide_context=True,
    params={"table": "trips"},
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    # pushes an XCom without a specific target, just by returning it
    return value_2


def puller(**kwargs):
    ti = kwargs['ti']

    # get value_1
    v1 = ti.xcom_pull(key=None, task_ids='push')
    assert v1 == value_1

    # get value_2
    v2 = ti.xcom_pull(task_ids='push_by_returning')
    assert v2 == value_2

    # get both value_1 and value_2
    v1, v2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning'])
    assert (v1, v2) == (value_1, value_2)


push1 = PythonOperator(
    task_id='push',
    dag=dag,
    python_callable=push)

push2 = PythonOperator(
    task_id='push_by_returning',
    dag=dag,
    python_callable=push_by_returning)

pull = PythonOperator(
    task_id='puller',
    dag=dag,
    python_callable=puller)

pull.set_upstream([push1, push2])
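# The definition of the `push` callable is cut off in this excerpt; a plausible
# sketch, assuming it stores value_1 via an explicit xcom_push. The key name here
# is illustrative only — the puller above uses key=None, which removes the key
# filter, so any key would satisfy its assertion.
def push(**kwargs):
    # pushes an XCom under an explicit key
    kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)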
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create set of TABLES in target Redshift Cluster / Postgres database
create_tables = PostgresOperator(task_id="create_redshift_tables",
                                 dag=dag,
                                 postgres_conn_id="redshift",
                                 sql='/sql_statements/create_tables.sql')

############# Load files to S3

# Load GNIS database file to S3
gnis_to_s3 = PythonOperator(task_id='gnis_to_s3',
                            dag=dag,
                            python_callable=load_to_s3,
                            provide_context=True,
                            op_kwargs={
                                'location': 'tmp_data',
                                'filename': 'NationalFile_20200301.txt',
                                's3_bucket': 'dend-lake',
                                's3_key': 'gnis',
                                'aws_credentials_id': 'aws_credentials',
                            })

############# Load S3 -> Staging Tables

# Load GNIS table to a staging table in Redshift
staging_gnis_2_redshift = PythonOperator(
    task_id='staging_gnis_2_redshift',
    dag=dag,
    python_callable=load_data_to_redshift,
    provide_context=True,
    op_kwargs={
        's3_location': "s3://dend-lake/gnis/NationalFile_20200301.txt",
        'target_table': 'gnis_staging',
dag6_task1 = DummyOperator(
    task_id='test_depends_on_past',
    depends_on_past=True,
    dag=dag6,)
dag6_task2 = DummyOperator(
    task_id='test_depends_on_past_2',
    depends_on_past=True,
    dag=dag6,)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
                session.delete(entry)
            logging.info("Finished Performing Delete")
        else:
            logging.warn("You've opted to skip deleting the db entries!!!")

    logging.info("Finished Running Cleanup Process")


with DAG(
    DAG_ID,
    default_args=default_args,
    schedule_interval=SCHEDULE_INTERVAL,
    start_date=START_DATE
) as dag:

    close_session = PythonOperator(
        task_id='close_session',
        python_callable=close_session_function,
    )

    print_configuration = PythonOperator(
        task_id='print_configuration',
        python_callable=print_configuration_function,
        provide_context=True,
    )

    for db_object in DATABASE_OBJECTS:
        cleanup = PythonOperator(
            task_id='cleanup_' + str(db_object["airflow_db_model"].__name__),
            python_callable=cleanup_function,
            params=db_object,
            provide_context=True,
#: Dag spec for dsd permits
dag = DAG(dag_id='dsd_permits',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for dsd permits.
dsd_permits_latest_only = LatestOnlyOperator(
    task_id='dsd_permits_latest_only', dag=dag)

#: Get permits reports
get_permits_files = PythonOperator(
    task_id='get_permits_files',
    python_callable=get_permits_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Clean permits reports
clean_data = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Join BIDs to permits
join_bids = PythonOperator(
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 7, 22, 5, 0),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'concurrency': 1
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

with DAG('instances_auto_on',
         catchup=False,
         default_args=default_args,
         schedule_interval="0 8 * * *") as dag:

    opr_startup = PythonOperator(task_id='startup',
                                 python_callable=main,
                                 op_kwargs={
                                     'aws_acces_key_id': '{{ aws_access_key_id }}',
                                     'aws_secret_access_key': '{{ aws_secret_key }}',
                                     'aws_region': '{{ aws_region }}',
                                     'role_arn': '{{ aws_role_arn }}',
                                     'direct_login': True
                                 })
check_updates_with_judges_task = PythonOperator(
    task_id='check_updates_with_judges',
    python_callable=check_updates_with_judges,
    dag=dag)


def extract_name():
    # TODO: Create a function to extract the judge's name from the text
    return None
    # http://blog.yhat.com/posts/named-entities-in-law-and-order-using-nlp.html


def check_name():
    # TODO: Verify the extracted name
    return None
    # Validate against a database of judge names (Portal da Transparência)


extract_name_task = PythonOperator(
    task_id='extract_name_task',
    python_callable=extract_name,
    dag=dag)

check_name_task = PythonOperator(
    task_id='check_name_task',
    python_callable=check_name,
    dag=dag)

extract_name_task.set_upstream(check_updates_with_judges_task)
check_name_task.set_upstream(extract_name_task)
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates Operators needed for model evaluation and returns.

    It gets prediction over inputs via Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarize and validate
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
        pred, _, val = create_evaluate_ops(...)
        pred.set_upstream(upstream_op)
        ...
        downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.

    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validation_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances. The key/value of the dictionary matches to
      what's given by metric_fn_and_keys arg. The dictionary contains an
      additional metric, 'count' to represent the total number of instances
      received for evaluation. The function would raise an exception to mark
      the task as failed, in a case the validation result is not okay to
      proceed (i.e. to set the trained version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.

        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes - label)
            squared_err = math.pow(classes - label, 2)
            return (err, squared_err)  # returns a tuple.

        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: str
    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: str
    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list[str]
    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: str
    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list[str]
    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function
    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: str
    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: str
    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: str
    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary
    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator
        for more detail.
    :type model_uri: str
    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: str
    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: str
    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.models.DAG
    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
        PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: %s",
                             prediction_path)
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
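# A minimal sketch of how the three returned operators might be wired into a DAG,
# following the pattern shown in the docstring above. The task prefix, GCS paths,
# model name, and the upstream/downstream task names are placeholders, not values
# taken from any real pipeline.
eval_pred, eval_summary, eval_validate = create_evaluate_ops(
    task_prefix="eval-flights",                       # alphanumeric and hyphens only
    data_format="TEXT",
    input_paths=["gs://my-bucket/eval/data*.json"],   # placeholder GCS path
    prediction_path="gs://my-bucket/eval/output",     # placeholder GCS path
    metric_fn_and_keys=get_metric_fn_and_keys(),      # as in the docstring example
    validate_fn=validate_err_and_count,               # as in the docstring example
    model_name="my_model",                            # placeholder model name
    dag=dag)

eval_pred.set_upstream(train_model_op)        # assumed upstream training task
publish_model_op.set_upstream(eval_validate)  # assumed downstream publish task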
#: Dag spec
dag = DAG(dag_id='special_events',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for special events
se_latest_only = LatestOnlyOperator(task_id='se_latest_only', dag=dag)

#: Get special events from DB
get_special_events = PythonOperator(
    task_id='get_special_events',
    python_callable=get_special_events,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process and geocode raw special events file
process_special_events = PythonOperator(
    task_id='process_special_events',
    python_callable=process_special_events,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod SE file to S3
upload_special_events = S3FileTransferOperator(
dag = DAG('scripts', default_args=default_args, schedule_interval=timedelta(days=1))


def print_thing(thing, **kwargs):
    print(f'{kwargs.get("ds")}: {thing}')
    pprint(kwargs)


def print_five():
    for i in range(5):
        print(i)


t1 = PythonOperator(
    task_id='print_thing',
    python_callable=print_thing,
    provide_context=True,
    op_kwargs=dict(
        thing='something',
    ),
    dag=dag,
)

t2 = PythonOperator(
    task_id='print_five',
    python_callable=print_five,
    dag=dag,
)

t1 >> t2
    dag=dag)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    # If the script at s3 location has any qubole specific macros to be replaced
    # macros='[{"date": "{{ ds }}"}, {"name" : "abc"}]',
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)

branching.set_upstream(t3)

join = DummyOperator(
    part1 = MIMEText(subtit1, 'html')
    part2 = MIMEText(dados1, 'html')
    part3 = MIMEText(subtit2, 'html')
    part4 = MIMEText(dados2, 'html')
    part5 = MIMEText(subtit3, 'html')
    part6 = MIMEText(dados3, 'html')

    message.attach(sumario)
    message.attach(titulo)
    message.attach(registros)
    message.attach(part1)
    message.attach(part2)
    message.attach(part3)
    message.attach(part4)
    message.attach(part5)
    message.attach(part6)

    # we will connect securely using SSL
    server = smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port)
    # to interact with an external server we need
    # to log in to it
    server.login(username, password)
    server.sendmail(from_addr, to_addrs, message.as_string())
    server.quit()


t1 = PythonOperator(task_id='popula_relatorios',
                    python_callable=carrega_dados,
                    dag=dag)

t1
args = general.args
conf = general.config
schedule = general.schedule['campaign_fin']
start_date = general.start_date['campaign_fin']
cur_yr = general.get_year()

#: Dag spec
dag = DAG(dag_id='campaign_fin_reports',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

campaign_fin_latest_only = LatestOnlyOperator(task_id='campaign_fin_latest_only', dag=dag)

#: Get 460A transactions
schedule_460A = PythonOperator(
    task_id='get_transactions_a',
    python_callable=get_transactions_a,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Get 460B1 transactions
schedule_460B1 = PythonOperator(
    task_id='get_transactions_b',
    python_callable=get_transactions_b,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Get 460C transactions
schedule_460C = PythonOperator(
def MakeCommonDag(name='istio_daily_flow_test',
                  schedule_interval='15 9 * * *',
                  monthly=False):
    """Creates the shared part of the daily/monthly dags."""
    common_dag = DAG(
        name,
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    def AirflowGetVariableOrBaseCase(var, base):
        try:
            return Variable.get(var)
        except KeyError:
            return base

    def GenerateTestArgs(**kwargs):
        """Loads the configuration that will be used for this Iteration."""
        conf = kwargs['dag_run'].conf
        if conf is None:
            conf = dict()

        # Airflow gives the execution date when the job is supposed to be run;
        # however, we don't backfill and only need to run one build, therefore
        # use the current date instead of the date that is passed in.
        # date = kwargs['execution_date']
        date = datetime.datetime.now()

        timestamp = time.mktime(date.timetuple())

        # Monthly releases started in Nov 2017 with 0.3.0, so minor is the
        # number of months from Aug 2017.
        minor_version = (date.year - 2017) * 12 + (date.month - 1) - 7
        major_version = AirflowGetVariableOrBaseCase('major_version', 0)
        # This code gets information about the latest released version so we
        # know what version number to use for this round.
        r_minor = int(AirflowGetVariableOrBaseCase('released_version_minor', 0))
        r_patch = int(AirflowGetVariableOrBaseCase('released_version_patch', 0))
        # If we have already released a monthly for this month then bump
        # the patch number for the remainder of the month.
        if r_minor == minor_version:
            patch = r_patch + 1
        else:
            patch = 0
        # If version is overridden then we should use it, otherwise we use its
        # default or monthly value.
        version = conf.get('VERSION')
        if monthly and not version:
            version = '{}.{}.{}'.format(major_version, minor_version, patch)

        default_conf = environment_config.get_airflow_config(
            version,
            timestamp,
            major=major_version,
            minor=minor_version,
            patch=patch,
            date=date.strftime('%Y%m%d'),
            rc=date.strftime('%H-%M-%S'))
        config_settings = dict(VERSION=default_conf['VERSION'])
        config_settings_name = [
            'PROJECT_ID',
            'MFEST_URL',
            'MFEST_FILE',
            'GCS_STAGING_BUCKET',
            'SVC_ACCT',
            'GITHUB_ORG',
            'GITHUB_REPO',
            'GCS_GITHUB_PATH',
            'TOKEN_FILE',
            'GCR_STAGING_DEST',
            'GCR_RELEASE_DEST',
            'GCS_MONTHLY_RELEASE_PATH',
            'DOCKER_HUB',
            'GCS_BUILD_BUCKET',
            'RELEASE_PROJECT_ID',
        ]

        for name in config_settings_name:
            config_settings[name] = conf.get(name) or default_conf[name]

        if monthly:
            config_settings['MFEST_COMMIT'] = conf.get(
                'MFEST_COMMIT') or Variable.get('latest_sha')
            gcs_path = conf.get('GCS_MONTHLY_STAGE_PATH')
            if not gcs_path:
                gcs_path = default_conf['GCS_MONTHLY_STAGE_PATH']
        else:
            config_settings['MFEST_COMMIT'] = conf.get(
                'MFEST_COMMIT') or default_conf['MFEST_COMMIT']
            gcs_path = conf.get(
                'GCS_DAILY_PATH') or default_conf['GCS_DAILY_PATH']

        config_settings['GCS_STAGING_PATH'] = gcs_path
        config_settings['GCS_BUILD_PATH'] = '{}/{}'.format(
            config_settings['GCS_BUILD_BUCKET'], gcs_path)
        config_settings[
            'GCS_RELEASE_TOOLS_PATH'] = '{}/release-tools/{}'.format(
                config_settings['GCS_BUILD_BUCKET'], gcs_path)
        config_settings['GCS_FULL_STAGING_PATH'] = '{}/{}'.format(
            config_settings['GCS_STAGING_BUCKET'], gcs_path)
        config_settings['ISTIO_REPO'] = 'https://github.com/{}/{}.git'.format(
            config_settings['GITHUB_ORG'], config_settings['GITHUB_REPO'])

        return config_settings

    generate_flow_args = PythonOperator(
        task_id='generate_workflow_args',
        python_callable=GenerateTestArgs,
        provide_context=True,
        dag=common_dag,
    )

    get_git_commit_cmd = """
    {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %}
    git config --global user.name "TestRunnerBot"
    git config --global user.email "*****@*****.**"
    git clone {{ settings.MFEST_URL }} green-builds || exit 2

    pushd green-builds
    git checkout {{ settings.MFEST_COMMIT }} || exit 5
    SHA=`grep {{ settings.GITHUB_ORG }}/{{ settings.GITHUB_REPO }} {{ settings.MFEST_FILE }} | cut -f 6 -d \\"` || exit 3
    if [ -z ${SHA} ]; then
      echo "SHA not found"
      exit 6
    fi
    popd

    git clone {{ settings.ISTIO_REPO }} istio-code
    pushd istio-code/release
    git checkout ${SHA} || exit 4
    gsutil cp *.sh gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/
    gsutil cp *.json gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/
    popd

    pushd green-builds
    git rev-parse HEAD
    """

    get_git_commit = BashOperator(
        task_id='get_git_commit',
        bash_command=get_git_commit_cmd,
        xcom_push=True,
        dag=common_dag)

    build_template = """
    {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %}
    {% set m_commit = task_instance.xcom_pull(task_ids='get_git_commit') %}
    gsutil cp gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/*.json .
    gsutil cp gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/*.sh .
    chmod +x *
    ./start_gcb_build.sh -w -p {{ settings.PROJECT_ID \
    }} -r {{ settings.GCR_STAGING_DEST }} -s {{ settings.GCS_BUILD_PATH }} \
    -v "{{ settings.VERSION }}" \
    -u "{{ settings.MFEST_URL }}" \
    -t "{{ m_commit }}" -m "{{ settings.MFEST_FILE }}" \
    -a {{ settings.SVC_ACCT }}
    """
    # NOTE: if you add commands to build_template after start_gcb_build.sh then
    # take care to preserve its return value

    build = BashOperator(
        task_id='run_cloud_builder', bash_command=build_template, dag=common_dag)

    test_command = """
    chmod +x /home/airflow/gcs/data/githubctl
    {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %}
    git config --global user.name "TestRunnerBot"
    git config --global user.email "*****@*****.**"
    /home/airflow/gcs/data/githubctl \
    --token_file="{{ settings.TOKEN_FILE }}" \
    --op=dailyRelQual \
    --hub=gcr.io/{{ settings.GCR_STAGING_DEST }} \
    --gcs_path="{{ settings.GCS_BUILD_PATH }}" \
    --tag="{{ settings.VERSION }}"
    """

    run_release_quilification_tests = BashOperator(
        task_id='run_release_quilification_tests',
        bash_command=test_command,
        retries=0,
        dag=common_dag)

    copy_files = GoogleCloudStorageCopyOperator(
        task_id='copy_files_for_release',
        source_bucket=GetSettingTemplate('GCS_BUILD_BUCKET'),
        source_object=GetSettingTemplate('GCS_STAGING_PATH'),
        destination_bucket=GetSettingTemplate('GCS_STAGING_BUCKET'),
        dag=common_dag,
    )

    generate_flow_args >> get_git_commit >> build
    run_release_quilification_tests.set_upstream(build)
    run_release_quilification_tests >> copy_files

    return common_dag, copy_files
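# Hypothetical call sites for the factory above; the DAG names and schedules below
# are placeholders, not the project's actual release configuration.
dag_daily, daily_copy_files = MakeCommonDag(
    name='istio_daily_release_sketch', schedule_interval='15 9 * * *', monthly=False)

dag_monthly, monthly_copy_files = MakeCommonDag(
    name='istio_monthly_release_sketch', schedule_interval=None, monthly=True)

# Further release steps could then be chained after the returned copy_files task.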
dag = DAG(
    dag_id='budget',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)

#: Latest Only Operator for budget
budget_latest_only = LatestOnlyOperator(
    task_id='budget_latest_only', dag=dag)

get_accounts = PythonOperator(
    task_id='get_chart_of_accounts',
    python_callable=get_accounts_chart,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

get_capital_ptd = PythonOperator(
    task_id='get_capital_ptd',
    python_callable=get_capital_ptd,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

get_capital_fy = PythonOperator(
    task_id='get_capital_fy',
    python_callable=get_capital,
                                   index=False,
                                   encoding=encode_set,
                                   mode='a')
        total_count += 1
        print('total count: ' + str(total_count))
        count += 1

    # upload s3
    for deal_ymd in year_month_data:
        contract_df_by_api = pd.read_csv(export_file_path + export_file_name + str(deal_ymd) + '.csv')
        contract_df_by_api.to_csv(csv_buffer)
        s3_resource.Object(bucket, export_file_name + str(deal_ymd) + '.csv').put(Body=csv_buffer.getvalue())

        # send kafka broker
        # kafka_producer.produce(contract_df_by_api.to_json(orient='index'))


# Take only the bucket names from the response and store them as a list in the buckets variable.
t1 = PythonOperator(task_id='task_1',
                    provide_context=True,
                    python_callable=print_variables,
                    op_kwargs={'input_year': Variable.get("arg_year"),
                               'input_month': Variable.get("arg_month")},
                    dag=dag)

t2 = PythonOperator(task_id='task_2',
                    provide_context=True,
                    python_callable=task_detached_contract_data,
                    op_kwargs={'input_year': Variable.get("arg_year"),
                               'input_month': Variable.get("arg_month")},
                    dag=dag)

t1 >> t2
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates Operators needed for model evaluation and returns.

    It gets prediction over inputs via Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarizes and validates
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances. The key/value of the dictionary matches
      what's given by the metric_fn_and_keys arg. The dictionary contains an
      additional metric, 'count', to represent the total number of instances
      received for evaluation. The function should raise an exception to mark
      the task as failed, in case the validation result is not okay to proceed
      (i.e. to set the trained version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.

        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes - label)
            squared_err = math.pow(classes - label, 2)
            return (err, squared_err)  # returns a tuple.

        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters
        and hyphen are allowed (no underscores), since this will be used as
        dataflow job name, which doesn't allow other characters.
    :type task_prefix: string

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: string

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list of strings

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: string

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list of strings

    :param validate_fn: a function to validate whether the averaged metric(s)
        is good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator
        as the job_id argument.
    :type batch_prediction_job_id: string

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: string

    :param region: the Google Cloud Platform region in which to execute Cloud
        ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: string

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator
        for more detail.
    :type model_uri: string

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together
        with model_uri. See MLEngineBatchPredictionOperator for more detail.
        If None, then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: string

    :param version_name: Used to indicate a model version to use for
        prediction, in combination with model_name. Cannot be used together
        with model_uri. See MLEngineBatchPredictionOperator for more detail.
        If None, then the `dag`'s `default_args['version_name']` will be used.
    :type version_name: string

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
        PythonOperator)
    """
    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: %s" %
                             prediction_path)
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
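As a rough usage sketch of create_evaluate_ops: the DAG id, bucket paths, project, and model identifiers below are hypothetical, and get_metric_fn_and_keys / validate_err_and_count are the helper examples shown in the docstring above, so this is illustrative rather than the canonical way to call it.

import datetime

from airflow import DAG

# Hypothetical DAG whose default_args supply the project/region/Dataflow
# fallbacks that create_evaluate_ops reads when they are not passed directly.
eval_dag = DAG(
    dag_id='model_evaluation',
    start_date=datetime.datetime(2019, 1, 1),
    schedule_interval=None,
    default_args={
        'project_id': 'my-gcp-project',
        'region': 'us-central1',
        'dataflow_default_options': {'tempLocation': 'gs://my-bucket/tmp'},
    })

prediction_op, summary_op, validation_op = create_evaluate_ops(
    task_prefix='eval-flights',                       # hyphens only, no underscores
    data_format='TEXT',
    input_paths=['gs://my-bucket/eval/data*.json'],   # hypothetical input
    prediction_path='gs://my-bucket/eval/output',     # hypothetical output
    metric_fn_and_keys=get_metric_fn_and_keys(),      # helper from the docstring
    validate_fn=validate_err_and_count,               # helper from the docstring
    batch_prediction_job_id='eval_flights_prediction',  # should be unique per run
    model_name='my-model',                            # hypothetical deployed model
    version_name='v1',
    dag=eval_dag)

The three operators are already chained prediction -> summary -> validation inside the helper, so only the outer edges (an upstream training task, a downstream deployment task) need to be wired in, along the lines of the pred.set_upstream(...) example in the docstring.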
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world():
    logging.info("Hello World")


#
# TODO: Add a daily `schedule_interval` argument to the following DAG
#
dag = DAG(
    "exercise2",
    start_date=datetime.datetime.now() - datetime.timedelta(days=2))

task = PythonOperator(
    task_id="hello_world_task",
    python_callable=hello_world,
    dag=dag)
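One straightforward way to complete the TODO above is to pass the "@daily" preset (equivalent to the cron expression "0 0 * * *") when constructing the DAG; the sketch below reuses the imports and callable already defined in the exercise.

# Possible completion of the TODO: run the DAG once per day at midnight.
dag = DAG(
    "exercise2",
    schedule_interval="@daily",
    start_date=datetime.datetime.now() - datetime.timedelta(days=2))

A fixed start_date is generally preferable to datetime.now() for scheduled DAGs, but the dynamic value is kept here to stay close to the exercise as written.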