def build(self, dag):
    print_configuration = PythonOperator(
        task_id='print_configuration',
        python_callable=AirflowDbCleanupDagBuilder.print_configuration_function,
        provide_context=True,
        dag=dag)

    for db_object in self.database_objects:
        cleanup = PythonOperator(
            task_id='cleanup_' + str(db_object["airflow_db_model"].__name__),
            python_callable=AirflowDbCleanupDagBuilder.cleanup_function,
            params=db_object,
            provide_context=True,
            dag=dag)
        print_configuration.set_downstream(cleanup)

    return dag
def check_previous_runs(**kwargs):
    context = kwargs
    current_run_id = context['dag_run'].run_id
    current_dag_id = context['dag_run'].dag_id
    # Connect to MySQL and check for any failed runs of this DAG
    airflow_conn = MySqlHook(mysql_conn_id='deliverbi_mysql_airflow')
    l_error_count = 0
    cmd_sql = f"select count(1) from airflow.dag_run where dag_id = '{current_dag_id}' "
    cmd_sql += f"and run_id <> '{current_run_id}' and state = 'failed'"
    print(cmd_sql)
    airflow_data = airflow_conn.get_records(sql=cmd_sql)
    for row in airflow_data:
        l_error_count = int(row[0])
    print("Found Previous Errors: " + str(l_error_count))
    if l_error_count != 0:
        raise AirflowException(
            "Previous Run in Error so Failing the Current Run")


# Tasks
check_previous_run_status = PythonOperator(task_id='check_previous_run_status',
                                           provide_context=True,
                                           python_callable=check_previous_runs,
                                           dag=dag)

task1 = DummyOperator(task_id='task1', retries=2, dag=dag)

check_previous_run_status.set_downstream(task1)
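# Note (not in the original): building the SQL with f-strings works, but
# DbApiHook.get_records also accepts a `parameters` argument, so the same
# failed-run check can be written with bound parameters instead of string
# interpolation. A minimal sketch:
cmd_sql = ("select count(1) from airflow.dag_run "
           "where dag_id = %s and run_id <> %s and state = 'failed'")
rows = airflow_conn.get_records(sql=cmd_sql,
                                parameters=(current_dag_id, current_run_id))
if int(rows[0][0]) != 0:
    raise AirflowException("Previous Run in Error so Failing the Current Run")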
readmission_classifier_train_predict_operator = PythonOperator(
    task_id='readmission_classifier_train_predict',
    python_callable=readmission_classifier_train_and_predict.train_and_predict,
    dag=dag)

readmission_prob_to_likert_operator = PythonOperator(
    task_id='convert_to_likert',
    python_callable=readmission_tf_prob_to_likert.convert_to_likert,
    dag=dag)

summary_report_operator = PythonOperator(
    task_id='make_summary_report',
    python_callable=create_report_summary.create_report,
    dag=dag)

df_from_api_operator.set_downstream(structured_features_operator)
structured_features_operator.set_downstream([
    all_word2vec_clean_notes_operator,
    readmission_word2vec_clean_notes_operator,
    ner_clean_operator,
    readmission_classifier_prep_operator
])
readmission_word2vec_clean_notes_operator.set_downstream(
    readmission_word2vec_tokenize_notes_operator)
readmission_word2vec_tokenize_notes_operator.set_downstream(
    readmission_word2vec_operator)
readmission_word2vec_operator.set_downstream(readmission_one_hot_operator)
all_word2vec_clean_notes_operator.set_downstream(
    all_word2vec_tokenize_notes_operator)
all_word2vec_tokenize_notes_operator.set_downstream(all_word2vec_operator)
all_word2vec_operator.set_downstream(infected_one_hot_operator)
# But you can if you want to
one_task = PythonOperator(
    task_id="one_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={"KubernetesExecutor": {
        "image": "airflow:latest"
    }})

# Use the zip binary, which is only found in this special docker image
two_task = PythonOperator(
    task_id="two_task",
    python_callable=use_zip_binary,
    dag=dag,
    executor_config={"KubernetesExecutor": {
        "image": "airflow:latest"
    }})

# Limit resources on this operator/task
three_task = PythonOperator(task_id="three_task",
                            python_callable=print_stuff,
                            dag=dag,
                            executor_config={
                                "KubernetesExecutor": {
                                    "request_memory": "128Mi",
                                    "limit_memory": "128Mi"
                                }
                            })

start_task.set_downstream([one_task, two_task, three_task])
# endregion
# endregion

with DAG(
        dag_id='website_statistics',
        default_args=default_args,
        schedule_interval=timedelta(minutes=1)) as dag:

    # Task for download from external source
    # opr_download_json = PythonOperator(task_id='download_json',
    #                                    python_callable=download_json,
    #                                    provide_context=True)

    opr_extract_json = PythonOperator(
        task_id='load_json',
        provide_context=True,
        python_callable=load_json,
    )

    opr_transform_clickhouse = PythonOperator(
        task_id='move_to_merge_tree',
        provide_context=True,
        python_callable=move_to_merge_tree,
    )

    # opr_extract_json.set_downstream(opr_download_json)
    opr_extract_json.set_downstream(opr_transform_clickhouse)
    # opr_transform_clickhouse.set_downstream(opr_extract_json)
        'type': 'STRING'
    }, {
        'name': 'predicted_monetary',
        'type': 'FLOAT'
    }, {
        'name': 'predictions',
        'type': 'FLOAT'
    }],
    source_format="NEWLINE_DELIMITED_JSON",
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(
        PROJECT, dataset, 'predictions'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag).execute(kwargs)


t3 = PythonOperator(task_id='list_predictions_files',
                    dag=dag,
                    python_callable=do_list_predictions_files)

t4 = PythonOperator(task_id='load_to_bq',
                    dag=dag,
                    python_callable=do_load_to_bq)

# How to link them
t0_predict_cond.set_downstream([t1a, t1b])
t2.set_upstream([t1a, t1b])
t3.set_upstream([t1a, t1b])
t3.set_downstream(t4)
# returns the week day (monday, tuesday, etc.)
def get_day(**kwargs):
    print(kwargs['ti'])
    kwargs['ti'].xcom_push(key='day', value=datetime.now().weekday())


# returns the name id of the task to launch (task_for_monday, task_for_tuesday, etc.)
def branch(**kwargs):
    print(kwargs)
    return 'task_for_' + tabDays[kwargs['ti'].xcom_pull(task_ids='weekday',
                                                        key='day')]


# PythonOperator will retrieve and store into "weekday" variable the week day
get_weekday = PythonOperator(task_id='weekday',
                             python_callable=get_day,
                             provide_context=True,
                             dag=dag)

# BranchPythonOperator will use "weekday" variable, and decide which task to launch next
fork = BranchPythonOperator(task_id='branching',
                            python_callable=branch,
                            provide_context=True,
                            dag=dag)

# task 1, get the week day
get_weekday.set_downstream(fork)

# One dummy operator for each week day, all branched to the fork.
# weekday() returns 0-6, so the range must cover all seven days
# (range(0, 6) would leave Sunday without a branch target).
for day in range(0, 7):
    fork.set_downstream(
        DummyOperator(task_id='task_for_' + tabDays[day], dag=dag))
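# Note (not in the original): the snippet assumes a `tabDays` lookup list
# defined elsewhere; it just maps datetime.weekday() indices (0 = Monday)
# to the names used in the task ids, roughly:
tabDays = ['monday', 'tuesday', 'wednesday', 'thursday',
           'friday', 'saturday', 'sunday']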
DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(
    start_date=DEFAULT_DATE,
    owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()
    return


# DAG tests that tasks ignore all dependencies
dag1 = DAG(dag_id='test_run_ignores_all_dependencies',
           default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(
    task_id='test_run_dependency_task',
    python_callable=fail,
    dag=dag1,)
dag1_task2 = PythonOperator(
    task_id='test_run_dependent_task',
    python_callable=success,
    provide_context=True,
    dag=dag1,)
dag1_task1.set_downstream(dag1_task2)
    python_callable=get_endpoint,
    op_args=[e, SAVE_PATH, BASE_URL, API_KEYS],
)

t_branch = BranchPythonOperator(task_id=branch_task_id,
                                python_callable=row_count_branch,
                                op_args=[
                                    get_enpdpoints_task_id,
                                    file_to_gcs_task_id,
                                    zero_branch_task_id
                                ],
                                trigger_rule="all_done")

t_gcs = FileToGoogleCloudStorageOperator(
    task_id=file_to_gcs_task_id,
    google_cloud_storage_conn_id='gcs_silo',
    bucket="deanslist",
    src="{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id +
        "', key='dl_file_path') }}",
    # dst="TEST/" + endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" +
    #     get_enpdpoints_task_id + "', key='dl_file_name') }}",
    dst=endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" +
        get_enpdpoints_task_id + "', key='dl_file_name') }}",
    dag=dag)

t_zero_row = DummyOperator(task_id=zero_branch_task_id)

t2.set_upstream(t1)
t2.set_downstream(t_branch)
t_branch.set_downstream(t_gcs)
t_branch.set_downstream(t_zero_row)
    execution_timeout=timedelta(hours=1),
    op_kwargs={'valuation_date': get_valuation_date()},
    dag=dag)

export_spot_scenarios_operator = PythonOperator(
    task_id='export_spot_scenarios_task',
    python_callable=export_spot_scenarios_run,
    execution_timeout=timedelta(hours=1),
    op_kwargs={'valuation_date': get_valuation_date()},
    dag=dag)

# -----------------------------------------------------------------------------------
# Operator Dependency Relationship
# default close tasks
basic_risks_default_close_operator.set_upstream(basic_position_operator)
basic_instrument_contract_type_operator.set_downstream(basic_position_operator)
# basic_otc_company_type_operator.set_downstream(eod_classic_scenarios_operator)
basic_otc_company_type_operator.set_downstream(
    eod_spot_scenarios_by_market_default_close_operator)
basic_otc_company_type_operator.set_downstream(
    eod_counter_party_market_risk_default_close_operator)
basic_otc_company_type_operator.set_downstream(
    eod_counter_party_market_risk_by_underlyer_default_close_operator)
# eod_classic_scenarios_operator.set_upstream(basic_position_operator)
eod_spot_scenarios_by_market_default_close_operator.set_upstream(
    basic_position_operator)
eod_position_default_close_operator.set_upstream(
    basic_risks_default_close_operator)
))

opr_pause = BashOperator(task_id='pause',
                         bash_command="echo 'Paused for extraction.'")

for t, s in sources_to_extract.items():
    s['name'] = t
    s['dag'] = dag.dag_id
    opr_extract = PythonOperator(task_id=f"extract_{t}",
                                 python_callable=sources.extract_source,
                                 provide_context=True,
                                 op_kwargs=s)
    opr_extract.set_downstream(opr_pause)

# Loop through the open datasets
open_datasets = [
    f for f in os.listdir(
        f"{os.environ['AIRFLOW_HOME']}/processes/{dag.dag_id}")
    if not f.startswith('_') and f.endswith('.yml')
]
for od in open_datasets:
    od_name = od.split('.')[0]
    # An explicit Loader is required by modern PyYAML; bare yaml.load is deprecated
    od_config = yaml.load(
        open(f"{os.environ['AIRFLOW_HOME']}/processes/{dag.dag_id}/{od}"),
        Loader=yaml.SafeLoader)
    # loop through the views
    for d, v in od_config['views'].items():
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
dag8 = DAG(dag_id='test_dagrun_states_root_fail_unfinished',
           default_args=default_args)
dag8_task1 = DummyOperator(
    task_id='test_dagrun_unfinished',
    # The test will unset the task instance state after
    # running this test
    dag=dag8,
)
dag8_task2 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag8,
    python_callable=fail,
)
# You don't have to use any special KubernetesExecutor configuration if you don't want to
start_task = PythonOperator(
    task_id="start_task",
    python_callable=print_stuff,
    dag=dag
)

# But you can if you want to
one_task = PythonOperator(
    task_id="one_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci:latest"}}
)

# Use the zip binary, which is only found in this special docker image
two_task = PythonOperator(
    task_id="two_task",
    python_callable=use_zip_binary,
    dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}}
)

# Limit resources on this operator/task with node affinity & tolerations
three_task = PythonOperator(
    task_id="three_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={
        "KubernetesExecutor": {"request_memory": "128Mi",
                               "limit_memory": "128Mi",
                               "tolerations": tolerations,
                               "affinity": affinity}}
)

start_task.set_downstream([one_task, two_task, three_task])
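# Note (not in the original): `tolerations` and `affinity` are defined
# elsewhere in this example; they are plain dicts shaped like the Kubernetes
# pod spec, along these lines (the node label and values are hypothetical):
tolerations = [{
    "key": "dedicated",
    "operator": "Equal",
    "value": "airflow",
    "effect": "NoSchedule",
}]
affinity = {
    "nodeAffinity": {
        "requiredDuringSchedulingIgnoredDuringExecution": {
            "nodeSelectorTerms": [{
                "matchExpressions": [{
                    "key": "kubernetes.io/hostname",
                    "operator": "In",
                    "values": ["worker-node-1"],
                }]
            }]
        }
    }
}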
branch_b = PythonOperator(
    task_id='branch_b',
    python_callable=print_branchb,
    dag=dag)  # assign the task to its DAG


def print_branchc():
    return 'Hello branchc!'


branch_c = PythonOperator(
    task_id='branch_c',
    python_callable=print_branchc,
    dag=dag)  # assign the task to its DAG

# -------------------------------------------------------------------------------


def decide_which_path():
    if 1 > 1:  # always False, so this example always branches to branch_b
        return "branch_a"
    else:
        return "branch_b"


branch_task = BranchPythonOperator(
    task_id='run_this_first',
    python_callable=decide_which_path,
    trigger_rule="all_done",
    dag=dag)

# -------------------------------------------------------------------------------
# dependencies
# the adaptation, intermediate, and application layers all depend on branch_a
branch_task.set_downstream(branch_a)
branch_task.set_downstream(branch_b)
branch_a.set_downstream(branch_c)
        'region': region,
        'bucket': bucket
    })

# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
    task_id='batch_predicting',
    dag=dag,
    config=transform_config,
    aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30)

# Cleanup task, deletes ALL SageMaker endpoints and model artifacts.
# Uncomment clean_up_task below to clean up SageMaker endpoint resources and model artifacts.
# clean_up_task = PythonOperator(
#     task_id='clean_up',
#     dag=dag,
#     python_callable=clean_up.clean_up,
#     op_kwargs={'region': region, "bucket": bucket}
# )

init.set_downstream(sm_proc_job_task)
sm_proc_job_task.set_downstream(train_model_task)
train_model_task.set_downstream(inference_pipeline_task)
inference_pipeline_task.set_downstream(batch_transform_task)
# Uncomment the line below to enable the clean up task
# batch_transform_task.set_downstream(clean_up_task)
                 str(airflow_db_model.__name__) + "(s):")
    for entry in entries_to_delete:
        logging.info("\tEntry: " + str(entry) + ", Date: " +
                     str(entry.__dict__[str(age_check_column).split(".")[1]]))

    logging.info("Process will be Deleting " + str(len(entries_to_delete)) +
                 " " + str(airflow_db_model.__name__) + "(s)")

    if ENABLE_DELETE:
        logging.info("Performing Delete...")
        # using bulk delete
        query.delete(synchronize_session=False)
        session.commit()
        logging.info("Finished Performing Delete")
    else:
        logging.warning("You've opted to skip deleting the db entries!")

    logging.info("Finished Running Cleanup Process")


for db_object in DATABASE_OBJECTS:
    cleanup_op = PythonOperator(
        task_id='cleanup_' + str(db_object["airflow_db_model"].__name__),
        python_callable=cleanup_function,
        params=db_object,
        provide_context=True,
        dag=dag)
    print_configuration.set_downstream(cleanup_op)
    os.system(' '.join(['gsutil rm', BUCKET_LOC + filename]))
    os.system(' '.join(['gsutil cp', full_filename, BUCKET_LOC]))


def process_local():
    # recreate the draft-kings.csv file
    return 0


dag = DAG(dag_id='dk_data',
          description='Download and Process DraftKings Data',
          default_args=default_args,
          schedule_interval='0 14 * * 3')

source_to_local = PythonOperator(task_id='source_to_local',
                                 python_callable=source_to_local,
                                 dag=dag)

local_to_gs = PythonOperator(task_id='local_to_gs',
                             python_callable=local_to_gs,
                             dag=dag)

# process_local = PythonOperator(
#     task_id='process_local',
#     python_callable=process_local,
#     dag=dag)

# setting dependencies
# source_to_local.set_downstream([process_local, local_to_gs])
source_to_local.set_downstream(local_to_gs)
    'start_date': datetime(2020, 3, 3),
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG('CTK_JNJ_MASTER',
          default_args=default_args,
          dagrun_timeout=timedelta(days=1),
          description="jnj mastering starting with tokenizing",
          schedule_interval='0 1 * * *')

# 0. dummy
op_starting = DummyOperator(task_id='execute', dag=dag)

# 1. tokenize
op_tokenize = PythonOperator(task_id='tkn_daily',
                             python_callable=tkn_daily,
                             dag=dag)

# 2. brand_classification
op_brand_classification = PythonOperator(
    task_id='jnj_brand_classification',
    python_callable=jnj_brand_classification,
    dag=dag)

# 3. mastering s_sp_item
op_s_sp_item_result = PythonOperator(task_id='s_sp_item_result',
                                     python_callable=s_sp_item_result,
                                     dag=dag)

op_starting.set_downstream(op_tokenize)
op_tokenize.set_downstream(op_brand_classification)
op_brand_classification.set_downstream(op_s_sp_item_result)
# create dag and schedule a load interval every day at midnight (7am UTC)
dag = DAG('extract_and_load',
          catchup=False,
          default_args=default_args,
          schedule_interval=timedelta(days=1),
          max_active_runs=1)

# task to create table if it does not exist
task_create_table = PostgresOperator(
    task_id='task_create_table',
    sql='./extract_load_pipeline/sql/create_postgres_table.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

# extracts bq to a gcs bucket as csv
task_bq_to_gcs = PythonOperator(
    task_id='task_bq_to_gcs',
    python_callable=bq_to_gcs,
    provide_context=True,
    op_kwargs={'start_date': default_args['start_date']},
    dag=dag)

# loads postgres table from csv
task_gcs_to_postgres = PythonOperator(task_id='task_gcs_to_postgres',
                                      python_callable=load_table,
                                      provide_context=True,
                                      dag=dag)

task_create_table.set_downstream(task_bq_to_gcs)
task_bq_to_gcs.set_downstream(task_gcs_to_postgres)
            'Content-Type': 'application/json'
        }).json()['log']
        for line in lines:
            logging.info(line)
        time.sleep(10)
        if statement_status == 'success':
            # curl -X DELETE localhost:8998/batches/53
            requests.delete(statement_url,
                            headers={'Content-Type': 'application/json'})
            final_statement_status = 'success'
            return

    if final_statement_status == 'dead':
        # `lines` is already the parsed log list, so join it for the message
        logging.info('Statement exception: ' + '\n'.join(lines))
        for trace in statement_response.json()['output']['traceback']:
            logging.info(trace)
        raise ValueError('Final Statement Status: ' + final_statement_status)

    logging.info('Final Statement Status: ' + final_statement_status)


generatePerfilInput = PythonOperator(task_id='generatePerfilInput',
                                     python_callable=generate_perfil_input,
                                     dag=dag)

launchPerfilTraining = PythonOperator(task_id='launchPerfilTraining',
                                      python_callable=launch_perfil_training,
                                      dag=dag)

generatePerfilInput.set_downstream(launchPerfilTraining)
# generatePerfilInput >> launchPerfilTraining
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 0,  # Defaults to not retrying: fails if the first attempt fails.
    'retry_delay': timedelta(minutes=5)  # Not used if retries is left at 0, but retries can be overridden
}

# default pip airflow install is 1.7
# Using the Airflow 1.8 context manager feature,
# it would be `with DAG() as dag:` and all the operators
# in that scope would have dag=dag by default.
dag = DAG('pysparkexec', default_args=default_args)

adapt_model = PythonOperator(task_id='adapt_model',
                             dag=dag,
                             python_callable=sophia_air.adapt_model,
                             provide_context=False)

create_env = BashOperator(task_id='create_env',
                          dag=dag,
                          bash_command='sleep 5 && echo "slept"')

run_it = PythonOperator(task_id='run_it',
                        dag=dag,
                        python_callable=sophia_air.run_it)

# in 1.8 it will be
# get_git >> run_it << get_cluster
adapt_model.set_downstream(run_it)
create_env.set_downstream(run_it)
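# Note (not in the original): on Airflow 1.8+ the same fan-in can be declared
# with the bitshift syntax the comment above alludes to, written here with
# this DAG's actual task names (the comment's get_git/get_cluster names don't
# appear in the snippet):
adapt_model >> run_it << create_env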
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

train_task = PythonOperator(
    task_id='train',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

model_task = PythonOperator(
    task_id='model',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

deploy_task = PythonOperator(
    task_id='deploy',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

# set the dependencies between tasks
init.set_downstream(process_task)
process_task.set_downstream(train_task)
train_task.set_downstream(model_task)
model_task.set_downstream(deploy_task)
task_id="start_task", python_callable=print_stuff, dag=dag, executor_config={ "KubernetesExecutor": { "annotations": {"test": "annotation"} } } ) # You can mount volume or secret to the worker pod second_task = PythonOperator( task_id="four_task", python_callable=test_volume_mount, dag=dag, executor_config={ "KubernetesExecutor": { "volumes": [ { "name": "test-volume", "hostPath": {"path": "/tmp/"}, }, ], "volume_mounts": [ { "mountPath": "/foo/", "name": "test-volume", }, ] } } ) start_task.set_downstream(second_task)
["/Users/ravimuthyala/AirflowSparkTestCode/receipts.csv"], 'driver_memory': '1g', 'executor_cores': 1, 'num_executors': 1, 'executor_memory': '1g' } spark_submit_operator = SparkSubmitOperator(task_id='Spark_Scala_Submit_Job', dag=dag, **spark_config) emailNotify = EmailOperator(task_id='email_notification', to='*****@*****.**', subject='Spark Submit Job Alert', html_content='Airflow Spark Submit Job Done', dag=dag) t1Failed = EmailOperator(dag=dag, trigger_rule=TriggerRule.ONE_FAILED, task_id="SparkJobFailed", to=["*****@*****.**"], subject="Spark job Failed", html_content='<h3>Spark job has failed</h3>') python_operator.set_downstream(spark_submit_operator) spark_submit_operator.set_downstream(emailNotify) t1Failed.set_upstream([spark_submit_operator]) if __name__ == '__main__': dag.cli()
# [START howto_operator_python]
def print_context(ds, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


def dag_run(context, dag_run_obj):
    print("[dag_run] %s" % dag_run_obj)
    return dag_run_obj


run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

trigger_hdfs = TriggerDagRunOperator2(
    task_id="trigger_the_dag",
    trigger_dag_id="example_python_operator",
    python_callable=dag_run,
    execution_date="{{ execution_date }}",
    dag=dag,
)

run_this.set_downstream(trigger_hdfs)
def create_subdag(dag_parent, label, team):
    dag_id_child = "%s.%s" % (dag_parent.dag_id, label)
    schema = team["schema"][label]
    dag = DAG(
        dag_id=dag_id_child,
        default_args=dag_parent.default_args,
        schedule_interval=dag_parent.schedule_interval,
    )

    # Find the corresponding operator and its parameters
    fn, operator_params = find_label_operator(schema["qos"])

    # Label is declared but there is no node in Neo4j
    count = team["labels"][label]
    if not count:
        DummyOperator(task_id="{}.notask".format(label), dag=dag)
        return dag, operator_params.get("dependencies")

    if count < 100:
        length = count
    else:
        frac, length = math.modf(count / 100)
        if frac:
            length += 1

    chunks = {
        "{}.chunk.{}".format(label, i): i
        for i in range(0, count, int(length))
    }
    tasks = []
    for name, skip in chunks.items():
        # All custom operators share these parameters
        params = {
            "app": app,
            "team": team,
            "label": label,
            "skip": skip,
            "length": length,
            **operator_params,
        }
        tasks.append(fn(task_id=name, dag=dag, params=params))

    with dag:
        delete_redis_avg_op = PythonOperator(
            task_id="{}.del_redis_average".format(label),
            provide_context=True,
            python_callable=delete_redis_avg,
            params={"app": app, "team": team, "label": label},
        )
        before_subdag_task = BeforeSubdagOperator(
            task_id="{}.before_subdag".format(label),
            params={"app": app, "team": team, "label": label, "count": count},
        )
        after_subdag_task = AfterSubdagOperator(
            task_id="{}.after_subdag".format(label),
            params={"app": app, "team": team, "label": label},
        )
        after_chunks_task = DummyOperator(task_id="{}.dummy".format(label))
        average_op = AverageOperator(
            task_id="{}.average".format(label),
            params={"app": app, "team": team, "label": label},
        )
        daily_worst_op = DailyWorstOperator(
            task_id="{}.daily_worst".format(label),
            params={"app": app, "team": team, "label": label},
        )
        before_subdag_task.set_downstream(delete_redis_avg_op)
        delete_redis_avg_op.set_downstream(tasks)
        after_chunks_task.set_upstream(tasks)
        after_chunks_task.set_downstream([average_op, daily_worst_op])
        after_subdag_task.set_upstream([average_op, daily_worst_op])

    return dag, operator_params.get("dependencies")
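# Note (not in the original): a quick worked example of the chunk arithmetic
# above, for a hypothetical count of 250 nodes. math.modf splits count / 100
# into fractional and integer parts; any remainder rounds the per-chunk
# length up, which keeps the number of chunks near 100.
import math

count = 250
frac, length = math.modf(count / 100)  # modf(2.5) -> (0.5, 2.0)
if frac:
    length += 1                        # length == 3.0
skips = list(range(0, count, int(length)))
print(len(skips))                      # 84 chunks, each covering 3 nodes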
execution_dates = list(datetime_range(start=start_date, end=end_date))

for i, ex_date in enumerate(execution_dates):
    ed = ex_date.strftime('%Y-%m-%d')
    ep_template = {'sdt': ed}
    get_enpdpoints_task_id = "get_{0}_dl_endpoint_{1}".format(
        endpoint_name, ed)
    file_to_gcs_task_id = "{0}_{1}_to_gcs".format(endpoint_name, ed)

    t2 = PythonOperator(task_id=get_enpdpoints_task_id,
                        python_callable=get_endpoint_with_dates,
                        op_args=[SAVE_PATH, BASE_URL, API_KEYS],
                        templates_dict=ep_template)

    t3 = FileToGoogleCloudStorageOperator(
        task_id=file_to_gcs_task_id,
        google_cloud_storage_conn_id='gcs_silo',
        bucket="deanslist",
        src="{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id +
            "', key='dl_file_path') }}",
        dst="TEST/" + endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" +
            get_enpdpoints_task_id + "', key='dl_file_name') }}",
        dag=dag)

    t2.set_upstream(t1)
    t2.set_downstream(t3)
task_id="print_task", provide_context=True, python_callable=print_context, dag=dag, ) def sleep(seconds): time.sleep(seconds) def make_sleep_task(task_name, dag): seconds = random.randint(1, 3) task = PythonOperator( task_id=task_name, python_callable=sleep, op_kwargs={"seconds": float(seconds) / 10}, dag=dag, ) return task # print_task > sleep_task first_sleep = make_sleep_task("first_sleep", dag) last_sleep = make_sleep_task("last_sleep", dag) print_task.set_downstream(last_sleep) # sleep_task > print_task first_sleep >> print_task
)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(task_id='test_subdag_fail',
                               dag=subdag7,
                               python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(task_id='test_subdag_dummy_2', dag=subdag7)
dag7_subdag1 = SubDagOperator(task_id='subdag', dag=dag7, subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
dag8 = DAG(dag_id='test_dagrun_states_root_fail_unfinished',
           default_args=default_args)
dag8_task1 = DummyOperator(
    task_id='test_dagrun_unfinished',
    # The test will unset the task instance state after
    # running this test
    dag=dag8,
)
dag8_task2 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag8,
    python_callable=fail,
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('twitter',
          default_args=default_args,
          schedule_interval=timedelta(minutes=5))

twitter_transformer = TwitterTransformer()
twitter_loader = TwitterLoader()

analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    provide_context=True,
    python_callable=twitter_transformer.process,
    dag=dag)

transfer_to_elastic = PythonOperator(
    task_id='transfer_to_elastic',
    provide_context=True,
    python_callable=twitter_loader.load_into_elastic,
    dag=dag)

analyze_tweets.set_downstream(transfer_to_elastic)
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('bollinger',
          default_args=default_args,
          schedule_interval="*/30 6-14 * * 1-5")

t1 = PythonOperator(
    task_id='bb',
    python_callable=calc_bb.get_spy_data,
    # schedule_interval="0 13 * * *",
    provide_context=True,
    dag=dag,
)

t2 = PythonOperator(
    task_id='plot_bb',
    python_callable=calc_bb.plot,
    dag=dag,
)

t1.set_downstream(t2)
<tr><td><b> Task ID: </b></td><td>{{ task_instance.task_id }}</td></tr>
<tr><td><b> Execution Date: </b></td><td>{{ task_instance.execution_date }}</td></tr>
<tr><td><b> Start Date: </b></td><td>{{ task_instance.start_date }}</td></tr>
<tr><td><b> End Date: </b></td><td>{{ task_instance.end_date }}</td></tr>
<tr><td><b> Host Name: </b></td><td>{{ task_instance.hostname }}</td></tr>
<tr><td><b> Unix Name: </b></td><td>{{ task_instance.unixname }}</td></tr>
<tr><td><b> Job ID: </b></td><td>{{ task_instance.job_id }}</td></tr>
<tr><td><b> Queued Date Time: </b></td><td>{{ task_instance.queued_dttm }}</td></tr>
<tr><td><b> Log URL: </b></td><td><a href="{{ task_instance.log_url }}">{{ task_instance.log_url }}</a></td></tr>
</table>
<h2>Processes Killed</h2>
<ul>
{% for process_killed in task_instance.xcom_pull(task_ids='kill_halted_tasks', key='kill_halted_tasks.processes_to_kill') %}
    <li>Process {{ loop.index }}</li>
    <ul>
    {% for key, value in process_killed.items() %}
        <li>{{ key }}: {{ value }}</li>
    {% endfor %}
    </ul>
{% endfor %}
</ul>
</body>
</html>
""",
    dag=dag)

kill_halted_tasks.set_downstream(email_or_not_branch)
email_or_not_branch.set_downstream(send_processes_killed_email)
o = PythonOperator(
    task_id='United_Kingdom',
    provide_context=True,
    python_callable=UK,
    dag=dag,
)

p = PythonOperator(
    task_id='Generate_Heat_Map',
    provide_context=True,
    python_callable=map,
    dag=dag,
)

#---------------------------#
#       Dependencies        #
#---------------------------#

# a = root
a.set_downstream(b)
# b = bottleneck to three threads
b.set_downstream(c)
c.set_downstream([d, e, f, g, h, i, j, k, l, m, n, o])
p.set_upstream([d, e, f, g, h, i, j, k, l, m, n, o])
    dag=cell_image_analysis_2channels_dag,
)


def prepare_cellprofiler_csv(ds, **kwargs):
    """Prepare the cellprofiler csv based on the args"""
    df = get_cell_images_df(**kwargs)
    kwargs['ti'].xcom_push(key='cell_images_df', value=df)
    return


prepare_cellprofiler_csv_op = PythonOperator(
    task_id='prepare_cellprofiler_csv',
    provide_context=True,
    python_callable=prepare_cellprofiler_csv,
    dag=cell_image_analysis_2channels_dag)

prepare_cellprofiler_csv_op.set_downstream(image_conversion_dag)

cellprofiler_tasks = cell_image_analysis_generate_cellprofiler_task(
    cell_image_analysis_2channels_dag)
cellprofiler_branch_tasks = cell_image_analysis_generate_decide_run_cellprofiler(
    cell_image_analysis_2channels_dag)

image_conversion_dag.set_downstream(cellprofiler_branch_tasks)
cell_image_analysis_no_images_to_run_op.set_upstream(cellprofiler_branch_tasks)

for idx, cellprofiler_branch_task in enumerate(cellprofiler_branch_tasks):
    cellprofiler_branch_task.set_downstream(cellprofiler_tasks[idx])

cell_image_analysis_combine_cellprofiler_csvs.set_upstream(cellprofiler_tasks)
                          'service_name': service_name,
                          'machine_service_name': machine_service_name
                      },
                      on_failure_callback=notify,
                      on_retry_callback=notify,
                      on_success_callback=notify,
                      dag=dag)

    service_tasks.append(get_task)

    #: join_council_districts must run before get_task
    get_task.set_upstream(create_prod_files)

    if i == 'pothole':
        #: get_task must run before sonar potholes
        get_task.set_downstream(create_potholes_sonar)

filename = conf['prod_data_dir'] + "/get_it_done_*.csv"
files = [os.path.basename(x) for x in glob.glob(filename)]

for index, file_ in enumerate(files):
    file_name = file_.split('.')[0]
    name_parts = file_name.split('_')
    task_name = '_'.join(name_parts[3:-2])
    md_name = '-'.join(name_parts[3:-2])

    #: Upload prod gid file to S3
    upload_task = S3FileTransferOperator(
        task_id='upload_' + task_name,
        source_base_path=conf['prod_data_dir'],
        source_key='get_it_done_{}_requests_datasd.csv'.format(