def __init__(self, component_name, task_id, parent_dag, input_dict,
             output_dict, exec_properties, driver_options, driver_class,
             executor_class, additional_pipeline_args,
             metadata_connection_config, logger_config):
    super(_TfxWorker, self).__init__(
        dag_id=task_id,
        schedule_interval=None,
        start_date=parent_dag.start_date,
        user_defined_filters={'b64encode': base64.b64encode})
    adaptor = airflow_adapter.AirflowAdapter(
        component_name=component_name,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        driver_options=driver_options,
        driver_class=driver_class,
        executor_class=executor_class,
        additional_pipeline_args=additional_pipeline_args,
        metadata_connection_config=metadata_connection_config,
        logger_config=logger_config)
    # Before the executor runs, check if the artifact already exists.
    checkcache_op = python_operator.BranchPythonOperator(
        task_id=task_id + '.checkcache',
        provide_context=True,
        python_callable=adaptor.check_cache_and_maybe_prepare_execution,
        op_kwargs={
            'uncached_branch': task_id + '.exec',
            'cached_branch': task_id + '.noop_sink',
        },
        dag=self)
    tfx_op = python_operator.PythonOperator(
        task_id=task_id + '.exec',
        provide_context=True,
        python_callable=adaptor.python_exec,
        op_kwargs={
            'cache_task_name': task_id + '.checkcache',
        },
        dag=self)
    noop_sink_op = dummy_operator.DummyOperator(
        task_id=task_id + '.noop_sink', dag=self)
    publishexec_op = python_operator.PythonOperator(
        task_id=task_id + '.publishexec',
        provide_context=True,
        python_callable=adaptor.publish_exec,
        op_kwargs={
            'cache_task_name': task_id + '.checkcache',
            'exec_task_name': task_id + '.exec',
        },
        dag=self)
    tfx_op.set_upstream(checkcache_op)
    publishexec_op.set_upstream(tfx_op)
    noop_sink_op.set_upstream(checkcache_op)
def add_export_task(toggle, task_id, python_callable, dependencies=None):
    if toggle:
        def python_callable_with_fallback(**kwargs):
            # Try each provider URI in turn; fall back to the next one on failure.
            for index, provider_uri in enumerate(provider_uris):
                kwargs['provider_uri'] = provider_uri
                try:
                    python_callable(**kwargs)
                    break
                except Exception as e:
                    if index < (len(provider_uris) - 1):
                        logging.exception('An exception occurred. Trying another uri')
                    else:
                        raise e

        operator = python_operator.PythonOperator(
            task_id=task_id,
            python_callable=python_callable_with_fallback,
            provide_context=True,
            execution_timeout=timedelta(hours=15),
            dag=dag,
        )
        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                if dependency is not None:
                    dependency >> operator
        return operator
    else:
        return None
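# A minimal usage sketch for add_export_task above. The module-level `dag` and
# `provider_uris` come from the surrounding DAG file; the toggles and callables
# named here are hypothetical and only illustrate how `dependencies` chains tasks.
export_blocks_task = add_export_task(
    export_blocks_toggle, 'export_blocks', export_blocks_callable)
export_traces_task = add_export_task(
    export_traces_toggle, 'export_traces', export_traces_callable,
    dependencies=[export_blocks_task])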
def create_python_operator(dag, workflow, job):
    from airflow.operators import python_operator
    return python_operator.PythonOperator(
        dag=dag,
        task_id=job.id,
        python_callable=callable_factory(job, workflow.dt_as_datetime),
        retries=job.retry_count,
        retry_delay=timedelta(seconds=job.retry_pause_sec),
        provide_context=True)
def call(self, dag):
    t_up = self.f_task(dag)
    t = python_operator.PythonOperator(
        task_id=self.id,
        python_callable=self.run,
        provide_context=True,
        templates_dict={"result": fairflow.utils.xcom_result(t_up)},
        dag=dag)
    t.set_upstream(t_up)
    return t
def call(self, dag):
    tasks = [fop(dag) for fop in self.fops]
    t = python_operator.PythonOperator(
        task_id=self.id,
        python_callable=self.run,
        provide_context=True,
        templates_dict={
            ut.task_id: utils.xcom_result(ut) for ut in tasks
        },
        dag=dag)
    t.set_upstream(tasks)
    return t
def call(self, dag):
    # Instantiate all upstream tasks.
    tasks_upstream = [fop(dag) for fop in self.fops_upstream]
    t_sum = python_operator.PythonOperator(
        task_id=self.id,
        python_callable=self.run,
        provide_context=True,
        templates_dict={
            ut.task_id: fairflow.utils.xcom_result(ut) for ut in tasks_upstream
        },
        dag=dag)
    t_sum.set_upstream(tasks_upstream)
    return t_sum
def load_subdag(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='{0}.{1}'.format(parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval="@daily",
    )
    with dag_subdag:
        for i in range(3):
            t = python_operator.PythonOperator(
                task_id='load_subdag_{0}'.format(i),
                python_callable=sleepFortask,
                op_kwargs={'key1': i},
                dag=dag_subdag)
    return dag_subdag
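# One way to attach the subdag factory above to a parent DAG; a sketch only,
# assuming a module-level `parent_dag` and `default_args`. SubDagOperator is the
# stock Airflow 1.x operator, and the subdag id produced by load_subdag matches
# the required '<parent_dag_id>.<task_id>' convention.
from airflow.operators import subdag_operator

load_tasks = subdag_operator.SubDagOperator(
    task_id='load_tasks',
    subdag=load_subdag(parent_dag.dag_id, 'load_tasks', default_args),
    dag=parent_dag,
)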
def _add_update_airflow_variable_task(
        dag: models.DAG) -> python_operator.PythonOperator:
    """Adds an Airflow variable with the new dataset id.

    Args:
        dag: The DAG that the task needs to be added to.

    Returns:
        PythonOperator used to update the Airflow variable within a DAG.
    """
    return python_operator.PythonOperator(
        task_id=_UPDATE_AIRFLOW_VARS_TASK,
        python_callable=_set_model_var,
        provide_context=True,
        dag=dag,
    )
def add_export_task(toggle, task_id, python_callable, dependencies=None):
    if toggle:
        operator = python_operator.PythonOperator(
            task_id=task_id,
            python_callable=python_callable,
            provide_context=True,
            execution_timeout=timedelta(hours=15),
            dag=dag,
        )
        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                if dependency is not None:
                    dependency >> operator
        return operator
    else:
        return None
def build_python_operator(operator_ref, dag_ref):
    """
    Builds a DAG operator of type: PythonOperator.

    Args:
        operator_ref (dict): the definition of the operator, including its
            task_id, function_name, and function_def source lines
        dag_ref (DAG): the reference to the dag to associate this operator
    """
    dynamic_func = {}
    # Execute the serialized function definition so the callable becomes
    # available in the dynamic_func namespace.
    exec("\n".join(operator_ref['function_def']), dynamic_func)
    op = python_operator.PythonOperator(
        task_id=operator_ref['task_id'],
        python_callable=dynamic_func[operator_ref['function_name']],
        dag=dag_ref)
    return op
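# A hypothetical operator_ref payload for build_python_operator above; the keys
# mirror the ones the function reads, and `dag` is assumed to be an existing DAG
# in the calling module.
example_operator_ref = {
    'task_id': 'say_hello',
    'function_name': 'say_hello',
    'function_def': [
        'def say_hello():',
        '    print("Hello from a dynamically defined callable")',
    ],
}
hello_op = build_python_operator(example_operator_ref, dag)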
def call(self, dag):
    """Instantiate upstream tasks, this task and set dependencies.

    Returns:
        task
    """
    # Instantiate tasks for running the different models by calling their
    # FOperators on the current `dag`. Notice that we do not know about the
    # models' upstream dependencies!
    model_tasks = [f(dag) for f in self.fops_models]
    t = python_operator.PythonOperator(
        task_id=self.__class__.__name__,
        python_callable=self.compare,
        provide_context=True,
        templates_dict={
            "model_taskids": [mt.task_id for mt in model_tasks]
        },
        dag=dag)
    t.set_upstream(model_tasks)
    return t
def add_save_checkpoint_tasks(dependencies=None):
    def save_checkpoint(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            local_path = os.path.join(tempdir, "checkpoint.txt")
            remote_path = "checkpoint/block_date={block_date}/load_complete_checkpoint.txt".format(
                block_date=execution_date.strftime("%Y-%m-%d"))
            # Create an empty marker file and upload it to GCS.
            open(local_path, mode='a').close()
            upload_to_gcs(
                gcs_hook=GoogleCloudStorageHook(
                    google_cloud_storage_conn_id="google_cloud_default"),
                bucket=output_bucket,
                object=remote_path,
                filename=local_path)

    save_checkpoint_task = python_operator.PythonOperator(
        task_id='save_checkpoint',
        python_callable=save_checkpoint,
        provide_context=True,
        execution_timeout=timedelta(hours=1),
        dag=dag,
    )
    if dependencies is not None and len(dependencies) > 0:
        for dependency in dependencies:
            dependency >> save_checkpoint_task
    return save_checkpoint_task
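# Example wiring for the helper above, assuming export tasks like the
# hypothetical `export_blocks_task` exist earlier in the same DAG file.
checkpoint_task = add_save_checkpoint_tasks(dependencies=[export_blocks_task])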
}

# Main Dataflow task that will process and load the input delimited file.
# TODO: Specify the type of operator we need to call to invoke DataFlow
dataflow_task = dataflow_operator.DataFlowPythonOperator(
    task_id="process-delimited-and-push",
    py_file=DATAFLOW_FILE,
    options=job_args)

# Here we create two conditional tasks, one of which will be executed
# based on whether the dataflow_task was a success or a failure.
success_move_task = python_operator.PythonOperator(
    task_id='success-move-to-completion',
    python_callable=move_to_completion_bucket,
    # A success_tag is used to move the input file to a success-prefixed folder.
    op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_SUCCESS)

failure_move_task = python_operator.PythonOperator(
    task_id='failure-move-to-completion',
    python_callable=move_to_completion_bucket,
    # A failure_tag is used to move the input file to a failure-prefixed folder.
    op_args=[COMPLETION_BUCKET, FAILURE_TAG],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_FAILED)
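# The snippet does not show the dependency wiring; given the trigger rules above,
# one plausible arrangement is to hang both move tasks off the Dataflow task so
# exactly one of them runs.
dataflow_task >> success_move_task
dataflow_task >> failure_move_task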
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        'composer_sample_simple_greeting',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_simple_define_dag_airflow_1]
    # [START composer_simple_operators_airflow_1]
    def greeting():
        import logging
        logging.info('Hello World!')

    # An instance of an operator is called a task. In this case, the
    # hello_python task calls the "greeting" Python function.
    hello_python = python_operator.PythonOperator(
        task_id='hello',
        python_callable=greeting)

    # Likewise, the goodbye_bash task calls a Bash script.
    goodbye_bash = bash_operator.BashOperator(
        task_id='bye',
        bash_command='echo Goodbye.')
    # [END composer_simple_operators_airflow_1]

    # [START composer_simple_relationships_airflow_1]
    # Define the order in which the tasks complete by using the >> and <<
    # operators. In this example, hello_python executes before goodbye_bash.
    hello_python >> goodbye_bash
    # [END composer_simple_relationships_airflow_1]
# [END composer_simple_airflow_1]
    'start_date': datetime.datetime(2021, 1, 5, 8, 0, 0),
}

dag = airflow.DAG(
    'synth_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval=datetime.timedelta(minutes=15))

def call_synth_info_api():
    url = "https://synthinfo-ue.a.run.app/upload"
    r = requests.get(url)
    print(r)

synthInfoCall = python_operator.PythonOperator(
    task_id="call_the_api",
    python_callable=call_synth_info_api,
    dag=dag)

synthGCStoBQSync = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='synth-info',
    source_objects=['synthinfo.csv'],
    schema_fields=[{
        "mode": "NULLABLE",
        "name": "address",
        "type": "STRING"
    }, {
        "mode": "NULLABLE",
        "name": "collateralToken",
        "type": "STRING"
    }, {
        "mode": "NULLABLE",
'PRZ_{{dag_run.conf["name"][:dag_run.conf["name"].rfind("/")]}}.{{"_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:])}}', 'fields': g_fields, 'load_dt': '{{ dag_run.conf["bqTimestamp"]}}', 'op_dict': g_operations_dict } # Main Dataflow task TSK_dataflow_file_ingestion = dataflow_operator.DataFlowPythonOperator( task_id="tsk-dataflow-file-ingestion", py_file=DATAFLOW_FILE, options=job_args) # Upon Dataflow task success the TSK_move_into_arc_bucket starts TSK_move_into_arc_bucket = python_operator.PythonOperator( task_id='TSK_move_into_arc_bucket', python_callable=DPLF_move_into_arc_bucket, op_args=[g_output_bucket], provide_context=True, trigger_rule=TriggerRule.ALL_SUCCESS) # Upon Dataflow task failure the TSK_move_into_inv_bucket starts TSK_move_into_inv_bucket = python_operator.PythonOperator( task_id='TSK_move_into_inv_bucket', python_callable=DPLF_move_into_inv_bucket, op_args=[g_failed_bucket], provide_context=True, trigger_rule=TriggerRule.ONE_FAILED) DPLF_ConsistencyCheck = python_operator.PythonOperator( task_id='DPLF_ConsistencyCheck', python_callable=DPLF_ConsistencyCheck, provide_context=True)
from airflow.operators import bash_operator
from airflow.operators import python_operator

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_dag_args = {'start_date': yesterday}

with models.DAG('running_python_and_bash_operator',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    def hello_world():
        print('Hello World!')
        return 1

    def greeting():
        print('Greetings from GCP! Happy shopping.')
        return 'Greeting successfully printed.'

    hello_world_greeting = python_operator.PythonOperator(
        task_id='python_1',
        python_callable=hello_world)

    sales_greeting = python_operator.PythonOperator(
        task_id='python_2',
        python_callable=greeting)

    bash_greeting = bash_operator.BashOperator(
        task_id='bye_bash',
        bash_command='echo Goodbye! Hope to see you soon.')

    hello_world_greeting >> sales_greeting >> bash_greeting
def task_completion(status, **kwargs):
    if status == SUCCESS_TAG:
        logging.info('successfully processed minutes document %s',
                     kwargs['dag_run'].conf['event_date'])
    else:
        logging.info('failure in processing minutes document %s',
                     kwargs['dag_run'].conf['event_date'])

with models.DAG(dag_id='MinutesProcessing',
                description='A Dag Triggering Minutes Processing Job',
                schedule_interval=None,
                default_args=DEFAULT_DAG_ARGS) as dag:
    # Args required for the Dataflow job.
    downloadminutes = python_operator.PythonOperator(
        task_id='downloadminutes',
        python_callable=download_minutes,
        op_args=[GCP_BUCKET, TARGET_EVENT],
        provide_context=True)

    # Use a template to pull the downloaded file path from XCom.
    job_args = {
        'input': "{{ task_instance.xcom_pull(task_ids='downloadminutes') }}",
        'output': OUTPUT_FILE_PATH
    }

    dataflow_task = dataflow_operator.DataflowTemplateOperator(
        template=DATAFLOW_MINUTES_TEMPLATE,
        task_id="processminutes",
        parameters=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
job_args = {
    "input": 'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
    "output": models.Variable.get("bq_output_table"),
    "fields": models.Variable.get("input_field_names"),
}

# Main Dataflow task that will process and load the input delimited file.
dataflow_task = dataflow_operator.DataFlowPythonOperator(
    task_id="process-data",
    py_file=DATAFLOW_FILE,
    options=job_args)

# Trigger on success.
success_move_task = python_operator.PythonOperator(
    task_id="success-move-to-completion",
    python_callable=move_to_completion_bucket,
    op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_SUCCESS,
)

# Trigger on failure.
failure_move_task = python_operator.PythonOperator(
    task_id="failure-move-to-completion",
    python_callable=move_to_completion_bucket,
    op_args=[COMPLETION_BUCKET, FAILURE_TAG],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_FAILED,
)

# After moving the file, send an email or other notification about success or failure.
success_message = "Successfully processed the latest file, moved to gs://{}.".format(
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(1),
}

dag = DAG(
    'python_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_dag = dummy_operator.DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

def python_greeting(**kwargs):
    context = kwargs
    print("Dag: ", context['dag_run'].dag_id)
    print("Task: ", context['task'].task_id)
    print("Current Date Time: ", datetime.datetime.now())
    print('Hello Python!')

python_dag = python_operator.PythonOperator(
    task_id='python_func_call',
    python_callable=python_greeting,
    provide_context=True,
    default_args=default_args,
    dag=dag,
)

start_dag.set_downstream(python_dag)
default_args = {'start_date': YESTERDAY}

dag = airflow.DAG('simple_workflow_dag',
                  default_args=default_args,
                  schedule_interval=None)

bash_operator_task = bash_operator.BashOperator(
    task_id='bash_operator_example_task',
    bash_command='echo "Hello from Airflow Bash Operator"',
    dag=dag)

def python_operator_func():
    print("Hello from Airflow Python Operator")

python_operator_task = python_operator.PythonOperator(
    task_id='python_operator_example_task',
    python_callable=python_operator_func,
    dag=dag)

kubernetes_pod_operator_task = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='k8s_pod_operator_example_task',
    name='k8s_pod_example',
    namespace='default',
    image='bash',
    cmds=['echo'],
    arguments=['"Hello from Airflow Kubernetes Pod Operator"'],
    dag=dag)
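# The original snippet leaves the three example tasks independent; chaining them
# is optional and shown here only as an illustration.
bash_operator_task >> python_operator_task >> kubernetes_pod_operator_task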
}

# Define a DAG (directed acyclic graph) of tasks. Tasks created below are
# attached to it explicitly via the dag= argument.
dag = DAG(
    'cathay_download_open_quiz',
    default_args=default_dag_args,
    # schedule_interval=timedelta(hours=6),
    # schedule_interval='@daily',
    schedule_interval='@once',
    is_paused_upon_creation=False)

Q1_python = python_operator.PythonOperator(
    task_id='Question_1',
    python_callable=Question_1,
    dag=dag,
)

Q2_Q3_python = python_operator.PythonOperator(
    task_id='Question_2__Question_3',
    python_callable=Question_2__Question_3,
    dag=dag,
)

Q4_python = python_operator.PythonOperator(
    task_id='Question_4',
    python_callable=Question_4,
    dag=dag,
)
""" Simple DAG for using Airflow """ import datetime import logging from airflow import models from airflow.operators import bash_operator from airflow.operators import python_operator DEFAULT_DAG_ARGS = {'start_date': datetime.datetime(2018, 1, 1)} with models.DAG('composer_sample_greeting', schedule_interval=datetime.timedelta(days=1), default_args=DEFAULT_DAG_ARGS) as dag: def _hello_python(): """ A method here """ logging.info('Hello World!') HELLO_PYTHON = python_operator.PythonOperator( task_id='HELLO_PYTHON', python_callable=_hello_python) GOODBYE_BASH = bash_operator.BashOperator(task_id='GOODBYE_BASH', bash_command='echo Goodbye') HELLO_PYTHON >> GOODBYE_BASH
import datetime

from scipy import stats

from airflow import models
from airflow.operators import python_operator

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_dag_args = {
    'start_date': yesterday,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=2)
}

with models.DAG('finding_the_most_common_element',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    def print_most_common_number():
        num = stats.mode(["9", "5", "2", "5", "1", "6"])
        print(num)
        return 'Successfully printed most common element!'

    printing_most_common_element = python_operator.PythonOperator(
        task_id='most_common_number',
        python_callable=print_most_common_number)

    printing_most_common_element
    'corrected_dataset_id',
    'corrected_table_name',
    'commitments_table_name',
    'enable_cud_cost_attribution']

ENV_VARS = get_env_variables(KEY_LIST)

# Create temp tables for each of the three queries.
ENV_VARS['distribute_commitments_table'] = 'temp_distribute_commitments_table'
ENV_VARS['project_label_credit_breakout_table'] = 'temp_project_label_credit_data_table'
ENV_VARS['temp_commitments_table_name'] = 'temp_commitments_table'

# Convert string to bool because environment variables are strings.
ENV_VARS['enable_cud_cost_attribution'] = (
    ENV_VARS['enable_cud_cost_attribution'].lower() == 'true'
)

bq_client = bigquery.Client()

FORMAT_COMMITMENT_TABLE = python_operator.PythonOperator(
    task_id='format_commitment_table',
    python_callable=format_commitment_table,
    op_kwargs={'env_vars': ENV_VARS}
)

PROJECT_LABEL_CREDIT_QUERY = python_operator.PythonOperator(
    task_id='project_label_credit_query',
    python_callable=project_label_credit,
    op_kwargs={'bq_client': bq_client, 'env_vars': ENV_VARS}
)

DISTRIBUTE_COMMITMENTS_QUERY = python_operator.PythonOperator(
    task_id='distribute_commitments',
    python_callable=distribute_commitments,
    op_kwargs={'bq_client': bq_client, 'env_vars': ENV_VARS})

BILLING_OUTPUT_QUERY = python_operator.PythonOperator(
    'depends_on_past': False,
    'retries': 0,
    'provide_context': True,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(1),
}

slack_dag = DAG(
    'slack_dag',
    catchup=False,
    default_args=default_args,
    on_failure_callback=task_fail_slack_alert,
    schedule_interval="@once",
)

def div_method(**kwargs):
    print(kwargs)
    # Deliberately divide by zero so the failure callback fires.
    nv = 0 / 0
    print(nv)

div_by_zero = python_operator.PythonOperator(
    task_id='div_by_zero',
    python_callable=div_method,
    provide_context=True,
    dag=slack_dag,
)

div_by_zero
                if DELETE_TABLES:
                    client.delete_table(source_table_ref)
                    logging.info(
                        '***** DELETE: -> Table {}:{} deleted.'.format(
                            BQ_DATASET_NAME, each_temp))
            else:
                logging.info(
                    "Table [" + each_temp +
                    "] is still in streaming mode and cannot be processed!" +
                    " *** Seconds elapsed: " + str(detla_in_seconds) + "/" +
                    str(BUFFER_SECONDS) + " seconds!")
        except Exception as err:
            logging.info("--> Error Detail: " + str(err))
            break

    logging.info("Total number of rows processed: " + str(row_counter))

def sample_function():
    import logging
    logging.info('Hello from the sample function!')

start_dag = python_operator.PythonOperator(
    task_id='Pre_Tasks',
    python_callable=sample_function)

consolidation = python_operator.PythonOperator(
    task_id='BQ_Table_Consolidation',
    python_callable=run_table_consolidation)

end_dag = python_operator.PythonOperator(
    task_id='Cleanup_Tasks',
    python_callable=sample_function)

start_dag >> consolidation >> end_dag
    # 'priority_weight': 10,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        'bindexis_end2end',
        schedule_interval=datetime.timedelta(days=1),  # or in cron format
        default_args=default_dag_args) as dag:

    # An instance of an operator is called a task. In this case, the
    # bindexis_python task calls the "bindexis_dataload" Python function.
    bindexis_python = python_operator.PythonOperator(
        task_id='bindexis-dataload-start',
        python_callable=def_bindexis_dataload.bindexis_dataload,
        op_kwargs={'user_bindexis': Variable.get("user_bindexis"),
                   'pw_bindexis': Variable.get("password_bindexis")},
        retries=2)

    # Likewise, the end_bash task calls a Bash script.
    end_bash = bash_operator.BashOperator(
        task_id='bindexis-end',
        bash_command='echo bindexis-dataload-end.')

    # Define the order in which the tasks complete by using the >> and <<
    # operators. In this example, bindexis_python executes before end_bash.
    bindexis_python >> end_bash

    # Send email confirmation
    #email_summary = EmailOperator(
        trigger_dag(dag_id=dag_to_trigger,
                    run_id='{}_{}'.format(file_name, uuid4()),
                    conf=json.dumps({'file': file_name}),
                    execution_date=None,
                    replace_microseconds=False)
        files_triggered.append(file_name)
    logger.info('triggered %s for %s files: %s' %
                (dag_to_trigger, len(files_triggered), files_triggered))

dag = DAG('trigger_process_zip_dag',
          default_args=default_args,
          schedule_interval=SCHEDULE_INTERVAL)

task_1 = python_operator.PythonOperator(
    task_id='get_zip_files_to_process',
    python_callable=get_zip_files_to_process,
    dag=dag
)

task_2 = python_operator.PythonOperator(
    task_id='run_dag_for_each_file',
    provide_context=True,
    python_callable=run_dag_for_each_file,
    op_args=[process_zip_dag.dag.dag_id],
    dag=dag
)

task_1.set_downstream(task_2)
APITags = DataFlowPythonOperator(
    py_file=pipeline_api_tags,
    options={
        'input': tags_path,
        'temp_location': temp_bucket,
        'project': project
    },
    task_id='apicallpipeline')

# Comments and Answers reports
sql = 'SELECT question_id FROM `{0}.{1}` WHERE creation_date >= TIMESTAMP("{2}")'.format(
    dataset, table_question, yesterday_dash_string)

Query = python_operator.PythonOperator(
    task_id='Query',
    python_callable=QueryToGCS,
    op_kwargs={'sql': sql})

CommentsExport = python_operator.PythonOperator(
    task_id='CommentsExport',
    python_callable=CommentsToGCS)

AnswerExport = python_operator.PythonOperator(
    task_id='AnswerExport',
    python_callable=AnswersToGCS)

comment_file = '{}_{}.json'.format(comment_export, yesterday_string)
answer_file = '{}_{}.json'.format(answer_export, yesterday_string)

CommentToGCS = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="Comment_to_GSC",
    source_bucket=source_bucket,
    source_object="data/{}".format(comment_file),
    destination_bucket=destination_bucket,