def get_alphanumeric_task_id(a_string):
    isalnum = a_string.isalnum()
    # print('Is String Alphanumeric :', isalnum)
    # Keep only alphanumeric characters so the result is a valid task ID.
    alphanumeric_filter = filter(str.isalnum, a_string)
    alphanumeric_string = "".join(alphanumeric_filter)
    # The filter above already strips '/', so this replace is a no-op kept
    # for safety in case the filter is ever relaxed.
    return alphanumeric_string.replace("/", "__")


with models.DAG(
        'import_ingestion',
        # Run the DAG exactly once
        schedule_interval='@once',
        default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait', trigger_rule="all_done")
    end = DummyOperator(task_id='end', trigger_rule="all_done")

    for blob in blobs:
        # print(blob.name)
        print_file = BashOperator(
            task_id='print_file_' + get_alphanumeric_task_id(blob.name),
            # Interpolate the blob name into the command; the original
            # 'echo "hello "+blob.name' left the '+blob.name' inside the
            # string literal instead of concatenating it.
            bash_command='echo "hello {}"'.format(blob.name),
            dag=dag)
        start.set_downstream(print_file)
        print_file.set_downstream(wait)

    wait >> end
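# A quick sanity check of the helper (hypothetical blob name, not from the
# original file): everything non-alphanumeric, including '/' and '.', is
# stripped, yielding a valid Airflow task ID fragment.
assert get_alphanumeric_task_id('raw/2020/file.csv') == 'raw2020filecsv'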
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('kubernetes_hello_world',
          default_args=default_args,
          schedule_interval=timedelta(minutes=10))

start = DummyOperator(task_id='start', dag=dag)

passing = KubernetesPodOperator(namespace='default',
                                image="python:3.6",
                                cmds=["python", "-c"],
                                arguments=["print('hello world')"],
                                labels={"foo": "bar"},
                                name="passing-test",
                                task_id="passing-task",
                                get_logs=True,
                                dag=dag)

# This pod is expected to fail: the ubuntu:16.04 image ships without a
# `python` binary on PATH, so the command cannot start.
failing = KubernetesPodOperator(namespace='default',
                                image="ubuntu:16.04",
                                cmds=["python", "-c"],
                                arguments=["print('hello world')"],
import airflow
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id="third_exercise",
    default_args=args,
    schedule_interval="@daily",
    dagrun_timeout=timedelta(minutes=60),
)


def print_date(execution_date, **kwargs):
    print("The execution_date is: {}".format(execution_date))


print_execution_time = PythonOperator(
    task_id="task1", dag=dag, python_callable=print_date, provide_context=True
)

the_end = DummyOperator(task_id="the_end", dag=dag)

# A tuple rather than a set keeps the task-creation order deterministic.
for seconds in (1, 5, 10):
    print_execution_time >> BashOperator(
        task_id="sleep_{}".format(seconds),
        bash_command="sleep {}".format(seconds),
        dag=dag,
    ) >> the_end
import airflow
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(9),
}

# Airflow uses standard five-field cron, not the seven-field Quartz syntax
# the original used ('* 45 13 ? * SUN,TUE,THU *'); "13:45 on Sunday,
# Tuesday and Thursday" is expressed as below.
with DAG(dag_id='exercise3',
         default_args=args,
         schedule_interval='45 13 * * SUN,TUE,THU') as dag:
    task1 = DummyOperator(task_id='task1')
    task2 = DummyOperator(task_id='task2')
    task3 = DummyOperator(task_id='task3')
    task4 = DummyOperator(task_id='task4')
    task5 = DummyOperator(task_id='task5')

    task1 >> task2 >> [task3, task4] >> task5
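# A minimal sanity check of the corrected schedule (a sketch, not part of the
# original file), assuming croniter is available — it is a dependency of
# Airflow itself:
from croniter import croniter
assert croniter.is_valid('45 13 * * SUN,TUE,THU')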
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator


def print_hello():
    return "Hello world!"


dag = DAG(
    "hello_world",
    description="Simple tutorial DAG",
    schedule_interval="0 12 * * *",
    start_date=datetime(2017, 3, 20),
    catchup=False,
)

dummy_operator = DummyOperator(task_id="dummy_task", retries=3, dag=dag)

hello_operator = PythonOperator(
    task_id="hello_task", python_callable=print_hello, dag=dag
)

dummy_operator >> hello_operator
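# The callable is a plain function, so it can be exercised outside Airflow
# (a sketch, not part of the original file):
assert print_hello() == "Hello world!"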
    'task_types': 'TaskType.INGEST'
}

args = {
    'start_date': datetime.utcnow(),
    'provide_context': True,
    'owner': 'airflow',
}

auth_conn = HttpHook.get_connection('test_netrc')
http_conn = HttpHook('GET', 'test_netrc')
redis_hook = RedisHook(redis_conn_id='redis_default')

dag = DAG(dag_id='vlass_execute', default_args=args, schedule_interval=None)

start_task = DummyOperator(task_id='start_task', dag=dag)
end_task = DummyOperator(task_id='end_task', dag=dag)


# provide_context in default_args above must be True to get the kwargs values
def get_file_names(**kwargs):
    prev_date = kwargs['prev_execution_date']
    next_date = kwargs['next_execution_date']
    redis_conn = redis_hook.get_conn()
    redis_keys = redis_conn.keys('vlass_*')
    results = []
    for r in redis_keys:
        # The redis client returns bytes; decode before slicing off the
        # 'vlass_' prefix (6 characters) and parsing the timestamp.
        key_datetime = datetime.strptime(r.decode('utf-8')[6:],
                                         '%Y_%m_%d_%H_%M_%S')
        if prev_date < key_datetime < next_date:
            results.append(redis_conn.get(r).decode('utf-8').split()[1:])
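# A sketch of how get_file_names might be wired between the start and end
# markers; the task_id is an assumption, and PythonOperator is assumed to be
# imported alongside the other operators.
get_file_names_task = PythonOperator(
    task_id='get_file_names',
    python_callable=get_file_names,
    dag=dag,
)
start_task >> get_file_names_task >> end_task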
cwl_workflow1 = os.path.join(pipeline_name, 'pipeline.cwl')
cwl_workflow2 = os.path.join('portal-containers', 'ome-tiff-offsets.cwl')
cwl_workflow3 = os.path.join('portal-containers', 'sprm-to-json.cwl')


def build_dataset_name(**kwargs):
    # No trailing comma after format(): `return value,` would return a
    # one-element tuple instead of the string.
    return '{}__{}__{}'.format(
        dag.dag_id,
        kwargs['dag_run'].conf['parent_submission_id'],
        pipeline_name)


# prepare_cwl1 = PythonOperator(
#     python_callable=utils.clone_or_update_pipeline,
#     task_id='prepare_cwl1',
#     op_kwargs={'pipeline_name': cwl_workflow1}
# )

prepare_cwl1 = DummyOperator(task_id='prepare_cwl1')


def build_cwltool_cmd1(**kwargs):
    ctx = kwargs['dag_run'].conf
    run_id = kwargs['run_id']
    tmpdir = utils.get_tmp_dir_path(run_id)
    print('tmpdir: ', tmpdir)
    data_dir = ctx['parent_lz_path']
    print('data_dir: ', data_dir)
    cwltool_dir = get_cwltool_bin_path()
    command = [
        'env',
        'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
        'cwltool',
        os.fspath(PIPELINE_BASE_DIR / cwl_workflow1),
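# A trailing comma after a return expression silently produces a one-element
# tuple rather than a string; a minimal demonstration (not part of the
# original file):
def _with_comma():
    return 'a__b__c',


assert _with_comma() == ('a__b__c',)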
def test_not_skipping_external(self):
    latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
    downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="manual__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
        external_trigger=True,
    )
    self.dag.create_dagrun(
        run_id="manual__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING,
        external_trigger=True,
    )
    self.dag.create_dagrun(
        run_id="manual__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING,
        external_trigger=True,
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    # Externally triggered runs are never skipped by LatestOnlyOperator,
    # so every task instance should succeed on every execution date.
    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_downstream_state)
def test_skipping_non_latest(self):
    latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
    downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
    downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)
    downstream_task3 = DummyOperator(task_id='downstream_3',
                                     trigger_rule=TriggerRule.NONE_FAILED,
                                     dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)
    downstream_task3.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="scheduled__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
    )
    self.dag.create_dagrun(
        run_id="scheduled__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING,
    )
    self.dag.create_dagrun(
        run_id="scheduled__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING,
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_latest_state)

    # The direct downstream of LatestOnlyOperator is skipped on every run
    # except the one for the latest execution date.
    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_downstream_state)

    # With the default ALL_SUCCESS trigger rule, a skipped parent means
    # downstream_2 is never run at all, so its instances stay stateless.
    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): None,
            timezone.datetime(2016, 1, 1, 12): None,
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_downstream_state)

    # NONE_FAILED only requires that no upstream task failed, so skipped
    # parents do not block downstream_3; it succeeds on every run.
    downstream_instances = get_task_instances('downstream_3')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances
    }
    self.assertEqual(
        {
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'
        },
        exec_date_to_downstream_state)
def test_lineage(self, _get_backend):
    backend = mock.Mock()
    send_mock = mock.Mock()
    backend.send_lineage = send_mock
    _get_backend.return_value = backend

    dag = DAG(dag_id='test_prepare_lineage', start_date=DEFAULT_DATE)

    f1 = File("/tmp/does_not_exist_1")
    f2 = File("/tmp/does_not_exist_2")
    f3 = File("/tmp/does_not_exist_3")

    with dag:
        op1 = DummyOperator(task_id='leave1',
                            inlets={"datasets": [f1]},
                            outlets={"datasets": [f2]})
        op2 = DummyOperator(task_id='leave2')
        op3 = DummyOperator(task_id='upstream_level_1',
                            inlets={"auto": True},
                            outlets={"datasets": [f3]})
        op4 = DummyOperator(task_id='upstream_level_2')
        op5 = DummyOperator(
            task_id='upstream_level_3',
            inlets={"task_ids": ["leave1", "upstream_level_1"]})

        op1.set_downstream(op3)
        op2.set_downstream(op3)
        op3.set_downstream(op4)
        op4.set_downstream(op5)

    ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
    ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
    ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
    ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}

    func = mock.Mock()
    func.__name__ = 'foo'

    # prepare with manual inlets and outlets
    prep = prepare_lineage(func)
    prep(op1, ctx1)

    self.assertEqual(len(op1.inlets), 1)
    self.assertEqual(op1.inlets[0], f1)
    self.assertEqual(len(op1.outlets), 1)
    self.assertEqual(op1.outlets[0], f2)

    # post process with no backend
    post = apply_lineage(func)
    post(op1, ctx1)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    prep(op2, ctx2)
    self.assertEqual(len(op2.inlets), 0)
    post(op2, ctx2)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # inlets={"auto": True} picks up the outlets of direct upstream tasks,
    # here op1's outlet f2 (op2 declares no outlets).
    prep(op3, ctx3)
    self.assertEqual(len(op3.inlets), 1)
    self.assertEqual(op3.inlets[0].qualified_name, f2.qualified_name)
    post(op3, ctx3)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()

    # skip 4
    prep(op5, ctx5)
    self.assertEqual(len(op5.inlets), 2)
    post(op5, ctx5)
    self.assertEqual(send_mock.call_count, 1)
    send_mock.reset_mock()
import airflow
from airflow.models import DAG
from datetime import datetime, timedelta
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 1, 14),
}

# Fetch the current hour. Note this is evaluated at DAG-parse time, not at
# task run time, so the chosen branch reflects when the file was parsed.
curr_time = int(datetime.now().strftime('%H'))

dag = DAG(dag_id='BranchPython_example', default_args=args)

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = [
    'branch_a-10_15', 'branch_b-16_20', 'branch_c-21__', 'branch_d-last'
]


def func_condition():
    # Compare the hour against the branch windows.
    if 10 < curr_time <= 15:
        return options[0]
    elif 15 < curr_time <= 20:
        return options[1]
    elif curr_time > 20:
        return options[2]
    else:
        return options[3]
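# A sketch of the wiring the snippet implies; the branch operator's task_id
# and the four endpoint DummyOperators are assumptions based on `options`.
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=func_condition,
    dag=dag,
)
run_this_first >> branching
for option in options:
    branching >> DummyOperator(task_id=option, dag=dag)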
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime

dag = DAG("FailingpythonTask",
          start_date=datetime(2019, 9, 12),
          schedule_interval="@daily",
          catchup=False)


def myPythonFunction():
    # Log a marker, then fail the task deliberately.
    print("Zooooo Except bbbbbbbbbbb momooooooooooooooooo")
    raise Exception


with dag:
    t1 = PythonOperator(task_id="t1", dag=dag,
                        python_callable=myPythonFunction)
    # "all_done" lets t2 run once t1 reaches any terminal state, even failure.
    t2 = DummyOperator(task_id="t2", trigger_rule="all_done")

    t1 >> t2

if __name__ == "__main__":
    dag.cli()
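# Parse-time sanity check of the trigger rule (a sketch, not from the
# original file): t2 should run regardless of t1's failure.
assert t2.trigger_rule == "all_done"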
    finally:
        if conn is not None:
            conn.close()
            print('Closed Database connection')


def total_run():
    for currency in currency_list:
        update_table(currency)


default_args = {
    'owner': 'yurii',
    'depends_on_past': False,
    'start_date': datetime(2018, 6, 1),
    'email': ['*****@*****.**'],
    # email_on_failure expects a boolean; the recipients belong in 'email'.
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'backfill': False,
}

money_dag = DAG('exchange_rate',
                default_args=default_args,
                catchup=False,
                schedule_interval='@hourly')

t1 = PythonOperator(task_id='do_all', python_callable=total_run, dag=money_dag)
t0 = DummyOperator(task_id='do_nothing', dag=money_dag)

t1 >> t0
rm_tmp_tables_pre = rm_tmp_tables("_pre")

sqlalchemy_create_objects_from_schema = SqlAlchemyCreateObjectOperator(
    task_id="sqlalchemy_create_objects_from_schema",
    data_schema_name=DAG_ID)

add_cdc_ids = [
    PostgresOperator(
        task_id=f"add_cdc_id_to_{table}",
        sql=f"ALTER TABLE {table} ADD COLUMN IF NOT EXISTS cdc_id BIGINT UNIQUE NOT NULL",
    )
    for table in table_mappings.values()
]

join_parallel_tasks = DummyOperator(task_id="join_parallel_tasks")

postgres_create_tables_like = [
    PostgresTableCopyOperator(
        task_id=f"postgres_create_tables_like_{table}",
        source_table_name=table,
        target_table_name=f"{TMP_TABLE_PREFIX}{table}",
        # Only copy table definitions. Don't do anything else.
        truncate_target=False,
        copy_data=False,
        drop_source=False,
    )
    for table in table_mappings.values()
]


def _transform_csv_files(**kwargs: Any) -> None:
    """Transform CSV files to have suitable headers and columns for DB insertion.
"query": "SELECT ListingId, __time, ListOfficeName FROM \"{datasource}\" WHERE \"__time\" BETWEEN TIMESTAMP '{yesterday}' AND TIMESTAMP '{today}'" .format(datasource=harPropDatasource, yesterday=yesterday, today=today) } druidQuery = json.dumps(druidJson) mlsUrl = harUrl + ' {d}'.format(d=yesterday) indexTemplate = downloadTemplate(templateUrl) druidIndexSpec = createIndexSpec(indexTemplate, validationDatasource, intervals, 'nvl("dummyCol1", \'Druid\')') mlsIndexSpec = createIndexSpec(indexTemplate, validationDatasource, intervals, 'nvl("dummyCol1", \'MLS\')') start = DummyOperator(task_id='start') wait = BashOperator(task_id='wait-for-15m', bash_command="sleep 15m") loadDruid = KubernetesPodOperator( namespace='data', image="truongretell/druiddataloader:latest", image_pull_policy='Always', cmds=[ "sh", "-c", "dotnet DruidDataLoader.dll '{link}' '/shared-data' 'har-validation' '{yesterday}' '{query}'" .format(link=druidUrl, yesterday=yesterday, query=druidQuery) ], task_id="load-property-sold-validation-task-" + str(yesterday), name="load-property-sold-validation-task-" + str(yesterday), volumes=[volume],
}

dag = DAG('manga',
          default_args=default_args,
          description='dag for ETL manga',
          schedule_interval='50 * * * *')

run_etl = PythonOperator(task_id="myextract",
                         python_callable=myextract,
                         dag=dag)

transf = BranchPythonOperator(task_id="branching",
                              python_callable=mytransforme,
                              provide_context=True,
                              dag=dag)

continue1 = DummyOperator(task_id='continue', dag=dag)
Stop = DummyOperator(task_id='stop', dag=dag)

loadData = PythonOperator(task_id="loadData", python_callable=myload, dag=dag)

alertLoic = EmailOperator(task_id='send_email',
                          to='[email protected]',
                          subject='Airflow Alert',
                          html_content="""
                          <h3>Email Test</h3>
                          """,
                          dag=dag)

run_etl >> transf
transf >> [continue1, Stop]
continue1 >> loadData
loadData >> alertLoic
import airflow
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator

args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id="my_third_dag",
    default_args=args,
    schedule_interval=timedelta(minutes=150),
    dagrun_timeout=timedelta(minutes=60),
)

run_this_last = DummyOperator(task_id="run_this_last", dag=dag)

# [START howto_operator_bash]
run_this1 = BashOperator(
    task_id="echo_1",
    bash_command="echo 1",
    dag=dag,
)
# [END howto_operator_bash]

run_this2 = BashOperator(
    task_id="echo_2",
    bash_command="echo 2",
    dag=dag,
)
        task_id='sub_dag_task3',
        dag=sub_dag,
    )
    t4 = DummyOperator(
        task_id='sub_dag_task4',
        dag=sub_dag,
    )

    t1 >> [t2, t3] >> t4

    return sub_dag


dag = DAG(dag_id='sub_dag_example',
          schedule_interval=None,
          start_date=datetime(2020, 1, 1),
          default_args={"owner": "airflow_lesson"})

start = DummyOperator(task_id='start', dag=dag)
end = DummyOperator(task_id='end', dag=dag)
r_task = DummyOperator(task_id='some_task', dag=dag)
r_task_2 = DummyOperator(task_id='another_task', dag=dag)

for i in range(0, 15):
    sub_dags = SubDagOperator(task_id=f'do_sub_dags_{i}',
                              subdag=prepare_sub_dag(
                                  dag.dag_id, child_dag=f'do_sub_dags_{i}'),
                              dag=dag)
    r_task_2 >> sub_dags >> end

start >> r_task >> r_task_2
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    'pool': 'kube',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'PythonDockerHub',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
    dagrun_timeout=timedelta(minutes=5),
)

start = DummyOperator(task_id='run_this_first', dag=dag)

boom = KubernetesPodOperator(
    namespace='airflow',
    image="python:3.6-stretch",
    image_pull_policy="Always",
    cmds=["python", "-c"],
    arguments=["print('hello world')"],
    name="python",
    task_id="startPython",
    is_delete_operator_pod=True,
    hostnetwork=False,
    dag=dag,
    in_cluster=False,
)
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='asdasd',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# [START howto_operator_bash]
run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag,
)
# [END howto_operator_bash]

run_this >> run_this_last

for i in range(3):
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'provide_context': True
}

dag = DAG(
    "Data_Platform_ETL",
    schedule_interval="@daily",
    default_args=args)

process_start = DummyOperator(
    task_id='process_start',
    dag=dag
)


# Function to perform the ETL task for a MySQL table
def fetch_data_from_MySQL(table_name, props, database, s3_bucket, s3_prefix,
                          **kwargs):
    try:
        table_exec_config = Variable.get("table_run_config",
                                         deserialize_json=True)
        if table_name not in table_exec_config:
            print("Table does not exist")
            full_refresh_flag = True
        else:
            print("Table exists")
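# A hypothetical value for the "table_run_config" Variable the branch above
# inspects; the inner structure is an assumption for illustration only.
# The Variable can be set via the UI (Admin -> Variables) or the Airflow 1.x
# CLI: airflow variables --set table_run_config '{"orders": {"full_refresh": false}}'
example_table_run_config = {
    "orders": {"full_refresh": False},
}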
# AWS_KEY = os.environ.get('AWS_KEY')
# AWS_SECRET = os.environ.get('AWS_SECRET')

default_args = {
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag
)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag
)
        'temp_tables': 'temp_tables.json'
    },
    dag=dag)

conn_id = 'hw4_test_db'

check_db_task = BranchPythonOperator(task_id='check_db',
                                     python_callable=check_db,
                                     op_kwargs={
                                         'conn_id': conn_id,
                                         'success_task_name': 'process_orders',
                                         'failed_task_name': 'db_not_reachable'
                                     },
                                     dag=dag)

db_not_reachable_task = DummyOperator(task_id='db_not_reachable', dag=dag)

notify_error_task = PythonOperator(
    task_id='notify_error',
    python_callable=send_message_from_file,
    op_kwargs={
        'token': Variable.get('HW3_TELEGRAM_BOT_TOKEN_TEST'),
        'chat_id': Variable.get('HW3_TELEGRAM_CHAT_ID_TEST'),
        'message_file_path': error_file_path,
    },
    # Fire the notification as soon as any upstream task fails.
    trigger_rule='one_failed',
    dag=dag)

all_success_task = DummyOperator(task_id='all_success', dag=dag)

check_db_task >> [process_orders_task, db_not_reachable_task]
"depends_on_past": False, "start_date": datetime(2019, 1, 24), "email": ["*****@*****.**"], "email_on_failure": False, "email_on_retry": False, "on_failure_callback": slack.task_fail_slack_alert, "retries": 0, } tpu_supported_models = Variable.get("tpu_training_supported_models").split(",") # distributed_training = Variable.get("distributed_training") dag = DAG("6-train_model", default_args=default_args, catchup=False, schedule_interval=None) start_task = DummyOperator(task_id="start_task", dag=dag) end_task = DummyOperator(task_id="end_task", dag=dag) package_tensorflow_libs_cmd = f"cd {TENSORFLOW_OBJECT_DETECTION_RESEARCH_FOLDER} && object_detection/dataset_tools/create_pycocotools_package.sh /tmp/pycocotools && python setup.py sdist && (cd slim && python setup.py sdist)" package_tensorflow_libs_with_dependencies = BashOperator( task_id="package_tensorflow_libs_with_dependencies", bash_command=package_tensorflow_libs_cmd, dag=dag, ) for json_file in glob(f"{AIRFLOW_TRAINABLE_FOLDER}/*.json"): training_name = file_ops.get_filename(json_file, with_extension=False) now = datetime.now().strftime("%Y%m%dT%H%M")
    'catchup': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('dag_that_executes_via_k8s_executor',
          default_args=default_args,
          schedule_interval=timedelta(minutes=30),
          max_active_runs=1,
          concurrency=10)

# Generate 2 tasks
tasks = ["task{}".format(i) for i in range(1, 3)]

example_dag_complete_node = DummyOperator(task_id="example_dag_complete",
                                          dag=dag)

org_dags = []
for task in tasks:
    bash_command = 'echo HELLO'
    org_node = BashOperator(task_id=task,
                            bash_command=bash_command,
                            wait_for_downstream=False,
                            retries=5,
                            dag=dag)
    org_node.set_downstream(example_dag_complete_node)
# start, done = duplex_graph(duplex_method, extract_features_dag)
#
# duplex_start >> start
# done >> duplex_done
#
#################################################
# Feature extraction
#################################################
files_for_features = ["_".join(t)
                      for t in product(FILE_NAMES, DUPLEX_DICT.keys())]
files_for_features.sort()

NUMBER_OF_CHUNKS = 5

split_start = DummyOperator(task_id="split_start", dag=extract_features_dag)
split_done = DummyOperator(task_id="split_done", dag=extract_features_dag)

for f in files_for_features:
    t = PythonOperator(
        task_id=f"split_{f}",
        python_callable=split_file,
        op_kwargs={'infile': f"{data_step_path}duplex/{f}.csv",
                   'dir': Path(data_step_path) / "duplex" / "split_files",
args = {
    'owner': 'airflow',
}

dag = DAG(
    dag_id='Jonathan_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['example']
)

task1 = DummyOperator(
    task_id='Start',
    dag=dag
)

# [START howto_operator_bash]
task2 = BashOperator(
    task_id='bash_jacobo',
    bash_command='echo "this is a normal run"',
    dag=dag,
)
# [END howto_operator_bash]

task1 >> task2

# if __name__ == "__main__":
#     dag.cli()
from airflow import DAG
from airflow.contrib.operators.gcs_to_s3 import GoogleCloudStorageToS3Operator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago
from airflow.models import Variable

DAG_ID = "transfer_data_gcp_to_aws_dag"

# Bucket names are parametrized via Airflow Variables
AWS_BUCKET = Variable.get("AWS_BUCKET")
GCP_BUCKET = Variable.get("GCP_BUCKET")

def_args = {'start_date': days_ago(1), 'owner': 'cm0'}

transfer_dag = DAG(dag_id=DAG_ID,
                   schedule_interval=None,
                   default_args=def_args)

start = DummyOperator(task_id="start", dag=transfer_dag)

transfer_operator = GoogleCloudStorageToS3Operator(
    task_id="transfer_gcp_to_aws",
    bucket=GCP_BUCKET,
    dest_s3_key=AWS_BUCKET,
    replace=True,
    dag=transfer_dag)

end = DummyOperator(task_id="end", dag=transfer_dag)

# For troubleshooting GCP and AWS resource access, you can uncomment the
# operators below, which list both buckets. Not recommended for recursive
# listing or for buckets without inner folders (where many files sit directly
# at the bucket root). Also requires commenting out line 41.
# list_gcs = GoogleCloudStorageListOperator(task_id="list_gcs", bucket=GCP_BUCKET, dag=transfer_dag)
    pass_value=0,
    task_id='validate_no_dups_movies',
    sql=SqlQueries.validate_no_dups_movies,
    use_legacy_sql=False)

validate_no_dups_person = BigQueryValueCheckOperator(
    dag=dag,
    pass_value=0,
    task_id='validate_no_dups_person',
    sql=SqlQueries.validate_no_dups_person,
    use_legacy_sql=False)

###########################
# Key stages tasks
###########################
start_operator = DummyOperator(task_id='start_dag', dag=dag)
staging_complete = DummyOperator(task_id='staging_complete', dag=dag)
analytics_complete = DummyOperator(task_id='analytics_complete', dag=dag)
# task_ids must be unique within a DAG
validation_complete = DummyOperator(task_id='validation_complete', dag=dag)
end_operator = DummyOperator(task_id='finished_dag', dag=dag)

###########################
# Tasks Dependencies
###########################
# Stage all IMDB data
start_operator >> stage_imdb_name_basics >> staging_complete
start_operator >> stage_imdb_title_ratings >> staging_complete
start_operator >> stage_imdb_title_principals >> staging_complete
start_operator >> stage_imdb_title_basics >> staging_complete
import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import ShortCircuitOperator

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(dag_id='example_short_circuit_operator', default_args=args)

cond_true = ShortCircuitOperator(
    task_id='condition_is_True',
    python_callable=lambda: True,
    dag=dag,
)

cond_false = ShortCircuitOperator(
    task_id='condition_is_False',
    python_callable=lambda: False,
    dag=dag,
)

true_1, true_2 = [
    DummyOperator(task_id='true_' + str(i), dag=dag) for i in [1, 2]
]
false_1, false_2 = [
    DummyOperator(task_id='false_' + str(i), dag=dag) for i in [1, 2]
]

cond_true >> true_1 >> true_2
cond_false >> false_1 >> false_2
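# What a run demonstrates (a note, not part of the original file): when a
# ShortCircuitOperator's callable returns False, everything downstream of it
# is marked "skipped", so false_1 and false_2 never execute, while the True
# branch runs true_1 and true_2 to success.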