task_id="load_source_data", python_callable=load_source_data, dag=dag, ) task_validate_source_data_load = GreatExpectationsOperator( task_id="validate_source_data_load", checkpoint_name="source_data_load.chk", dag=dag, data_context_root_dir=ge_root_dir, ) task_run_dbt_dag = DbtRunOperator(task_id="run_dbt_dag", dag=dag) task_validate_analytical_output = GreatExpectationsOperator( task_id="validate_analytical_output", checkpoint_name="analytical_output.chk", dag=dag, data_context_root_dir=ge_root_dir, ) task_publish = PythonOperator(task_id="publish", python_callable=publish_to_prod, dag=dag) task_validate_source_data.set_downstream(task_load_source_data) task_load_source_data.set_downstream(task_validate_source_data_load) task_validate_source_data_load.set_downstream(task_run_dbt_dag) task_run_dbt_dag.set_downstream(task_validate_analytical_output) task_validate_analytical_output.set_downstream(task_publish)
# imports below are assumed for a standalone run (Airflow 2.x paths); only the
# timezone import appeared in the original snippet
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.timezone import datetime

DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(start_date=DEFAULT_DATE, owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()


# DAG tests that tasks ignore all dependencies
dag1 = DAG(
    dag_id='test_run_ignores_all_dependencies',
    default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(
    task_id='test_run_dependency_task',
    python_callable=fail,
    dag=dag1)
dag1_task2 = PythonOperator(
    task_id='test_run_dependent_task',
    python_callable=success,
    dag=dag1)
dag1_task1.set_downstream(dag1_task2)
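# Usage note (not part of the original test code): dag1_task1 always fails, so
# dag1_task2 can only succeed if its upstream and depends_on_past checks are
# skipped. One way to exercise that, assuming an Airflow 2.x CLI, is roughly:
#
#   airflow tasks run test_run_ignores_all_dependencies \
#       test_run_dependent_task 2016-01-02 --ignore-all-dependencies
#
# The 2016-01-02 date matches the DEFAULT_DATE + timedelta(days=1) check in
# success(); exact flags differ between Airflow 1.x and 2.x.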
tune_model_task = SageMakerTuningOperator(
    task_id='model_tuning',
    dag=dag,
    config=tuner_config,
    # aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30)

# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
    task_id='predicting',
    dag=dag,
    config=transform_config,
    # aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30,
    trigger_rule=TriggerRule.ONE_SUCCESS)

cleanup_task = DummyOperator(task_id='cleaning_up', dag=dag)

# set the dependencies between tasks
init.set_downstream(preprocess_task)
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(branching)
branching.set_downstream(tune_model_task)
branching.set_downstream(train_model_task)
tune_model_task.set_downstream(batch_transform_task)
train_model_task.set_downstream(batch_transform_task)
batch_transform_task.set_downstream(cleanup_task)
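# Hypothetical sketch of the `branching` task the dependencies above assume
# (defined earlier in the real file): a BranchPythonOperator that picks either
# the tuning path or the plain training path. Because only one branch runs,
# the transform task uses TriggerRule.ONE_SUCCESS so it fires as soon as either
# upstream finishes. The "hpo_enabled" Variable and the import path (Airflow 2.x)
# are assumptions, not part of the original DAG.
from airflow.models import Variable
from airflow.operators.python import BranchPythonOperator


def choose_branch():
    # follow the tuning path only when an (assumed) "hpo_enabled" Variable is set
    hpo_enabled = Variable.get("hpo_enabled", default_var="false") == "true"
    return tune_model_task.task_id if hpo_enabled else train_model_task.task_id


branching = BranchPythonOperator(
    task_id="branching",
    python_callable=choose_branch,
    dag=dag)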
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2020, 9, 24)
}

with DAG('marketvol',
         default_args=default_args,
         description='A simple DAG',
         schedule_interval='0 18 * * 1-5') as dag:
    # pass the ticker via op_args so stonk() runs at execution time,
    # not while the DAG file is parsed
    t1 = PythonOperator(task_id='Tesla_Stonk_Data',
                        python_callable=stonk,
                        op_args=['TSLA'])
    t2 = PythonOperator(task_id='Apple_Stonk_Data',
                        python_callable=stonk,
                        op_args=['AAPL'])
    # task_ids may not contain spaces
    t3 = BashOperator(task_id='Tesla_Bash_Command',
                      bash_command='mv *_data.csv tmp/data/2020/2020-09-24')
    t4 = BashOperator(task_id='Apple_Bash_Command',
                      bash_command='mv *_data.csv tmp/data/2020/2020-09-24')
    t5 = PythonOperator(task_id='Run_query_on_downloaded_data',
                        python_callable=python_command)

    t1.set_downstream(t3)
    t2.set_downstream(t4)
    [t3, t4] >> t5
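# Hypothetical sketch of the `stonk` callable the DAG above passes via op_args:
# download one symbol's intraday bars and write a CSV that the downstream
# `mv *_data.csv ...` BashOperators can pick up. The use of the yfinance
# package and the one-day/one-minute granularity are assumptions, not part of
# the original DAG.
import yfinance as yf


def stonk(symbol):
    # fetch one day of one-minute bars for the given ticker (assumed granularity)
    df = yf.download(symbol, period="1d", interval="1m")
    df.to_csv(f"{symbol}_data.csv")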
logging.info("Performing Delete...") # using bulk delete query.delete(synchronize_session=False) session.commit() logging.info("Finished Performing Delete") else: logging.warn("You've opted to skip deleting the db entries. " "Set ENABLE_DELETE to True to delete entries!!!") logging.info("Finished Running Cleanup Process") except ProgrammingError as e: logging.error(e) logging.error( str(airflow_db_model) + " is not present in the metadata." "Skipping...") for db_object in DATABASE_OBJECTS: cleanup_op = PythonOperator(task_id="cleanup_" + str(db_object["airflow_db_model"].__name__), python_callable=cleanup_function, params=db_object, provide_context=True, dag=dag) print_configuration.set_downstream(cleanup_op) # [END composer_metadb_cleanup]
db_connection.init_query('DROP TABLE IF EXISTS Sales')
db_connection.init_query(query=('''CREATE TABLE Sales(
    Region varchar(256),
    Country varchar(256),
    `Item Type` varchar(20),
    `Sales Channel` varchar(20),
    `Order Priority` varchar(20),
    `Order Date` DATE NOT NULL,
    `Order ID` int(20) NOT NULL,
    `Ship Date` DATE NOT NULL,
    `Units Sold` int(20) NOT NULL,
    `Unit Price` int(20) NOT NULL,
    `Unit Cost` int(20) NOT NULL,
    `Total Revenue` int(20) NOT NULL,
    `Total Cost` int(20) NOT NULL,
    `Total Profit` int(20) NOT NULL)'''))
data.csv_load_to_db(filename, destination_folder, db_connection)

run_extract = PythonOperator(task_id='extract',
                             python_callable=extract,
                             dag=dag)
run_transform_load = PythonOperator(task_id='transform_and_load',
                                    python_callable=transform_load,
                                    dag=dag)
run_extract.set_downstream(run_transform_load)
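# Hypothetical sketch of the `extract` and `transform_load` callables wired into
# the two PythonOperators above. The file names and the pandas-based cleanup are
# illustrative assumptions; only the Sales-table columns come from the snippet.
import pandas as pd


def extract():
    # e.g. pull the raw sales export into a local staging folder (placeholder)
    pass


def transform_load():
    # read the staged CSV, normalise types to match the Sales table, then hand
    # the cleaned file to the project's loader helpers
    df = pd.read_csv("sales_records.csv", parse_dates=["Order Date", "Ship Date"])
    df["Units Sold"] = df["Units Sold"].astype(int)
    df.to_csv("sales_records_clean.csv", index=False)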