def test_check_operators(self):
    conn_id = "sqlite_default"
    captain_hook = BaseHook.get_hook(conn_id=conn_id)  # quite funny :D
    captain_hook.run("CREATE TABLE operator_test_table (a, b)")
    captain_hook.run("insert into operator_test_table values (1,2)")

    op = CheckOperator(
        task_id='check',
        sql="select count(*) from operator_test_table",
        conn_id=conn_id,
        dag=self.dag)
    op.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE,
        ignore_ti_state=True)

    op = ValueCheckOperator(
        task_id='value_check',
        pass_value=95,
        tolerance=0.1,
        conn_id=conn_id,
        sql="SELECT 100",
        dag=self.dag)
    op.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE,
        ignore_ti_state=True)

    captain_hook.run("drop table operator_test_table")
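# For context on the ValueCheckOperator call above: with a numeric pass_value
# and a tolerance, the check passes when the query result falls within
# pass_value * (1 ± tolerance). A minimal plain-Python sketch of that
# arithmetic (an illustration, not the Airflow implementation):
pass_value, tolerance, result = 95, 0.1, 100
assert pass_value * (1 - tolerance) <= result <= pass_value * (1 + tolerance)  # 85.5 <= 100 <= 104.5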
def execute(self, context):
    for table in self.tables:
        # CheckOperator.execute raises AirflowException if the query returns
        # no rows or if any value in the first returned row is falsy, so a
        # failing check fails the task instead of only being logged.
        check = CheckOperator(
            task_id=f"check_{table}",
            sql=f"SELECT * FROM {table}",
            conn_id=self.conn_id)
        check.execute(context=context)
        self.log.info(f"Data quality check for table {table} passed!")
@mock.patch.object(CheckOperator, 'get_db_hook')
def test_execute_not_all_records_are_true(self, mock_get_db_hook):
    mock_get_db_hook.return_value.get_first.return_value = ["data", ""]

    with self.assertRaises(AirflowException):
        CheckOperator(sql='sql').execute()
@mock.patch.object(CheckOperator, 'get_db_hook')
def test_execute_no_records(self, mock_get_db_hook):
    mock_get_db_hook.return_value.get_first.return_value = []

    with self.assertRaises(AirflowException):
        CheckOperator(sql='sql').execute()
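# For context on the two tests above: CheckOperator fails when the query
# returns no rows, or when any value in the first returned row is falsy.
# A minimal plain-Python sketch of that rule (an illustration, not the
# Airflow implementation):
def check_passes(first_row):
    # No records at all, or any falsy value (None, 0, '', False) -> fail.
    return bool(first_row) and all(bool(value) for value in first_row)

assert check_passes([1, "ok"])
assert not check_passes(["data", ""])  # mirrors test_execute_not_all_records_are_true
assert not check_passes([])            # mirrors test_execute_no_records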
    table='staging_happiness',
    s3_bucket='{{ params.s3_bucket }}',
    s3_key='happiness'
)

stage_temperature = S3ToRedshiftTransfer(
    task_id='stage_temperature_to_redshift',
    schema='{{ params.redshift_schema }}',
    table='staging_temperature',
    s3_bucket='{{ params.s3_bucket }}',
    s3_key='temperature'
)

check_tweets = CheckOperator(
    task_id='check_staging_tweets_table',
    sql='SELECT count(*) FROM public.staging_tweets',
    conn_id='{{ redshift_conn_id }}'
)

check_happiness = ValueCheckOperator(
    task_id='check_staging_happiness_table',
    sql='SELECT count(*) FROM public.staging_happiness',
    pass_value=155,
    conn_id='{{ redshift_conn_id }}'
)

check_temperature = ValueCheckOperator(
    task_id='check_staging_temperature_table',
    sql='SELECT count(*) FROM public.staging_temperature',
    pass_value=8235082,
    conn_id='{{ redshift_conn_id }}'
    'spark.hadoop.fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'spark.hadoop.fs.s3a.access.key': os.environ.get('AWS_ACCESS_KEY_ID', ''),
    'spark.hadoop.fs.s3a.secret.key': os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
    'spark.hadoop.fs.s3a.endpoint': "{}:{}".format(
        os.environ.get('AWS_SERVER', ''), os.environ.get('AWS_PORT', '')),
    'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
    'spark.hadoop.fs.s3a.path.style.access': 'true',
    'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem'
}

spark = SparkSubmitOperator(
    task_id='fetch_csv_from_s3_and_update_postgres',
    dag=dag,
    conf=spark_conf,
    application='{spark_dir}/s3topostgres.py'.format(spark_dir=SPARK_DIRECTORY),
    application_args=['-f', FILE, '-t', TABLE])

check = CheckOperator(
    task_id='check_demo_contains_data',
    conn_id='local_pg',
    sql='SELECT COUNT(*) FROM {table}'.format(table=TABLE),
    dag=dag)

spark >> check
set_operations_stats_extra_attributes.set_upstream(insert_operations_stats)
set_operations_stats_extra_attributes.set_downstream(start_checks)

set_tide_data = PythonOperator(
    task_id="set_tide_data",
    python_callable=set_tide_data_fn,
    provide_context=True,
    dag=dag,
)
set_tide_data.set_upstream(set_operations_stats_extra_attributes)
set_tide_data.set_downstream(start_checks)

for check_name, query in checks().items():
    t = CheckOperator(
        task_id="check_consistency_" + check_name,
        sql=query,
        conn_id="postgresql_local",
        dag=dag,
    )
    t.set_upstream(start_checks)
    t.set_downstream(end_checks)

# Remove temporary CSV files
for table in ["operations_stats_extras", "operations_valides"]:
    t = BashOperator(
        task_id="delete_output_csv_" + table,
        bash_command="rm " + out_path(table),
        dag=dag,
    )
    t.set_upstream(end_checks)

# Trigger DAG to generate final open data files
dag = DAG("xcom_demo", default_args=default_args, schedule_interval=timedelta(1)) ddl_task = SQLTemplatedPythonOperator( task_id='ddl', python_callable=run_and_push, templates_dict={"script": "templates/ddl.sql.jinja2"}, provide_context=True, dag=dag ) read_and_insert_task = PythonOperator( task_id='read_and_insert', python_callable=read_and_insert, provide_context=True, dag=dag ) with open("dags/templates/dq_check.sql.jinja2") as f: check_sql = f.read() dq_check_task = CheckOperator( task_id="dq_check", sql=check_sql, conn_id='postgres_default', dag=dag ) ddl_task >> read_and_insert_task >> dq_check_task
from airflow.operators.check_operator import CheckOperator, IntervalCheckOperator, ValueCheckOperator

from dags.ml_project.scripts.trainig import training
from dags.ml_project.scripts.evaluation import evaluate

CONN_ID = 'dev_postgres'

with DAG(dag_id='ml_project',
         description='ML project',
         schedule_interval='0 8 * * *',
         start_date=datetime(2020, 1, 6)) as dag:

    enter_point = DummyOperator(task_id='enter_point')

    check_interaction_data = CheckOperator(
        task_id='check_interaction_data',
        sql='SELECT COUNT(1) FROM interaction WHERE interaction_date = CURRENT_DATE',
        conn_id=CONN_ID)

    check_interaction_intervals = IntervalCheckOperator(
        task_id='check_interaction_intervals',
        table='interaction',
        metrics_thresholds={
            'COUNT(*)': 1.5,
            'MAX(amount)': 1.3,
            'MIN(amount)': 1.4,
            'SUM(amount)': 1.3
        },
        date_filter_column='interaction_date',
        days_back=5,
        conn_id=CONN_ID)
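# For context on the IntervalCheckOperator above: each metric computed for the
# current date is compared with the same metric computed days_back days
# earlier, and the check fails if their ratio exceeds the configured threshold.
# A minimal plain-Python sketch assuming the default max_over_min ratio
# formula (an illustration, not the Airflow implementation):
count_today, count_5_days_ago, threshold = 1200, 1000, 1.5
ratio = max(count_today, count_5_days_ago) / min(count_today, count_5_days_ago)
assert ratio < threshold  # 1.2 < 1.5, so the COUNT(*) metric would pass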