Example #1
    def test_check_operators(self):

        conn_id = "sqlite_default"

        captain_hook = BaseHook.get_hook(conn_id=conn_id)  # quite funny :D
        captain_hook.run("CREATE TABLE operator_test_table (a, b)")
        captain_hook.run("insert into operator_test_table values (1,2)")

        op = CheckOperator(
            task_id='check',
            sql="select count(*) from operator_test_table",
            conn_id=conn_id,
            dag=self.dag)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        op = ValueCheckOperator(
            task_id='value_check',
            pass_value=95,
            tolerance=0.1,
            conn_id=conn_id,
            sql="SELECT 100",
            dag=self.dag)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        captain_hook.run("drop table operator_test_table")
    def execute(self, context):
        # redshift_hook = PostgresHook(postgres_conn_id=self.conn_id)

        for table in self.tables:
            # Instantiating CheckOperator never runs the query; the check only
            # happens when execute() is called, and it signals failure by
            # raising AirflowException.
            check = CheckOperator(
                task_id=f"check_{table}",
                sql=f"SELECT * FROM {table}",
                conn_id=self.conn_id)
            try:
                check.execute(context=context)
                self.log.info(f"Data quality check for table {table} passed!")
            except AirflowException:
                self.log.info(f"Data quality check for table {table} did not pass!")
    @mock.patch.object(CheckOperator, 'get_db_hook')
    def test_execute_not_all_records_are_true(self, mock_get_db_hook):
        mock_get_db_hook.return_value.get_first.return_value = ["data", ""]

        with self.assertRaises(AirflowException):
            CheckOperator(task_id='check', sql='sql').execute()

    @mock.patch.object(CheckOperator, 'get_db_hook')
    def test_execute_no_records(self, mock_get_db_hook):
        mock_get_db_hook.return_value.get_first.return_value = []

        with self.assertRaises(AirflowException):
            CheckOperator(task_id='check', sql='sql').execute()
    stage_happiness = S3ToRedshiftTransfer(
        task_id='stage_happiness_to_redshift',
        schema='{{ params.redshift_schema }}',
        table='staging_happiness',
        s3_bucket='{{ params.s3_bucket }}',
        s3_key='happiness'
    )

    stage_temperature = S3ToRedshiftTransfer(
        task_id='stage_temperature_to_redshift',
        schema='{{ params.redshift_schema }}',
        table='staging_temperature',
        s3_bucket='{{ params.s3_bucket }}',
        s3_key='temperature'
    )

    check_tweets = CheckOperator(
        task_id='check_staging_tweets_table',
        sql='SELECT count(*) FROM public.staging_tweets',
        conn_id='{{ redshift_conn_id }}'
    )

    check_happiness = ValueCheckOperator(
        task_id='check_staging_happiness_table',
        sql='SELECT count(*) FROM public.staging_happiness',
        pass_value=155,
        conn_id='{{ redshift_conn_id }}'
    )

    check_temperature = ValueCheckOperator(
        task_id='check_staging_temperature_table',
        sql='SELECT count(*) FROM public.staging_temperature',
        pass_value=8235082,
        conn_id='{{ redshift_conn_id }}'
    )
Example #6
spark_conf = {
    'spark.hadoop.fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'spark.hadoop.fs.s3a.access.key': os.environ.get('AWS_ACCESS_KEY_ID', ''),
    'spark.hadoop.fs.s3a.secret.key': os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
    'spark.hadoop.fs.s3a.endpoint': "{}:{}".format(os.environ.get('AWS_SERVER', ''),
                                                   os.environ.get('AWS_PORT', '')),
    'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
    'spark.hadoop.fs.s3a.path.style.access': 'true',
    'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem'
}

spark = SparkSubmitOperator(task_id='fetch_csv_from_s3_and_update_postgres',
                            dag=dag,
                            conf=spark_conf,
                            application='{spark_dir}/s3topostgres.py'.format(
                                spark_dir=SPARK_DIRECTORY),
                            application_args=['-f', FILE, '-t', TABLE])

check = CheckOperator(task_id='check_demo_contains_data',
                      conn_id='local_pg',
                      sql='SELECT COUNT(*) FROM {table}'.format(table=TABLE),
                      dag=dag)

spark >> check

Example #7
set_operations_stats_extra_attributes.set_upstream(insert_operations_stats)
set_operations_stats_extra_attributes.set_downstream(start_checks)

set_tide_data = PythonOperator(
    task_id="set_tide_data",
    python_callable=set_tide_data_fn,
    provide_context=True,
    dag=dag,
)
set_tide_data.set_upstream(set_operations_stats_extra_attributes)
set_tide_data.set_downstream(start_checks)

for check_name, query in checks().items():
    t = CheckOperator(
        task_id="check_consistency_" + check_name,
        sql=query,
        conn_id="postgresql_local",
        dag=dag,
    )
    t.set_upstream(start_checks)
    t.set_downstream(end_checks)

# Remove temporary CSV files
for table in ["operations_stats_extras", "operations_valides"]:
    t = BashOperator(
        task_id="delete_output_csv_" + table,
        bash_command="rm " + out_path(table),
        dag=dag,
    )
    t.set_upstream(end_checks)

# Trigger DAG to generate final open data files
Example #8
dag = DAG("xcom_demo", default_args=default_args, schedule_interval=timedelta(1))

ddl_task = SQLTemplatedPythonOperator(
    task_id='ddl',
    python_callable=run_and_push,
    templates_dict={"script": "templates/ddl.sql.jinja2"},
    provide_context=True,
    dag=dag
)

read_and_insert_task = PythonOperator(
    task_id='read_and_insert',
    python_callable=read_and_insert,
    provide_context=True,
    dag=dag
)

with open("dags/templates/dq_check.sql.jinja2") as f:
    check_sql = f.read()

dq_check_task = CheckOperator(
    task_id="dq_check",
    sql=check_sql,
    conn_id='postgres_default',
    dag=dag
)

ddl_task >> read_and_insert_task >> dq_check_task
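For readers wondering what run_and_push might do, here is a purely hypothetical sketch; the original callable is not shown in this example, and it assumes SQLTemplatedPythonOperator declares the .sql.jinja2 extension as a template extension so templates_dict["script"] arrives as rendered SQL text. The hook, connection id, and XCom key are assumptions:

from airflow.hooks.postgres_hook import PostgresHook

def run_and_push(templates_dict=None, **context):
    # With provide_context=True, Airflow passes templates_dict (already rendered
    # by the Jinja template engine) into the callable's keyword arguments.
    sql = templates_dict["script"]
    hook = PostgresHook(postgres_conn_id="postgres_default")   # assumed connection id
    hook.run(sql)
    # Push a marker so downstream tasks could read it back via xcom_pull.
    context["ti"].xcom_push(key="ddl_done", value=True)        # hypothetical XCom key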

Example #9
from airflow.operators.check_operator import CheckOperator, IntervalCheckOperator, ValueCheckOperator

from dags.ml_project.scripts.trainig import training
from dags.ml_project.scripts.evaluation import evaluate

CONN_ID = 'dev_postgres'

with DAG(dag_id='ml_project',
         description='ML project',
         schedule_interval='0 8 * * *',
         start_date=datetime(2020, 1, 6)) as dag:
    enter_point = DummyOperator(task_id='enter_point')

    check_interaction_data = CheckOperator(
        task_id='check_interaction_data',
        sql='SELECT COUNT(1) FROM interaction WHERE interaction_date = CURRENT_DATE',
        conn_id=CONN_ID)

    check_interaction_intervals = IntervalCheckOperator(
        task_id='check_interaction_intervals',
        table='interaction',
        metrics_thresholds={
            'COUNT(*)': 1.5,
            'MAX(amount)': 1.3,
            'MIN(amount)': 1.4,
            'SUM(amount)': 1.3
        },
        date_filter_column='interaction_date',
        days_back=5,
        conn_id=CONN_ID)
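As a rough guide to how these thresholds behave, IntervalCheckOperator compares each metric for the current day against the same metric from days_back days earlier and fails the task when the ratio exceeds the threshold. A sketch of the default max_over_min comparison, with made-up metric values:

# Illustration only; the metric values below are invented.
current = {'COUNT(*)': 120, 'MAX(amount)': 950.0}      # metrics for CURRENT_DATE
reference = {'COUNT(*)': 100, 'MAX(amount)': 900.0}    # metrics from 5 days back
thresholds = {'COUNT(*)': 1.5, 'MAX(amount)': 1.3}

for metric, threshold in thresholds.items():
    cur, ref = current[metric], reference[metric]
    ratio = max(cur, ref) / min(cur, ref)              # default "max_over_min" formula
    if ratio > threshold:
        raise ValueError("{} drifted: ratio {:.2f} exceeds {}".format(metric, ratio, threshold))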