Example #1
    def test_sql_sensor_postgres(self):
        t = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='postgres_default',
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            dag=self.dag
        )
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #2
    def test_sql_sensor(self):
        op = SqlSensor(task_id='hdfs_sensor_check',
                       conn_id='presto_default',
                       sql="SELECT 'x' FROM airflow.static_babynames LIMIT 1;",
                       dag=self.dag)
        op.run(start_date=DEFAULT_DATE,
               end_date=DEFAULT_DATE,
               ignore_ti_state=True)
Example #3
    def test_unsupported_conn_type(self):
        t = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='redis_default',
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            dag=self.dag
        )

        with self.assertRaises(AirflowException):
            t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #4
    def test_sql_sensor_postgres_poke(self, mock_hook):
        t = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='postgres_default',
            sql="SELECT 1",
        )

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = []
        self.assertFalse(t.poke(None))

        mock_get_records.return_value = [[None]]
        self.assertFalse(t.poke(None))

        mock_get_records.return_value = [['None']]
        self.assertTrue(t.poke(None))

        mock_get_records.return_value = [[0.0]]
        self.assertFalse(t.poke(None))

        mock_get_records.return_value = [[0]]
        self.assertFalse(t.poke(None))

        mock_get_records.return_value = [['0']]
        self.assertTrue(t.poke(None))

        mock_get_records.return_value = [['1']]
        self.assertTrue(t.poke(None))
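The poke tests above receive a mock_hook argument, so each snippet is presumably decorated with mock.patch even though the decorator itself is not shown. Below is a minimal sketch of the assumed setup; the patch target is an assumption inferred from the get_connection(...).get_hook().get_records(...) chain used in these tests:

import unittest
from unittest import mock

from airflow.sensors.sql_sensor import SqlSensor


class SqlSensorPokeSketch(unittest.TestCase):
    # Assumed patch target: the BaseHook reference inside the sensor module, so that the
    # mock_hook.get_connection(...).get_hook().get_records(...) chain above is intercepted.
    @mock.patch('airflow.sensors.sql_sensor.BaseHook')
    def test_poke_with_mocked_records(self, mock_hook):
        t = SqlSensor(task_id='sql_sensor_check', conn_id='postgres_default', sql="SELECT 1")
        mock_hook.get_connection.return_value.conn_type = "postgres"
        mock_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records
        mock_records.return_value = [[1]]
        self.assertTrue(t.poke(None))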
Example #5
    def test_sql_sensor_postgres_poke_failure(self, mock_hook):
        t = SqlSensor(task_id='sql_sensor_check',
                      conn_id='postgres_default',
                      sql="SELECT 1",
                      failure=lambda x: x in [1])

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = []
        self.assertFalse(t.poke(None))

        mock_get_records.return_value = [[1]]
        self.assertRaises(AirflowException, t.poke, None)
Example #6
    def test_sql_sensor_postgres_poke(self, mock_hook):
        t = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='postgres_default',
            sql="SELECT 1",
        )

        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = []
        self.assertFalse(t.poke(None))

        mock_get_records.return_value = [['1']]
        self.assertTrue(t.poke(None))
Example #7
    def test_Operator(self, test_dag):

        sensor_task = SqlSensor(
            task_id='postgres_sensor',
            conn_id='postgres_dwh',
            sql='select * from staging.audit_runs limit 1;',
            poke_interval=3,
            timeout=10,
            dag=test_dag
        )

        test_operator = multiplyby5_operator.MultiplyBy5Operator(
            task_id='test_operator',
            my_operator_param=10,
            dag=test_dag
        )

        get_value = PythonOperator(
            task_id='get_value',
            python_callable=self.xcom_value,
            dag=test_dag
        )
        
        # sensor_task >> test_operator >> get_value
        self.run_task(sensor_task, test_dag)
        self.run_task(test_operator, test_dag)
        self.run_task(get_value, test_dag)


        assert str(self.value) == '50'
Example #8
    def test_sql_sensor_postgres_poke_success(self, mock_hook):
        op = SqlSensor(task_id='sql_sensor_check',
                       conn_id='postgres_default',
                       sql="SELECT 1",
                       success=lambda x: x in [1])

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = []
        self.assertFalse(op.poke(None))

        mock_get_records.return_value = [[1]]
        self.assertTrue(op.poke(None))

        mock_get_records.return_value = [['1']]
        self.assertFalse(op.poke(None))
Example #9
    def test_sql_sensor_postgres_poke_fail_on_empty(self, mock_hook):
        t = SqlSensor(task_id='sql_sensor_check',
                      conn_id='postgres_default',
                      sql="SELECT 1",
                      fail_on_empty=True)

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = []
        self.assertRaises(AirflowException, t.poke, None)
Example #10
    def test_sql_sensor_postgres_poke_invalid_success(self, mock_hook):
        t = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='postgres_default',
            sql="SELECT 1",
            success=[1],
        )

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = [[1]]
        self.assertRaises(AirflowException, t.poke, None)
Example #11
    def test_sql_sensor_mysql(self):
        t1 = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='mysql_default',
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            dag=self.dag
        )
        t1.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        t2 = SqlSensor(
            task_id='sql_sensor_check',
            conn_id='mysql_default',
            sql="SELECT count(%s) FROM INFORMATION_SCHEMA.TABLES",
            parameters=["table_name"],
            dag=self.dag
        )
        t2.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #12
    def test_sql_sensor_postgres(self):
        op1 = SqlSensor(task_id='sql_sensor_check_1',
                        conn_id='postgres_default',
                        sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
                        dag=self.dag)
        op1.run(start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_ti_state=True)

        op2 = SqlSensor(task_id='sql_sensor_check_2',
                        conn_id='postgres_default',
                        sql="SELECT count(%s) FROM INFORMATION_SCHEMA.TABLES",
                        parameters=["table_name"],
                        dag=self.dag)
        op2.run(start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_ti_state=True)
Example #13
    def test_sql_sensor_postgres_check_allow_null(self, mock_hook):
        t1 = SqlSensor(task_id='sql_sensor_check',
                       conn_id='postgres_default',
                       sql="SELECT NULL",
                       allow_null=True)

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = [[None]]
        self.assertTrue(t1.poke(None))

        t2 = SqlSensor(task_id='sql_sensor_check',
                       conn_id='postgres_default',
                       sql="SELECT NULL",
                       allow_null=False)

        mock_hook.get_connection('postgres_default').conn_type = "postgres"
        mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

        mock_get_records.return_value = [[None]]
        self.assertFalse(t2.poke(None))
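Taken together, the mocked poke tests above pin down the sensor's decision rules: no rows means the poke fails (or raises when fail_on_empty is set), a failure callable may raise on the first cell, a success callable decides the outcome, allow_null controls how a NULL first cell is treated, and otherwise the truthiness of the first cell of the first row decides. Below is a minimal sketch of that logic as implied by these assertions, not the actual SqlSensor implementation; the default allow_null=False here simply mirrors the older test above that expects [[None]] to be falsy:

from airflow.exceptions import AirflowException


def poke_outcome(records, success=None, failure=None, fail_on_empty=False, allow_null=False):
    # records is what hook.get_records(sql) would return: a list of rows.
    if not records:
        if fail_on_empty:
            raise AirflowException("No rows returned and fail_on_empty is set")
        return False
    first_cell = records[0][0]
    if failure is not None:
        if not callable(failure):
            raise AirflowException("failure is present but not callable")
        if failure(first_cell):
            raise AirflowException("Failure criteria met: {}".format(first_cell))
    if success is not None:
        if not callable(success):
            raise AirflowException("success is present but not callable")
        return bool(success(first_cell))
    if first_cell is None:
        return allow_null
    return bool(first_cell)

For example, poke_outcome([['0']]) is True while poke_outcome([[0]]) is False, matching the string-versus-integer assertions above.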
Example #14
                         group by user_id_uuid, user_id
),
all_past_pageviews as (select user_id_uuid, user_id, max(date(time_received)) as ymd from staging.user_page_log
                       where date(time_received) <= '%(YYDY)s-%(YYDM)s-%(YYDD)s'
                       group by user_id_uuid, user_id)
select daily_pageviews.user_id_uuid, daily_pageviews.user_id, '%(YDY)s-%(YDM)s-%(YDD)s'::date as ymd, has_wallet, pageviews,
coalesce(all_past_pageviews.ymd, '-Infinity'::date) as last_visit
from daily_pageviews left join all_past_pageviews on daily_pageviews.user_id_uuid = all_past_pageviews.user_id_uuid
"""

delete_cmd = "DELETE FROM agg_visitor_previous_visit WHERE ymd = '%(YDY)s-%(YDM)s-%(YDD)s' "

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % MACROS,
    poke_interval=15 * 60,  # poke every 15 minutes
    timeout=60 * 60 * 13,  # 13h, S3 dag has 12h timeout
    dag=dag)

delete = PostgresOperator(task_id="delete",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=delete_cmd % MACROS,
                          dag=dag)

insert = PostgresOperator(task_id="insert",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=insert_cmd % MACROS,
                          dag=dag)

delete.set_upstream(sensor)
Example #15
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('generate_bounty_view_board',
          default_args=default_args,
          schedule_interval='@hourly')

sensor = SqlSensor(task_id="sensor",
                   conn_id='postgres_data_warehouse',
                   sql=sense_sql % {
                       'dy': DY,
                       'dm': DM,
                       'dd': DD,
                       'dh': DH,
                       'lhdy': LHDY,
                       'lhdm': LHDM,
                       'lhdd': LHDD,
                       'lhdh': LHDH
                   },
                   dag=dag)

move_rows_to_partitions = PostgresOperator(
    task_id="move_rows_to_partitions",
    postgres_conn_id='postgres_data_warehouse',
    sql=insert_sql % {
        'dy': DY,
        'dm': DM,
        'dd': DD,
        'dh': DH,
Example #16
dag = airflow.DAG('opos_source_sqoop_di',
                  schedule_interval=schedule_interval,
                  concurrency=15,
                  max_active_runs=1,
                  default_args=args)

dag_monitor = airflow.DAG('opos_source_sqoop_di_monitor',
                          schedule_interval=schedule_interval,
                          default_args=args)

##----------------------------------------- Metadata validation before data ingestion ---------------------------------------##
import_data_validate = SqlSensor(task_id="import_data_validate",
                                 conn_id='mysql_pre_ptsp_db',
                                 sql='''
        select 
        count(1)
        from 
        pre_ptsp_db.pre_opos_payment_order_sync_status
        where DATE_FORMAT(sync_date,"%Y-%m-%d") = '{{ ds }}' and sync_status = 1
    ''',
                                 dag=dag)

import_order_bd_validate = SqlSensor(task_id="import_order_bd_validate",
                                     conn_id='mysql_pre_ptsp_db',
                                     sql='''
        select 
        count(1)
        from 
        pre_ptsp_db.pre_opos_payment_order_bd_sync_status
        where DATE_FORMAT(sync_date,"%Y-%m-%d") = '{{ ds }}' and sync_status = 1
    ''',
                                     dag=dag)
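sql is a templated field on SqlSensor, so the {{ ds }} reference above is rendered from the task instance's context before the query is sent to MySQL. Below is a stand-alone sketch of that rendering using plain Jinja2 and a made-up execution date:

from jinja2 import Template

# Illustration only: Airflow itself renders templated fields such as SqlSensor.sql;
# here the context is faked with a fixed ds value.
sql = '''
    select count(1)
    from pre_ptsp_db.pre_opos_payment_order_sync_status
    where DATE_FORMAT(sync_date,"%Y-%m-%d") = '{{ ds }}' and sync_status = 1
'''
print(Template(sql).render(ds='2019-08-01'))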
Example #17
dag = airflow.DAG('oride_source_sqoop_df',
                  schedule_interval=schedule_interval,
                  concurrency=40,
                  max_active_runs=1,
                  default_args=args)

dag_monitor = airflow.DAG('oride_source_sqoop_df_monitor',
                          schedule_interval=schedule_interval,
                          default_args=args)

check_data_driver_records_finish = SqlSensor(
    task_id="check_data_driver_records_finish",
    conn_id='sqoop_db',
    sql='''
        select
            count(1)
        from
            oride_data.data_driver_records_finish
        where
            from_unixtime(day, "%Y-%m-%d") = '{{ macros.ds_add(ds, -1) }}'
    ''',
    dag=dag)

##----------------------------------------- Task timeout monitoring ---------------------------------------##


def fun_task_timeout_monitor(ds, db_name, table_name, **op_kwargs):
    tb = [{
        "db": db_name,
        "table": table_name,
        "partition": "dt={pt}".format(pt=ds),
        "timeout": "7200"
Example #18
def dbt_dags_factory(
    dwh_engine,
    dwh_conn_id,
    project_name,
    dbt_schema_name,
    airflow_conn_id,
    dag_base_name='DBT_run',
    analytics_reader=None, # list of users of DWH who are read-only
    schedule_interval=timedelta(hours=1),
    start_date=datetime(2019,1,1),
    default_args=None,
    folder=None,
    models=None,
    exclude=None,
):

    if analytics_reader:
        for statement in ('insert', 'update', 'delete',
            'drop', 'create', 'select', ';', 'grant'):
            for reader in analytics_reader:
                if statement in reader.lower():
                    raise Exception(
                        'Error! The analytics reader {0} is invalid.'.format(reader))

        #analytics_reader = analytics_reader.split(',')
        analytics_reader_sql = f'\nGRANT USAGE ON SCHEMA "{dbt_schema_name}"'
        analytics_reader_sql += ' TO "{0}";'
        analytics_reader_sql += f'''
        \nGRANT SELECT ON ALL TABLES IN SCHEMA "{dbt_schema_name}"''' + ' TO "{0}";'
        analytics_reader_sql = ''.join([
            analytics_reader_sql.format(i) for i in analytics_reader
        ])

    if models and not isinstance(models, str):
        models = ' --models ' + ' '.join(models)
    else:
        models = ''

    if exclude and not isinstance(exclude, str):
        exclude = ' --exclude ' + ' '.join(exclude)
    else:
        exclude = ''

    flags = models + exclude

    dag = DAG(
        dag_base_name,
        catchup=False,
        max_active_runs=1,
        schedule_interval=schedule_interval,
        start_date=start_date,
        default_args=default_args,
    )

    dag_full_refresh = DAG(
        dag_base_name + '_full_refresh',
        catchup=False,
        max_active_runs=1,
        schedule_interval=None,
        start_date=start_date,
        default_args=default_args,
    )

    folder = folder or (os.environ.get('AIRFLOW_HOME') or conf.get('core', 'airflow_home')).replace(
        'airflow_home/airflow',
        'dbt_home',
    )

    bash_command = '''
    cd {1}
    source env/bin/activate
    cd {2}
    dbt {0}
    '''.format(
        '{0}',
        folder,
        project_name,
    )

    sensor_sql = """
        SELECT
            CASE WHEN COUNT(*) = 0 THEN 1 ELSE 0 END -- only run if the count is exactly 0
        FROM public.dag_run
        WHERE dag_id IN ('{0}', '{1}')
        and state = 'running'
        and not (run_id = '{2}')
    """.format(
        dag.dag_id,
        dag_full_refresh.dag_id,
        '{{ run_id }}',
    )

    # refactor?! nothing ties these env values to what profiles.yml expects
    if dwh_engine == EC.DWH_ENGINE_POSTGRES:
        conn = BaseHook.get_connection(dwh_conn_id)
        env = {
            'DBT_DWH_HOST': str(conn.host),
            'DBT_DWH_USER': str(conn.login),
            'DBT_DWH_PASS': str(conn.password),
            'DBT_DWH_PORT': str(conn.port),
            'DBT_DWH_DBNAME': str(conn.schema),
            'DBT_DWH_SCHEMA': dbt_schema_name,
            'DBT_PROFILES_DIR': folder,
        }
    elif dwh_engine == EC.DWH_ENGINE_SNOWFLAKE:
        analytics_conn = BaseHook.get_connection(dwh_conn_id)
        analytics_conn_extra = analytics_conn.extra_dejson
        env = {
            'DBT_ACCOUNT': analytics_conn_extra.get(
                'account',
                analytics_conn.host,
            ),
            'DBT_USER': analytics_conn.login,
            'DBT_PASS': analytics_conn.password,
            'DBT_ROLE': analytics_conn_extra.get('role'),
            'DBT_DB': analytics_conn_extra.get('database'),
            'DBT_WH': analytics_conn_extra.get('warehouse'),
            'DBT_SCHEMA': dbt_schema_name,
            'DBT_PROFILES_DIR': folder,
        }
    else:
        raise ValueError('DWH type not implemented!')

    #with dag:
    snsr = SqlSensor(
        task_id='sense_dbt_conflict_avoided',
        conn_id=airflow_conn_id,
        sql=sensor_sql,
        dag=dag,
    )

    dbt_seed = BashOperator(
        task_id='run_dbt_seed',
        bash_command=bash_command.format('seed'),
        env=env,
        dag=dag,
    )

    dbt_run = BashOperator(
        task_id='run_dbt',
        bash_command=bash_command.format('run' + flags),
        env=env,
        dag=dag,
    )

    dbt_test = BashOperator(
        task_id='test_dbt',
        bash_command=bash_command.format('test' + flags),
        env=env,
        dag=dag,
    )

    dbt_docs = BashOperator(
        task_id='create_dbt_docs',
        bash_command=bash_command.format('docs generate'),
        env=env,
        dag=dag,
    )

    snsr >> dbt_seed >> dbt_run >> dbt_test

    if analytics_reader:
        # This should not occur when using Snowflake
        read_rights = PostgresOperator(
            task_id='grant_access_to_read_users',
            sql=analytics_reader_sql,
            postgres_conn_id=dwh_conn_id,
            dag=dag,
        )
        dbt_test >> read_rights >> dbt_docs
    else:
        dbt_test >> dbt_docs


    #with dag_full_refresh:
    snsr = SqlSensor(
        task_id='sense_dbt_conflict_avoided',
        conn_id=airflow_conn_id,
        sql=sensor_sql,
        dag=dag_full_refresh,
    )

    dbt_seed = BashOperator(
        task_id='run_dbt_seed',
        bash_command=bash_command.format('seed'),
        env=env,
        dag=dag_full_refresh,
    )

    dbt_run = BashOperator(
        task_id='run_dbt',
        bash_command=bash_command.format('run --full-refresh' + flags),
        env=env,
        dag=dag_full_refresh,
    )

    dbt_test = BashOperator(
        task_id='test_dbt',
        bash_command=bash_command.format('test' + flags),
        env=env,
        dag=dag_full_refresh,
    )

    dbt_docs = BashOperator(
        task_id='create_dbt_docs',
        bash_command=bash_command.format('docs generate'),
        env=env,
        dag=dag_full_refresh,
    )

    snsr >> dbt_seed >> dbt_run >> dbt_test

    if analytics_reader:
        read_rights = PostgresOperator(
            task_id='grant_access_to_read_users',
            sql=analytics_reader_sql,
            postgres_conn_id=dwh_conn_id,
            dag=dag_full_refresh,
        )
        dbt_test >> read_rights >> dbt_docs
    else:
        dbt_test >> dbt_docs
    return (dag, dag_full_refresh)
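A hypothetical invocation of the factory above; the connection ids, schema and model selections are placeholders, and exposing the returned DAGs at module level is what lets the Airflow scheduler discover them:

# Placeholder connection ids, schema and model names for illustration.
dbt_dag, dbt_dag_full_refresh = dbt_dags_factory(
    dwh_engine=EC.DWH_ENGINE_POSTGRES,
    dwh_conn_id='dwh_postgres',
    project_name='analytics',
    dbt_schema_name='analytics',
    airflow_conn_id='airflow_metadata_db',
    models=['staging', 'marts'],
)

# Airflow picks up DAG objects defined at module level.
globals()[dbt_dag.dag_id] = dbt_dag
globals()[dbt_dag_full_refresh.dag_id] = dbt_dag_full_refresh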
Example #19
          schedule_interval='@hourly',
          max_active_runs=1,
          catchup=False)

# select posts that include 'china' or 'hong kong' in the title within the previous hour
QUERY = """
        SELECT *
        FROM {table}
        WHERE (post_title ILIKE '%hong kong%' OR post_title ILIKE '%china%')
        AND creation_datetime BETWEEN DATE_TRUNC('hour', NOW() - interval '1 hour') AND DATE_TRUNC('hour', NOW())
        """.format(table=config['reddit_news_db']['table'])

sensor = SqlSensor(
    task_id="news_sensor",
    soft_fail=True,  # mark task as SKIPPED on failure
    conn_id=config['Airflow']['postgres_conn_id'],
    sql=QUERY,
    poke_interval=1,  # fail right away if no records are found
    timeout=1,
    dag=dag)

email_alert = EmailOperator(
    task_id='email_alert',
    to=['*****@*****.**'],
    subject='China/Hong Kong news available',
    html_content=
    '<h1>There is news about China or Hong Kong within the previous hour</h1>',
    dag=dag)

sensor >> email_alert
Example #20
    'start_date': datetime(2019, 1, 21, 0),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('generate_user_page_log', default_args=default_args,
          schedule_interval='@hourly')

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % {'dy':DY, 'dm':DM, 'dd':DD, 'dh':DH, 'lhdy':LHDY,
                     'lhdm':LHDM, 'lhdd':LHDD, 'lhdh':LHDH},
    timeout=60 * 60 * 13, # 13h timeout (S3 load dag has 12h timeout)
    poke_interval=15 * 60, # 15m between pokes
    dag=dag
)

create_partitions = PostgresOperator(
    task_id="create_table",
    postgres_conn_id='postgres_data_warehouse',
    sql=create_sql % {'dy':DY, 'dm':DM, 'dd':DD, 'dh':DH, 'lhdy':LHDY,
                     'lhdm':LHDM, 'lhdd':LHDD, 'lhdh':LHDH},
    dag=dag
)

move_rows_to_partitions = PostgresOperator(
    task_id="move_rows_to_partitions",
Example #21
def fun_task_timeout_monitor(ds, db_name, table_name, **op_kwargs):
    tb = [{
        "db": db_name,
        "table": table_name,
        "partition": "dt={pt}".format(pt=ds),
        "timeout": "7200"
    }]

    TaskTimeoutMonitor().set_task_monitor(tb)


import_data_validate = SqlSensor(task_id="import_data_validate",
                                 conn_id='ofood_db',
                                 sql='''
        select 
        count(1)
        from food_operapay_co.jh_crontab_log
        where day = '{{ ds_nodash }}' and crontab_status = 'bi-ok'
    ''',
                                 dag=dag)
'''
List of tables to import:
db_name, table_name, conn_id, prefix_name
'''
#

table_list = [

    # Order-related data
    ("food_operapay_co", "jh_order", "ofood_db", "base"),
    ("food_operapay_co", "jh_waimai_order", "ofood_db", "base"),