Example #1
    # Excerpted from a test class; mock_run_cli is injected by a mock.patch
    # decorator (not shown in this excerpt) that replaces the dbt hook's run_cli.
    def test_dbt_run(self, mock_run_cli):
        operator = DbtRunOperator(
            task_id='run',
            dag=self.dag
        )
        operator.execute(None)
        mock_run_cli.assert_called_once_with('run')
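A minimal sketch of the surrounding test class, assuming the standard unittest pattern and that the patched hook is airflow_dbt's DbtCliHook (an assumption; the excerpt shows neither the decorator nor setUp):

import datetime
from unittest import TestCase, mock

from airflow import DAG
from airflow_dbt.hooks.dbt_hook import DbtCliHook  # assumed hook class and path
from airflow_dbt.operators.dbt_operator import DbtRunOperator


class TestDbtRunOperator(TestCase):
    def setUp(self):
        # A throwaway DAG so the operator has a parent to attach to.
        self.dag = DAG('test_dag', start_date=datetime.datetime(2020, 1, 1))

    @mock.patch.object(DbtCliHook, 'run_cli')
    def test_dbt_run(self, mock_run_cli):
        operator = DbtRunOperator(task_id='run', dag=self.dag)
        operator.execute(None)
        # The operator should invoke the dbt CLI exactly once with `run`.
        mock_run_cli.assert_called_once_with('run')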
Example #2
import datetime
from datetime import timedelta

from airflow import DAG
from airflow_dbt.operators.dbt_operator import DbtRunOperator, DbtTestOperator

execution_date = '{{ yesterday_ds }}'
now = datetime.datetime.now()
execution_hour = datetime.datetime.now().replace(microsecond=0, second=0, minute=0) - timedelta(hours=1)

dbt_tags = 'tag:daily' if now.hour == 1 else 'tag:hourly'

default_args = {
    'dir': '/checkout-dot-com'
}


with DAG(dag_id='dbt', default_args=default_args, schedule_interval='@hourly') as dag:
    dbt_run = DbtRunOperator(
        task_id='dbt_run',
        models=dbt_tags,
        vars={
            'execution_date': execution_date,
            'execution_hour': execution_hour
        },
        retries=1
    )

    dbt_test = DbtTestOperator(
        task_id='dbt_test',
        # Run will only fail if a test does, no need to retry.
        retries=0
    )

    dbt_run >> dbt_test
Example #3
from airflow import DAG
from airflow_dbt.operators.dbt_operator import (DbtSeedOperator,
                                                DbtSnapshotOperator,
                                                DbtRunOperator,
                                                DbtTestOperator)
from airflow.utils.dates import days_ago

default_args = {
    'dir': '/home/klox-dev/.venv/bin/dbt',
    'start_date': days_ago(0)
}

with DAG(dag_id='dbt_airflow',
         default_args=default_args,
         schedule_interval='@once') as dag:

    dbt_seed = DbtSeedOperator(task_id='dbt_seed')

    dbt_snapshot = DbtSnapshotOperator(task_id='dbt_snapshot')

    dbt_run = DbtRunOperator(task_id='dbt_run')

    dbt_test = DbtTestOperator(
        task_id='dbt_test',
        retries=0,  # Failing tests would fail the task, and we don't want Airflow to try again
    )

    dbt_seed >> dbt_snapshot >> dbt_run >> dbt_test
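Note that the 'dir' default here points at a dbt executable inside a virtualenv; in airflow-dbt, dir is normally the dbt project directory, while the path to the executable is passed separately via dbt_bin. A hedged variant of these defaults (the project directory below is a hypothetical path, not taken from the example):

default_args = {
    'dbt_bin': '/home/klox-dev/.venv/bin/dbt',   # path to the dbt executable
    'dir': '/home/klox-dev/dbt_project',         # hypothetical dbt project directory
    'start_date': days_ago(0)
}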
Example #4
# Tail of an earlier GreatExpectationsOperator(...) call that validates the
# source table; its opening arguments are cut off in this excerpt.
                                                  'schema': 'example',
                                                  'table': 'taxi_zone_lookup',
                                                  'data_asset_name': 'taxi_zone_lookup'
                                              },
                                              'expectation_suite_name': 'custom_sql_query.warning'
                                          }],
                                          data_context_root_dir=GE_ROOT_DIR,
                                          dag=dag)

dbt_run = DbtRunOperator(task_id='dbt_run',
                         dir=DBT_PROJECT_DIR,
                         profiles_dir=DBT_ROOT_DIR,
                         target=DBT_TARGET,
                         dag=dag)

validate_transform = GreatExpectationsOperator(
    task_id='validate_transform',
    expectation_suite_name='taxi_zone_incremental.source',
    batch_kwargs={
        'datasource': 'spark-thrift-server',
        'schema': 'example',
        'table': 'taxi_zone_incremental',
        'data_asset_name': 'taxi_zone_incremental'
    },
    data_context_root_dir=GE_ROOT_DIR,
    dag=dag)
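The excerpt stops before the task dependencies are declared. Assuming the truncated operator at the top is assigned to a name such as validate_source (a hypothetical name, not taken from the example), a typical ordering would be:

# Hypothetical wiring: validate the source table, run dbt, then validate the output.
validate_source >> dbt_run >> validate_transform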
Example #5
# Tail of a truncated operator call; its opening arguments are cut off in this
# excerpt.
        'datasource': 'challenge_src'
    },
    dag=dag)

sync_task = BashOperator(
    task_id='sync_validations',
    depends_on_past=False,
    bash_command=templated_command,
    params={
        'task_name': 'Done with validations',
        'start_date': default_args['start_date']
    },
    dag=dag,
)

dbt_task = DbtRunOperator(task_id='dbt', dag=dag)
# dbt_task = DummyOperator(task_id='dbt', dag=dag)

done_task = BashOperator(
    task_id='done',
    depends_on_past=False,
    bash_command=templated_command,
    params={
        'task_name': 'All done',
        'start_date': default_args['start_date']
    },
    dag=dag,
)

start_task >> [valid_prod_task, valid_cust_task, valid_ordr_task] >> sync_task >> dbt_task >> done_task
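start_task, the three valid_*_task validation tasks, default_args, and templated_command are all defined earlier in the truncated file. A hypothetical stand-in for the Jinja-templated bash command, only to show how the params passed above are consumed:

# Hypothetical template; the real templated_command is not shown in the excerpt.
templated_command = """
echo "{{ params.task_name }} (pipeline start date: {{ params.start_date }})"
"""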
Example #6
# Tail of a truncated configuration dict; its opening lines are cut off in this
# excerpt.
        'ACL': 'public-read'
    }
}

dbt_vars = {
    'DBT_USER': Variable.get('DBT_USER'),
    'DBT_PASSWORD': Variable.get('DBT_PASSWORD')
}

with DAG(dag_id='covid_19_bokeh_app_etl',
         default_args=default_args,
         schedule_interval="0 */3 * * *") as dag:

    dbt_seed = DbtSeedOperator(task_id='dbt_seed', vars=dbt_vars)

    dbt_run = DbtRunOperator(task_id='dbt_run', vars=dbt_vars)

    dbt_test = DbtTestOperator(task_id='dbt_test', vars=dbt_vars, retries=0)

    dbt_seed >> dbt_run >> dbt_test

    extract_jobs = {
        'jhu_cases': jhu_cases_etl,
        'jhu_deaths': jhu_deaths_etl,
        'jhu_lookup': jhu_lookup_etl,
        'jhu_us_cases': jhu_us_cases_etl,
        'jhu_us_deaths': jhu_us_deaths_etl,
        'local_uk_data': local_uk_data_etl,
        'owid_global_vaccinations': owid_global_vaccinations_etl,
        'bloomberg_global_vaccinations': bloomberg_global_vaccinations_etl
    }
    # (the code that consumes extract_jobs is cut off in this excerpt)
Example #7
default_args = json.loads(Variable.get('covid19'))
default_args.update({"retry_delay": timedelta(minutes=default_args["retry_delay"])})


dbt_dir = os.environ["DBT_DIR"]
dbt_profiles_dir = os.environ["DBT_PROFILES_DIR"]

with DAG('covid19_dbt',
    default_args=default_args,
    description='Managing dbt data pipeline',
    schedule_interval='@daily') as dag:

    ingest_covid19_day_task = Covid19ToIngestions(task_id='ingest_covid19_day_to_dbt', dag=dag)

    dbt_run = DbtRunOperator(
        task_id='dbt_run',
        dir=dbt_dir,
        profiles_dir=dbt_profiles_dir,
        models='covid19_stats_materialized'
    )

    dbt_test = DbtTestOperator(
        task_id='dbt_test',
        dir=dbt_dir,
        profiles_dir=dbt_profiles_dir,
        retries=0,  # Failing tests would fail the task, and we don't want Airflow to try again
        models='covid19_stats_materialized'
    )

    ingest_covid19_day_task >> dbt_run >> dbt_test
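Covid19ToIngestions is a custom operator imported elsewhere in the truncated file, as are DAG, Variable, the dbt operators, json, os, and timedelta. A minimal hypothetical stand-in, only so the snippet parses on its own; the real operator presumably loads the day's COVID-19 data into the schema that dbt reads from:

from airflow.models import BaseOperator


class Covid19ToIngestions(BaseOperator):
    """Hypothetical stand-in for the project's custom ingestion operator."""

    def execute(self, context):
        # Replace with the project's actual ingestion logic.
        raise NotImplementedError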
Example #8
import datetime

from airflow import DAG
# The BigQueryOperator import path depends on the Airflow/provider version in use.
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow_dbt.operators.dbt_operator import DbtRunOperator, DbtTestOperator

default_args = {'retries': 0, 'start_date': datetime.datetime(2019, 8, 10)}

dag = DAG(
    'analytics_pipeline',
    default_args=default_args,
    schedule_interval=None,  # set to '0 8 * * *' for daily at 8am
    catchup=False,
    # tells Airflow where to find the SQL templates
    template_searchpath='/home/preidy/Code/analytics_pipeline/airflow/sql/')

extract_and_load = BigQueryOperator(
    task_id='extract_and_load',
    sql='extract_sql.sql',
    destination_dataset_table='pat-scratch.analytics_pipeline.raw__nyc_311',
    write_disposition='WRITE_TRUNCATE',  # overwrite the entire table if it exists
    use_legacy_sql=False,
    location='US',
    project_id='pat-scratch',
    dag=dag)

dbt_run = DbtRunOperator(task_id='dbt_run',
                         dir='/home/preidy/Code/analytics_pipeline/nyc311/',
                         dag=dag)

dbt_test = DbtTestOperator(task_id='dbt_test',
                           dir='/home/preidy/Code/analytics_pipeline/nyc311/',
                           dag=dag)

extract_and_load >> dbt_run >> dbt_test