Example #1
    def test_postgres_operator_test_multi(self):
        sql = [
            "CREATE TABLE IF NOT EXISTS test_airflow (dummy VARCHAR(50))",
            "TRUNCATE TABLE test_airflow",
            "INSERT INTO test_airflow VALUES ('X')",
        ]
        from airflow.providers.postgres.operators.postgres import PostgresOperator
        op = PostgresOperator(
            task_id='postgres_operator_test_multi', sql=sql, dag=self.dag)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #2
    def test_vacuum(self):
        """
        Verifies the VACUUM operation runs well with the PostgresOperator
        """

        sql = "VACUUM ANALYZE;"
        op = PostgresOperator(task_id='postgres_operator_test_vacuum',
                              sql=sql,
                              dag=self.dag,
                              autocommit=True)
        op.run(start_date=DEFAULT_DATE,
               end_date=DEFAULT_DATE,
               ignore_ti_state=True)
    def delete_data(symbol, symbol_id, start_ts, end_ts, granularity):
        """
        Deletes rows from table specified by a time range - (start_ts, end_ts)
        before writing to the table.

        Args
            - symbol (str): coin pair which the Binance API recognises.
            - symbol_id (int): database id of the symbol, used in the WHERE clause.
            - start_ts (int): time range beginning, in epoch timestamp
            - end_ts (int): time range end, in epoch timestamp
            - granularity (str): Time period of granularity.
                ex: 1m, 5m, 15m.
                See Binance's API documentation for further options.

        Returns
            - True, once the PostgresOperator issuing the DELETE has been created.
        """
        table = "ticker_data_{:s}".format(granularity)
        CMD = """
            DELETE FROM {table}
            WHERE close_time BETWEEN {start_ts} AND {end_ts}
                AND symbol_id = {symbol_id}
        """.format(table=table,
                   start_ts=start_ts,
                   end_ts=end_ts,
                   symbol_id=symbol_id)

        PostgresOperator(
            task_id="delete_data_{:s}_{:s}".format(symbol, table),
            postgres_conn_id=POSTGRES_CONN_ID,
            sql=CMD,
        )
        return True
    def test_overwrite_schema(self):
        """
        Verifies option to overwrite connection schema
        """

        sql = "SELECT 1;"
        op = PostgresOperator(
            task_id='postgres_operator_test_schema_overwrite',
            sql=sql,
            dag=self.dag,
            autocommit=True,
            database="foobar",
        )

        from psycopg2 import OperationalError
        try:
            op.run(start_date=DEFAULT_DATE,
                   end_date=DEFAULT_DATE,
                   ignore_ti_state=True)
        except OperationalError as e:
            assert 'database "foobar" does not exist' in str(e)
Example #5
    def test_postgres_operator_test(self):
        sql = """
        CREATE TABLE IF NOT EXISTS test_airflow (
            dummy VARCHAR(50)
        );
        """
        op = PostgresOperator(task_id='basic_postgres', sql=sql, dag=self.dag)
        op.run(start_date=DEFAULT_DATE,
               end_date=DEFAULT_DATE,
               ignore_ti_state=True)

        autocommit_task = PostgresOperator(
            task_id='basic_postgres_with_autocommit',
            sql=sql,
            dag=self.dag,
            autocommit=True)
        autocommit_task.run(start_date=DEFAULT_DATE,
                            end_date=DEFAULT_DATE,
                            ignore_ti_state=True)
def covid_per_popgroup_subdag(parent_dag_id, child_dag_id, args):
    with DAG(
            dag_id=f'{parent_dag_id}.{child_dag_id}',
            default_args=args,
            #start_date= datetime(2021,5,2), #days_ago(2), #datetime.datetime.now(), #days_ago(2),
            #schedule_interval = '@once',
            #tags=['covid'],
    ) as dag:

        last_date_popgroup_task = get_last_date_popgroup(
            task_id="last_date_popgroup_task")

        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(current_dir)
        download_recent_cdc_task = SparkSubmitOperator(
            task_id="download_recent_cdc_task",
            conn_id="spark_default",
            application=os.path.join(root_dir, "python",
                                     "stage_recent_cdc.py"),
            application_args=[
                "--apptoken",
                Variable.get("socrata_apptoken"),
                "--last_date",
                "{{ti.xcom_pull( task_ids = 'last_date_popgroup_task', key = 'last_cdc_date')}}",
            ],
        )

        insert_covid_pergroup_task = PostgresOperator(
            task_id="insert_covid_pergroup_task",
            postgres_conn_id="postgres_default",
            sql="""
                    INSERT INTO covid_per_popgroup(cdc_case_earliest_dt, sex_id, age_group_id, race_ethnicity_id, count)
                    SELECT cdc_case_earliest_dt, sex_id, age_group_id, race_ethnicity_id, count
                    FROM recent_cdc AS n
                    JOIN  dim_age_group AS a ON a.age_group = n.age_group
                    JOIN dim_sex AS s ON s.sex = n.sex
                    JOIN dim_race_ethnicity AS e ON e.race = n.race_ethnicity_combined
                    ;
                """)
        last_date_popgroup_task >> download_recent_cdc_task >> insert_covid_pergroup_task
        return dag
default_args = {
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5)
}

with DAG('avocado_dag',
         default_args=default_args,
         description='Forecasting avocado prices',
         schedule_interval='*/10 * * * *',
         start_date=datetime(2020, 1, 1),
         catchup=False) as dag:

    creating_table = PostgresOperator(task_id='creating_table',
                                      sql='sql/CREATE_TABLE_ACCURACIES.sql',
                                      postgres_conn_id='postgres')

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    sanity_check = PythonOperator(task_id="sanity_check",
                                  python_callable=check_dataset)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    n_estimators = [100, 150]
    max_features = ['auto', 'sqrt']
        'password': user['login']['password'],
        'email': user['email']
    })
    processed_user.to_csv('/tmp/processed_user.csv', index=None, header=False)


with DAG('user_processing_postgres',
         schedule_interval='@daily',
         start_date=datetime(2021, 1, 1),
         catchup=True) as dag:
    create_table = PostgresOperator(task_id='create_table',
                                    postgres_conn_id='db_postgres',
                                    sql='''
            CREATE TABLE IF NOT EXISTS test.users (
                firstname TEXT NOT NULL,
                lastname TEXT NOT NULL,
                country TEXT NOT NULL,
                username TEXT NOT NULL,
                password TEXT NOT NULL,
                email TEXT NOT NULL PRIMARY KEY
            );
            ''')

    is_api_available = HttpSensor(task_id='is_api_available',
                                  http_conn_id='user_api',
                                  endpoint='api/')

    extracting_usr = SimpleHttpOperator(
        task_id='extracting_user',
        http_conn_id='user_api',
        endpoint='api/',
        method='GET',
Example #9
import datetime
from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'Dylan Bragdon'}

with DAG(
        dag_id='create_postgres_tables',
        start_date=days_ago(2),
        schedule_interval=None,
        default_args=default_args,
        catchup=False,
) as dag:
    create_post_table = PostgresOperator(task_id='create_tables',
                                         postgres_conn_id='hackernews_connect',
                                         sql="sql/post_schema.sql")
Example #10
def covid_subdag(parent_dag_id, child_dag_id, args):
    with DAG(
            dag_id=f'{parent_dag_id}.{child_dag_id}',
            default_args=args,
            #start_date= datetime(2021,5,2), #days_ago(2), #datetime.datetime.now(), #days_ago(2),
            #schedule_interval = '@once',
            #tags=['covid'],
    ) as dag:

        download_nyt_task = download_fromweb(task_id="download_nyt")

        create_recent_per_county_table = PostgresOperator(
            task_id="create_recent_per_county_table",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_create_recent_per_county_table,
        )

        load_recent_per_county_table = PostgresOperator(
            task_id="load_recent_per_county_table",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_load_recent_per_county_table,
        )
        # add location_id key to new data
        add_location_id = PostgresOperator(
            task_id="add_location_id",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_add_location_id)
        # to compute daily data from cumulated data, it is necessary to have the previous cumulated data
        # for some counties with very few cases / deaths, previous data does not appear in the 1-month window
        # we must thus retrieve it from past data and add it to recent data
        # this is done in 2 phases :
        #   - First, get min(date) in recent data for each location_id (table min_recent)
        #   - Then, compare with max(date) for past_data :
        #       - if min(recent) > max(past) : add max(past) for location_id to recent table
        retrieve_past_data = PostgresOperator(
            task_id="retrieve_past_data",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_retrieve_past_data)
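        # A minimal sketch of what a statement like sql_covid_counties.q_retrieve_past_data
        # could look like, following the two phases described above (table and column
        # names are assumptions here, not the project's actual SQL):
        #
        #   WITH min_recent AS (
        #       SELECT location_id, MIN(date) AS min_date
        #       FROM recent_per_county
        #       GROUP BY location_id
        #   )
        #   INSERT INTO recent_per_county (location_id, date, cases, deaths)
        #   SELECT p.location_id, p.date, p.cases, p.deaths
        #   FROM full_per_county AS p
        #   JOIN min_recent AS m USING (location_id)
        #   WHERE p.date = (SELECT MAX(date) FROM full_per_county
        #                   WHERE location_id = p.location_id)
        #     AND m.min_date > p.date;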

        # compute daily deaths and cases from cumulated values :
        # - partition by location_id and order by date
        # - compute previous cases and deaths by "shifting" the case and deaths column
        # - compute daily data : cumulated_current - cumulated_prev
        compute_daily_stats = PostgresOperator(
            task_id="compute_daily_stats",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_compute_daily_stats)
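        # A minimal sketch of the windowed query that a statement like
        # sql_covid_counties.q_compute_daily_stats could be built on (illustrative only;
        # table and column names are assumptions):
        #
        #   SELECT location_id, date,
        #          cases  - LAG(cases,  1, 0) OVER w AS daily_cases,
        #          deaths - LAG(deaths, 1, 0) OVER w AS daily_deaths
        #   FROM recent_per_county
        #   WINDOW w AS (PARTITION BY location_id ORDER BY date);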

        filter_date = PostgresOperator(task_id="filter_date",
                                       postgres_conn_id="postgres_default",
                                       sql=sql_covid_counties.q_filter_date)
        update_last_date = PostgresOperator(
            task_id="update_last_date",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_update_last_date)
        append_full_per_county = PostgresOperator(
            task_id="append_full_per_county",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_append_full_per_county)

        drop_recent_per_county_table = PostgresOperator(
            task_id="drop_recent_per_county_table",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_drop_recent_per_county_table)
        drop_recent_per_county_table_final = PostgresOperator(
            task_id="drop_recent_per_county_table_final",
            postgres_conn_id="postgres_default",
            sql=sql_covid_counties.q_drop_recent_per_county_table)

    download_nyt_task >> drop_recent_per_county_table >> create_recent_per_county_table >> load_recent_per_county_table >> add_location_id >> retrieve_past_data >> compute_daily_stats >> filter_date >> append_full_per_county >> update_last_date >> drop_recent_per_county_table_final

    return dag
with DAG(
        'orchestration_good_practices',
        start_date=datetime(2021, 1, 1),
        schedule_interval='@daily',
        catchup=False,
        default_args={
            'owner': 'airflow',
            'email_on_failure': False,
            'retries': 1,
            'retry_delay': timedelta(minutes=1)
        },
) as dag:

    opr_refresh_mat_view = PostgresOperator(
        task_id='refresh_mat_view',
        postgres_conn_id='postgres_default',
        sql='REFRESH MATERIALIZED VIEW example_view;',
    )

    opr_submit_run = DatabricksSubmitRunOperator(
        task_id='submit_run',
        databricks_conn_id='databricks',
        new_cluster=new_cluster,
        notebook_task=notebook_task)
    opr_run_now = DatabricksRunNowOperator(task_id='run_now',
                                           databricks_conn_id='databricks',
                                           job_id=5,
                                           notebook_params=notebook_params)

    opr_refresh_mat_view >> opr_submit_run >> opr_run_now
        dag_id='dag_psql',
        default_args=default_args,
        schedule_interval='@monthly',
        start_date=datetime(
            2021, 4,
            27),  #days_ago(2), #datetime.datetime.now(), #days_ago(2),
        tags=["covid", "psql"],
) as dag:

    create_recent_per_county_table = PostgresOperator(
        task_id="create_recent_per_county_table",
        postgres_conn_id="postgres_default",
        sql="""
            CREATE TABLE IF NOT EXISTS recent_per_county (
                date date,
                county text,
                state text,
                fips text,
                cases int,
                deaths int 
                );
                """,
    )

    load_recent_per_county_table = PostgresOperator(
        task_id="load_recent_per_county_table",
        postgres_conn_id="postgres_default",
        sql="""
            COPY recent_per_county FROM '/tmp/us-counties-recent.csv' WITH CSV HEADER ;
        """)
    # add location_id key to new data
    add_location_id = PostgresOperator(task_id="add_location_id",
Example #13
# create_pet_table, populate_pet_table, get_all_pets, and get_birth_date are examples of tasks created by
# instantiating the Postgres Operator

with DAG(
        dag_id="postgres_operator_dag",
        start_date=datetime.datetime(2020, 2, 2),
        schedule_interval="@once",
        catchup=False,
) as dag:
    # [START postgres_operator_howto_guide_create_pet_table]
    create_pet_table = PostgresOperator(
        task_id="create_pet_table",
        sql="""
            CREATE TABLE IF NOT EXISTS pet (
            pet_id SERIAL PRIMARY KEY,
            name VARCHAR NOT NULL,
            pet_type VARCHAR NOT NULL,
            birth_date DATE NOT NULL,
            OWNER VARCHAR NOT NULL);
          """,
    )
    # [END postgres_operator_howto_guide_create_pet_table]
    # [START postgres_operator_howto_guide_populate_pet_table]
    populate_pet_table = PostgresOperator(
        task_id="populate_pet_table",
        sql="""
            INSERT INTO pet (name, pet_type, birth_date, OWNER)
            VALUES ( 'Max', 'Dog', '2018-07-05', 'Jane');
            INSERT INTO pet (name, pet_type, birth_date, OWNER)
            VALUES ( 'Susie', 'Cat', '2019-05-01', 'Phil');
            INSERT INTO pet (name, pet_type, birth_date, OWNER)
Example #14
    'retry_delay': timedelta(minutes=2),
    'catchup': False
}

append_data = Variable.get('load_mode') == 'append-only'

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 sql='create_tables.sql',
                                 postgres_conn_id='redshift')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='log_data/',
    aws_region='us-west-2',
    jsonpath='log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    with open("/tmp/wikipageviews", "r") as f:
        for line in f:
            domain_code, page_title, view_counts, _ = line.split(" ")
            if domain_code == "en" and page_title in pagenames:
                result[page_title] = view_counts

    with open("/tmp/postgres_query.sql", "w") as f:
        for pagename, pageviewcount in result.items():
            f.write("INSERT INTO pageview_counts VALUES ("
                    f"'{pagename}', {pageviewcount}, '{execution_date}'"
                    ");\n")


fetch_pageviews = PythonOperator(
    task_id="fetch_pageviews",
    python_callable=_fetch_pageviews,
    op_kwargs={
        "pagenames": {"Google", "Amazon", "Apple", "Microsoft", "Facebook"}
    },
    dag=dag,
)

write_to_postgres = PostgresOperator(
    task_id="write_to_postgres",
    postgres_conn_id="my_postgres",
    sql="postgres_query.sql",
    dag=dag,
)

get_data >> extract_gz >> fetch_pageviews >> write_to_postgres
Example #16
{% endfor %}
"""

t3 = BashOperator(
    task_id='templated',
    depends_on_past=False,
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},
    dag=dag,
)

conn_db_src = PostgresHook(postgres_conn_id='PostgresNDoD')
sqlalchemy_engine = conn_db_src.get_sqlalchemy_engine()

create_nDoD_tables = PostgresOperator(task_id="create_nDoD_tables",
                                      postgres_conn_id="PostgresNDoD",
                                      sql="./sql/nDoD_schema.sql")


def load_utp_group(filename, conn_db, schema_db, **kwargs):
    """Load data into db"""

    df = pd.read_csv(filename, sep=';')

    logging.info('CSV in pandas \n %s', df.head().to_string())

    conn = conn_db.connect()
    conn.execute("TRUNCATE TABLE ndod.utp_group CASCADE")
    df.to_sql('utp_group',
              con=conn_db,
              schema=schema_db,
Example #18
def dbt_dags_factory_legacy(
    dwh_engine,
    dwh_conn_id,
    project_name,
    dbt_schema_name,
    airflow_conn_id,
    dag_base_name="DBT_run",
    analytics_reader=None,  # list of users of DWH who are read-only
    schedule_interval=timedelta(hours=1),
    start_date=datetime(2019, 1, 1),
    default_args=None,
    folder=None,
    models=None,
    exclude=None,
):

    if analytics_reader:
        for statement in (
                "insert",
                "update",
                "delete",
                "drop",
                "create",
                "select",
                ";",
                "grant",
        ):
            for reader in analytics_reader:
                if statement in reader.lower():
                    raise Exception(
                        "Error! The analytics reader {0} is invalid.".format(reader))

        # analytics_reader = analytics_reader.split(',')
        analytics_reader_sql = f'\nGRANT USAGE ON SCHEMA "{dbt_schema_name}"'
        analytics_reader_sql += ' TO "{0}";'
        analytics_reader_sql += (f'''
        \nGRANT SELECT ON ALL TABLES IN SCHEMA "{dbt_schema_name}"''' +
                                 ' TO "{0}";')
        analytics_reader_sql = "".join(
            [analytics_reader_sql.format(i) for i in analytics_reader])

    if models and not (type(models) == str):
        models = " --models " + " ".join(models)
    else:
        models = ""

    if exclude and not (type(exclude) == str):
        exclude = " --exclude " + " ".join(exclude)
    else:
        exclude = ""

    flags = models + exclude

    dag = DAG(
        dag_base_name,
        catchup=False,
        max_active_runs=1,
        schedule_interval=schedule_interval,
        start_date=start_date,
        default_args=default_args,
    )

    dag_full_refresh = DAG(
        dag_base_name + "_full_refresh",
        catchup=False,
        max_active_runs=1,
        schedule_interval=None,
        start_date=start_date,
        default_args=default_args,
    )

    folder = folder or (os.environ.get("AIRFLOW_HOME")
                        or conf.get("core", "airflow_home")).replace(
                            "airflow_home/airflow",
                            "dbt_home",
                        )

    bash_command = """
    cd {1}
    source env/bin/activate
    cd {2}
    dbt {0}
    """.format(
        "{0}",
        folder,
        project_name,
    )

    sensor_sql = """
        SELECT
            CASE WHEN COUNT(*) = 0 THEN 1 ELSE 0 END -- only run if exactly equal to 0
        FROM public.dag_run
        WHERE dag_id IN ('{0}', '{1}')
        and state = 'running'
        and not (run_id = '{2}')
    """.format(
        dag._dag_id,
        dag_full_refresh._dag_id,
        "{{ run_id }}",
    )

    # refactor?! not coupled to values in profiles.yml!
    if dwh_engine == EC.DWH_ENGINE_POSTGRES:
        conn = BaseHook.get_connection(dwh_conn_id)
        env = {
            "DBT_DWH_HOST": str(conn.host),
            "DBT_DWH_USER": str(conn.login),
            "DBT_DWH_PASS": str(conn.password),
            "DBT_DWH_PORT": str(conn.port),
            "DBT_DWH_DBNAME": str(conn.schema),
            "DBT_DWH_SCHEMA": dbt_schema_name,
            "DBT_PROFILES_DIR": folder,
        }
    elif dwh_engine == EC.DWH_ENGINE_SNOWFLAKE:
        analytics_conn = BaseHook.get_connection(dwh_conn_id)
        analytics_conn_extra = analytics_conn.extra_dejson
        env = {
            "DBT_ACCOUNT":
            analytics_conn_extra.get(
                "account",
                analytics_conn.host,
            ),
            "DBT_USER":
            analytics_conn.login,
            "DBT_PASS":
            analytics_conn.password,
            "DBT_ROLE":
            analytics_conn_extra.get("role"),
            "DBT_DB":
            analytics_conn_extra.get("database"),
            "DBT_WH":
            analytics_conn_extra.get("warehouse"),
            "DBT_SCHEMA":
            dbt_schema_name,
            "DBT_PROFILES_DIR":
            folder,
        }
    else:
        raise ValueError("DWH type not implemented!")
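    # For context: the env var names set above have to line up with how profiles.yml
    # reads them. A minimal sketch of a matching Postgres profile (illustrative only;
    # the profile name and the project's actual profiles.yml are assumptions):
    #
    #   my_profile:
    #     target: prod
    #     outputs:
    #       prod:
    #         type: postgres
    #         host: "{{ env_var('DBT_DWH_HOST') }}"
    #         user: "{{ env_var('DBT_DWH_USER') }}"
    #         password: "{{ env_var('DBT_DWH_PASS') }}"
    #         port: "{{ env_var('DBT_DWH_PORT') }}"
    #         dbname: "{{ env_var('DBT_DWH_DBNAME') }}"
    #         schema: "{{ env_var('DBT_DWH_SCHEMA') }}"
    #
    # DBT_PROFILES_DIR tells dbt where to find this file.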

    # with dag:
    snsr = EWAHSqlSensor(
        task_id="sense_dbt_conflict_avoided",
        conn_id=airflow_conn_id,
        sql=sensor_sql,
        poke_interval=5 * 60,
        mode="reschedule",  # don't block a worker and pool slot
        dag=dag,
    )

    dbt_seed = BashOperator(
        task_id="run_dbt_seed",
        bash_command=bash_command.format("seed"),
        env=env,
        dag=dag,
    )

    dbt_run = BashOperator(
        task_id="run_dbt",
        bash_command=bash_command.format("run" + flags),
        env=env,
        dag=dag,
    )

    dbt_test = BashOperator(
        task_id="test_dbt",
        bash_command=bash_command.format("test" + flags),
        env=env,
        dag=dag,
    )

    dbt_docs = BashOperator(
        task_id="create_dbt_docs",
        bash_command=bash_command.format("docs generate"),
        env=env,
        dag=dag,
    )

    snsr >> dbt_seed >> dbt_run >> dbt_test

    if analytics_reader:
        # This should not occur when using Snowflake
        read_rights = PostgresOperator(
            task_id="grant_access_to_read_users",
            sql=analytics_reader_sql,
            postgres_conn_id=dwh_conn_id,
            dag=dag,
        )
        dbt_test >> read_rights >> dbt_docs
    else:
        dbt_test >> dbt_docs

    # with dag_full_refresh:
    snsr = EWAHSqlSensor(
        task_id="sense_dbt_conflict_avoided",
        conn_id=airflow_conn_id,
        sql=sensor_sql,
        poke_interval=5 * 60,
        mode="reschedule",  # don't block a worker and pool slot
        dag=dag_full_refresh,
    )

    dbt_seed = BashOperator(
        task_id="run_dbt_seed",
        bash_command=bash_command.format("seed"),
        env=env,
        dag=dag_full_refresh,
    )

    dbt_run = BashOperator(
        task_id="run_dbt",
        bash_command=bash_command.format("run --full-refresh" + flags),
        env=env,
        dag=dag_full_refresh,
    )

    dbt_test = BashOperator(
        task_id="test_dbt",
        bash_command=bash_command.format("test" + flags),
        env=env,
        dag=dag_full_refresh,
    )

    dbt_docs = BashOperator(
        task_id="create_dbt_docs",
        bash_command=bash_command.format("docs generate"),
        env=env,
        dag=dag_full_refresh,
    )

    snsr >> dbt_seed >> dbt_run >> dbt_test

    if analytics_reader:
        read_rights = PostgresOperator(
            task_id="grant_access_to_read_users",
            sql=analytics_reader_sql,
            postgres_conn_id=dwh_conn_id,
            dag=dag_full_refresh,
        )
        dbt_test >> read_rights >> dbt_docs
    else:
        dbt_test >> dbt_docs
    return (dag, dag_full_refresh)
Example #19
def daily_weather_subdag(parent_dag_id, child_dag_id, args):      

    with DAG(
        dag_id=f'{parent_dag_id}.{child_dag_id}',
        default_args=args,
        #schedule_interval= '@once', #'@monthly', # for testing #  @daily',
        #start_date= datetime(2021,5,2), #days_ago(2), #datetime.datetime.now(), #days_ago(2),
    #    end_date= datetime(2021,5,2), #days_ago(2), #datetime.datetime.now(), #days_ago(2),
       # max_active_runs = 1,
        #tags=['covid'],
        ) as dag:
        
            #download_diff_weather_task = download_diff_weather(task_id = "download_diff_weather")
            download_diff_weather_task = dummy_download(task_id = "download_diff_weather")
            # combining jinja templating, xcom, task params and raw sql strings is a real pain
            stage_recent_insert = PostgresOperator(
                task_id = "stage_recent_insert",
                postgres_conn_id = "postgres_default",
                sql= sq.q_create_stage__weather_table,
                params = { "stage_table" : "recent_insert",
                        "stage_file" : "insert.csv"
                        }
                )
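            # For illustration: PostgresOperator renders its sql field with Jinja, so a
            # query such as sq.q_create_stage__weather_table can reference the params
            # above. A hypothetical sketch (column names and paths are assumptions, not
            # the project's actual SQL):
            #
            #   DROP TABLE IF EXISTS {{ params.stage_table }};
            #   CREATE TABLE {{ params.stage_table }} (station text, date date, element text, value int);
            #   COPY {{ params.stage_table }} FROM '/tmp/{{ params.stage_file }}' WITH CSV HEADER;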
            stage_recent_delete = PostgresOperator(
                task_id = "stage_recent_delete",
                postgres_conn_id = "postgres_default",
                sql = sq.q_create_stage__weather_table,
                params = { "stage_table" : "recent_delete",
                        "stage_file" : "delete.csv"
                        }
                )
            stage_recent_update = PostgresOperator(
                task_id = "stage_recent_update",
                postgres_conn_id = "postgres_default",
                sql = sq.q_create_stage__weather_table,
                params = { "stage_table" : "recent_update",
                        "stage_file" : "update.csv"
                        }          
                )
            
            filter_insert = PostgresOperator(
                task_id = "filter_insert",
                postgres_conn_id = "postgres_default",
                sql = sq.q_filter_weather,
                params= {"full_table" : "recent_insert", 
                        "filtered_table" : "recent_insert_filtered",
                        "selected_stations" : "weatherelem_station"}
                )
            
            filter_delete = PostgresOperator(
                task_id = "filter_delete",
                postgres_conn_id = "postgres_default",
                sql = sq.q_filter_weather,
                params= {"full_table" : "recent_delete", 
                        "filtered_table" : "recent_delete_filtered",
                        "selected_stations" : "weatherelem_station"}
                )
            filter_update = PostgresOperator(
                task_id = "filter_update",
                postgres_conn_id = "postgres_default",
                sql = sq.q_filter_weather,
                params= {"full_table" : "recent_update", 
                        "filtered_table" : "recent_update_filtered",
                        "selected_stations" : "weatherelem_station"}
                )
            
            process_delete = PostgresOperator(
                task_id = "process_delete",
                postgres_conn_id = "postgres_default",
                sql = sq.q_process_delete_weather
                    )
            
            process_insert = PostgresOperator(
                task_id = "process_insert",
                postgres_conn_id = "postgres_default",
                sql = sq.q_process_insert_weather
                    )
            process_update = PostgresOperator(
                task_id = "process_update",
                postgres_conn_id = "postgres_default",
                sql = sq.q_process_update_weather
                    )
            
            # filter_quality_check = PostgresOperator(
            #     task_id="filter_quality_check",
            #     postgres_conn_id="postgres_default",
            #     sql="""
            #     """,
            # )
            
        
            download_diff_weather_task >> [stage_recent_insert, stage_recent_update, stage_recent_delete]
            stage_recent_insert >> filter_insert
            stage_recent_delete >> filter_delete >> process_delete
            stage_recent_update >> filter_update >> process_update
            stage_recent_insert >> filter_insert >> process_insert
            process_delete >> process_update >> process_insert
            
            return dag
Example #20
def _remove_sample_data_from_s3():
    s3_hook = S3Hook()
    if s3_hook.check_for_key(f'{S3_KEY}/{REDSHIFT_TABLE}', S3_BUCKET):
        s3_hook.delete_objects(S3_BUCKET, f'{S3_KEY}/{REDSHIFT_TABLE}')


with DAG(dag_id="example_s3_to_redshift",
         start_date=days_ago(1),
         schedule_interval=None,
         tags=['example']) as dag:
    setup__task_add_sample_data_to_s3 = PythonOperator(
        python_callable=_add_sample_data_to_s3,
        task_id='setup__add_sample_data_to_s3')
    setup__task_create_table = PostgresOperator(
        sql=f'CREATE TABLE IF NOT EXISTS {REDSHIFT_TABLE}(Id int, Name varchar)',
        postgres_conn_id='redshift_default',
        task_id='setup__create_table',
    )
    # [START howto_operator_s3_to_redshift_task_1]
    task_transfer_s3_to_redshift = S3ToRedshiftOperator(
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
        schema="PUBLIC",
        table=REDSHIFT_TABLE,
        copy_options=['csv'],
        task_id='transfer_s3_to_redshift',
    )
    # [END howto_operator_s3_to_redshift_task_1]
    teardown__task_drop_table = PostgresOperator(
        sql=f'DROP TABLE IF EXISTS {REDSHIFT_TABLE}',
        postgres_conn_id='redshift_default',
Example #21
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.postgres.operators.postgres import PostgresOperator
from etl_spotify import get_tracks, save_tracks_to_db

with DAG(dag_id='etl_spotify',
         start_date=datetime(2021, 7, 1),
         schedule_interval='@daily') as dag:

    create_tracks_table = PostgresOperator(
        task_id='create_table',
        sql='create_table.sql',
        postgres_conn_id='postgres_default',
    )

    getting_spotify_data = PythonOperator(
        task_id='getting_data',
        python_callable=get_tracks,
        provide_context=True,
    )

    insert_data_to_db = PythonOperator(task_id='insert_data_to_db',
                                       python_callable=save_tracks_to_db,
                                       provide_context=True)

    create_tracks_table >> getting_spotify_data >> insert_data_to_db
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('metrics_dag',
         start_date=datetime(2021, 2, 15),
         schedule_interval='@once',
         default_args=default_args,
         template_searchpath='/usr/local/airflow/include',
         catchup=False
         ) as dag:

    run_first_task = PostgresOperator(
        task_id='task_2_1',
        postgres_conn_id='postgres',
        sql='task_2_1.sql',
        params={'k': '50', 'p_date': '2019-01'}
    )
    run_second_task = PostgresOperator(
        task_id='task_2_2',
        postgres_conn_id='postgres',
        sql='task_2_2.sql'
    )

    run_first_task >> run_second_task
        os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',
                     'create_tables.sql')) as f:
    create_tables_sql = f.read()

dag = DAG(
    'udacity_sparkify_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='@hourly',
    catchup=False,
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id="create_tables",
                                 dag=dag,
                                 postgres_conn_id="redshift",
                                 sql=create_tables_sql)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    json_format="'s3://udacity-dend/log_json_path.json'",
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
Example #24
                job_flow_id=job_flow_id,
                step_id="{{ task_instance.xcom_pull(task_ids='" + 'add_step_{}'.format(key) + "', key='return_value')[0] }}",
                aws_conn_id='aws_default'
            )
            entity_task_list.append(watch_step)

        # if file has sql quality steps, then create additional tasks in DAG
        if file['sql_step_args']:
            # must drop duplicates in table as spark jdbc does not support upsert
            # https://issues.apache.org/jira/browse/SPARK-19335
            drop_table_duplicates = PostgresOperator(
                task_id='drop_table_duplicates_{table}'.format(table=file['sql_step_args']['db_table']),
                postgres_conn_id='postgres_default',
                sql="""
                        DELETE FROM {table} t1 
                        USING {table} t2 
                        WHERE 
                            t1.ctid < t2.ctid AND 
                            t1.{duplicate_key} = t2.{duplicate_key};
                    """.format(table=file['sql_step_args']['db_table'], duplicate_key=file['sql_step_args']['duplicate_key']),
            )

            entity_task_list.append(drop_table_duplicates)

            check_table_has_no_duplicates = SQLValueCheckOperator(
                task_id='check_table_has_no_duplicates_{table}'.format(table=file['sql_step_args']['db_table']),
                conn_id='postgres_default',
                sql="""
                        SELECT COALESCE(
                            (
                                SELECT COUNT({duplicate_key})
            "email_on_retry": False,
            "retries": 1,
            "retry_delay": timedelta(minutes=1),
        },
        catchup=False,
        template_searchpath="/usr/local/airflow/include",
) as dag:

    t0 = DummyOperator(task_id="start")

    # Define Task Group with Postgres Queries
    with TaskGroup("covid_table_queries") as covid_table_queries:
        for state in states:
            generate_files = PostgresOperator(
                task_id="covid_query_{0}".format(state),
                postgres_conn_id="gpdb",
                sql="covid_state_query.sql",
                params={"state": "'" + state + "'"},
            )

    # Define task to send email
    send_email = EmailOperator(
        task_id="send_email",
        to=email_to,
        subject="Covid Greenplum Queries DAG",
        html_content=
        "<p>The Covid queries were run on Greenplum successfully. <p>",
    )

    # Define task dependencies
    t0 >> covid_table_queries >> send_email
         default_args=default_args,
         catchup=False) as dag:

    get_slots = PythonOperator(task_id="get_slots",
                               provide_context=True,
                               python_callable=get_slots)

    choose_next_task = BranchPythonOperator(task_id='choose_next_task',
                                            python_callable=_choose_next_task,
                                            do_xcom_push=False)

    skipped_task = DummyOperator(task_id='skipped_task')

    email_task = PythonOperator(
        task_id='email_task',
        python_callable=notify_email,
        provide_context=True,
        dag=dag,
    )

    delete_xcom_task = PostgresOperator(
        task_id='delete-xcom-task',
        postgres_conn_id='postgres_1',
        sql="delete from xcom where dag_id='" + dag.dag_id +
        "' and date(execution_date)=date('{{ ds }}')",
        trigger_rule='one_success',
        dag=dag)

    get_slots >> choose_next_task >> [skipped_task, email_task] >> delete_xcom_task
Example #27
import airflow.utils.dates
from airflow import DAG
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.providers.postgres.operators.postgres import PostgresOperator
from datetime import datetime, timedelta

default_args = {"owner": "airflow", "start_date": datetime(2020, 1, 1)}

with DAG(dag_id="cleaning_dag",
         default_args=default_args,
         schedule_interval="*/10 * * * *",
         catchup=False) as dag:

    waiting_for_task = ExternalTaskSensor(task_id='waiting_for_task',
                                          external_dag_id='avocado_dag',
                                          external_task_id='publish_notebook',
                                          failed_states=['failed'])

    cleaning_xcoms = PostgresOperator(task_id='cleaning_xcoms',
                                      sql='sql/CLEANING_XCOMS.sql',
                                      postgres_conn_id='postgres')

    waiting_for_task >> cleaning_xcoms
Example #28
from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'Anil',
    'dag_id': 'PostgreOperatorTest_Dag',
    'start_date': days_ago(1),
    'schedule_interval': '@daily'
}

dag: DAG = DAG(dag_id='PostgreOperatorTest_Dag',
               default_args=default_args,
               template_searchpath='/opt/airflow/')

task1 = PostgresOperator(task_id="create_table",
                         postgres_conn_id='postgres_new',
                         sql="sql/create_table.sql",
                         params={'table_name': 'covid'},
                         database="userdata",
                         dag=dag)
        op_kwargs={
            'dest_table': 'stg_purchases',
            'sql': 'select * from purchases where "purchase_date" = %s',
            'params': [execution_date],
        })

    load_full_products_data = PythonOperator(
        task_id='load_full_products',
        python_callable=transfer_oltp_olap,
        op_kwargs={
            'dest_table': 'stg_products',
            'sql': 'select * from products',
        })

    delete_products_sales_exec_date = PostgresOperator(
        task_id='delete_products_sales_exec_date',
        postgres_conn_id='olap',
        sql='delete_products_sales_exec_date.sql')

    join_purchases_with_products = PostgresOperator(
        task_id='join_purchases_products',
        postgres_conn_id='olap',
        sql='join_purchases_with_products.sql')

    union_incremental_products_sales = PostgresOperator(
        task_id='union_staging_to_products_sales',
        postgres_conn_id='olap',
        sql='union_staging_to_products_sales.sql')

    agg_sales_category = PostgresOperator(task_id='rebuild_agg_sales_category',
                                          postgres_conn_id='olap',
                                          sql='agg_sales_category.sql')
def create_incremental_refresh_dag(
        # fmt: off
        dag_id,
        schedule,
        symbol,
        granularity,
        symbol_id,
        default_args
    # fmt: on
):
    """
    Function returns a dag instance which loads data from
    Binance's S3 bucket to load incremental data.
    S3 Bucket containing monthly/daily data can be used to load
    data containing different intervals.

    Args:
        dag_id (str): dag ID.
        schedule (str): schedule at which DAG should run.
        symbol (str): Stock/Crypto symbol which will be used to pull data.
        symbol_id (int): Symbol Id of the symbol which was shared.
        granularity (str): Time period for which candles are created.
                Ex: 1m, 5m etc.
    Returns:
        instance of Airflow DAG.
    """
    def load_data(symbol, symbol_id, start_ts, end_ts, granularity):
        """
        Used to pull incremental data specified by a
        time range in epoc timestamp - (start_ts, end_ts)
        Args
            - symbol (str): coin pair which the Binance API recognises.
            - symbol_id (int): database id of the symbol, attached to the dataframe.
            - start_ts (int): time range beginning, in epoch timestamp
            - end_ts (int): time range end, in epoch timestamp
            - granularity (str): Time period of granularity.
                ex: 1m, 5m, 15m.
                See Binance's API documentation for further options.

        Returns
            - None; the fetched candles are written to Postgres via pg_hook.
        """
        api = BinanceAPI()

        def get_ts_shift(ts, i):
            # shift ts by i days and return the result as an epoch timestamp in seconds
            return round(ts.timestamp()) + 86400 * i

        # calculate total 24h batches required to backfill data

        time_diff_secs = (end_ts - start_ts).total_seconds()
        batches = round(time_diff_secs / 60 / 60 / 24)

        log_message = "Getting data for {}: {} to {}"
        logger.info(log_message.format(symbol, start_ts, end_ts))
        data = [
            api.get_historical(
                symbol,
                start_ts=get_ts_shift(start_ts, i),
                end_ts=get_ts_shift(start_ts, i + 1) + 1,
                granularity=granularity,
            ) for i in range(batches)
        ]

        df = pd.concat(data)
        df.loc[:, "symbol_id"] = symbol_id
        pg_hook.write_pandas_df(
            df.to_json(),
            name="tickerdata_{:s}".format(granularity),
            if_exists="append",
            index=False,
        )

    def delete_data(symbol, symbol_id, start_ts, end_ts, granularity):
        """
        Deletes rows from table specified by a time range - (start_ts, end_ts)
        before writing to the table.

        Args
            - symbol (str): coin pair which the Binance API recognises.
            - symbol_id (int): database id of the symbol, used in the WHERE clause.
            - start_ts (int): time range beginning, in epoch timestamp
            - end_ts (int): time range end, in epoch timestamp
            - granularity (str): Time period of granularity.
                ex: 1m, 5m, 15m.
                See Binance's API documentation for further options.

        Returns
            - True, once the PostgresOperator issuing the DELETE has been created.
        """
        table = "ticker_data_{:s}".format(granularity)
        CMD = """
            DELETE FROM {table}
            WHERE close_time BETWEEN {start_ts} AND {end_ts}
                AND symbol_id = {symbol_id}
        """.format(table=table,
                   start_ts=start_ts,
                   end_ts=end_ts,
                   symbol_id=symbol_id)

        PostgresOperator(
            task_id="delete_data_{:s}_{:s}".format(symbol, table),
            postgres_conn_id=POSTGRES_CONN_ID,
            sql=CMD,
        )
        return True

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)

    with dag:
        # fmt: off
        t1 = PythonOperator(task_id="delete_data_{:s}".format(symbol),
                            python_callable=delete_data)
        t2 = PythonOperator(task_id="load_data_{:s}".format(symbol),
                            python_callable=load_data)
        # fmt: on

        SQL = """UPDATE symbols
            SET symbol_last_updated_at=timezone('utc', NOW())
            WHERE symbol_id = {:d}""".format(symbol_id)
        t3 = PostgresOperator(
            task_id="Set_{:s}_last_updated_at".format(symbol),
            postgres_conn_id=POSTGRES_CONN_ID,
            sql=SQL,
        )
        t1 >> t2 >> t3

    return dag