Example #1
    def test_postgres_operator_test_multi(self):
        sql = [
            "TRUNCATE TABLE test_airflow",
            "INSERT INTO test_airflow VALUES ('X')",
        ]
        from airflow.operators.postgres_operator import PostgresOperator
        t = PostgresOperator(
            task_id='postgres_operator_test_multi', sql=sql, dag=self.dag)
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
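When `sql` is a list, as above, PostgresOperator runs the statements in order within a single task. A minimal, self-contained sketch of the same pattern outside the test harness, assuming a Postgres connection named `postgres_default` and reusing the legacy import path from these examples (the table name is only illustrative):

from datetime import datetime

from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

with DAG(
    dag_id="postgres_multi_statement_example",
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    # Both statements run in one PostgresOperator task, in list order.
    reset_table = PostgresOperator(
        task_id="reset_test_table",
        postgres_conn_id="postgres_default",
        sql=[
            "TRUNCATE TABLE test_airflow",
            "INSERT INTO test_airflow VALUES ('X')",
        ],
    )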
Example #2
    def test_vacuum(self):
        """
        Verifies the VACUUM operation runs well with the PostgresOperator
        """
        from airflow.operators.postgres_operator import PostgresOperator

        sql = "VACUUM ANALYZE;"
        t = PostgresOperator(
            task_id='postgres_operator_test_vacuum',
            sql=sql,
            dag=self.dag,
            autocommit=True)
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
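The `autocommit=True` flag is what makes this work: VACUUM cannot run inside a transaction block, and the underlying Postgres connection only skips the transaction wrapper when autocommit is enabled.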
Example #3
    def test_postgres_operator_test(self):
        sql = """
        CREATE TABLE IF NOT EXISTS test_airflow (
            dummy VARCHAR(50)
        );
        """
        from airflow.operators.postgres_operator import PostgresOperator
        t = PostgresOperator(task_id='basic_postgres', sql=sql, dag=self.dag)
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        autocommitTask = PostgresOperator(
            task_id='basic_postgres_with_autocommit',
            sql=sql,
            dag=self.dag,
            autocommit=True)
        autocommitTask.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #4
    def test_overwrite_schema(self):
        """
        Verifies option to overwrite connection schema
        """
        from airflow.operators.postgres_operator import PostgresOperator

        sql = "SELECT 1;"
        t = PostgresOperator(
            task_id='postgres_operator_test_schema_overwrite',
            sql=sql,
            dag=self.dag,
            autocommit=True,
            database="foobar",
        )

        from psycopg2._psycopg import OperationalError
        try:
            t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                  ignore_ti_state=True)
        except OperationalError as e:
            assert 'database "foobar" does not exist' in str(e)
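Here the `database` argument overrides the schema stored on the Airflow connection when the hook is built, so pointing it at the non-existent `foobar` database is expected to fail with the OperationalError asserted above.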
Example #5
        ) for key, file in files_to_download.items()
    ]

    # 6. Create TABLE
    create_tables = [
        BashOperator(
            task_id=f"create_table_{key}",
            bash_command=f"psql {pg_params()} < {tmp_dir}/{dag_id}_{key}.sql",
        ) for key, _ in files_to_download.items()
    ]

    # 7. RE-define GEOM type (because ogr2ogr cannot set geom with .csv import)
    redefine_geom = [
        PostgresOperator(
            task_id=f"re-define_geom_{key}",
            sql=SET_GEOM,
            params=dict(tablename=key),
        ) for key, _ in files_to_download.items()
    ]

    # 8. Rename COLUMNS based on Provenance
    provenance_translation = ProvenanceRenameOperator(
        task_id="rename_columns",
        dataset_name=dag_id,
        rename_indexes=False,
        pg_schema="public",
    )

    # 9. Drop Existing TABLE
    drop_tables = [
        PostgresOperator(
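The `SET_GEOM` and drop statements themselves are not part of this fragment; what `params=dict(tablename=key)` provides is Jinja templating of the `sql` field. A hypothetical, self-contained illustration of that mechanism (connection id and table name are assumptions):

from datetime import datetime

from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

with DAG(
    dag_id="templated_sql_example",
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    # `sql` is a templated field, so {{ params.tablename }} is rendered from
    # the `params` dict before the statement is sent to Postgres.
    count_rows = PostgresOperator(
        task_id="count_rows",
        postgres_conn_id="postgres_default",
        sql="SELECT count(*) FROM {{ params.tablename }};",
        params=dict(tablename="some_table"),  # illustrative table name
    )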
Example #6
standardize_borough = standardize_task('standardize_borough')
standardize_address = standardize_task('standardize_address')


def yes_trigger(_, dag):
    return dag


trigger_facdb_3_geoprocessing = TriggerDagRunOperator(
    task_id='trigger_facdb_3_geoprocessing',
    trigger_dag_id='facdb_3_geoprocessing',
    python_callable=yes_trigger,
    dag=facdb_2_assembly)

## ORDER TASKS

facdb_2_assembly >> create

for task_file in os.listdir(
        "/home/airflow/airflow/dags/facdb_2_assembly/config"):
    config = PostgresOperator(task_id=task_file[:-4],
                              postgres_conn_id='facdb',
                              sql="/facdb_2_assembly/config/" + task_file,
                              dag=facdb_2_assembly)
    create >> config >> join_sourcedatainfo

(join_sourcedatainfo >> standardize_fixallcaps >> standardize_capacity >>
 standardize_oversightlevel >> standardize_agencytag >> standardize_trim >>
 standardize_factypes >> standardize_borough >> standardize_address >>
 create_bblbin_one2one >> create_uid >> trigger_facdb_3_geoprocessing)
Example #7
    def check_table():
        return "sense_the_csv"

    check_for_table = BranchPythonOperator(
        task_id="Check_for_table",
        python_callable=check_table,
    )

    # TASK 2 (runs if the table is not already there) -> Create the table in Postgres.
    create_table = PostgresOperator(
        task_id="Create_Table_in_Postgres",
        sql="""CREATE TABLE sampletable(
            Date text,
            Open text,
            High text,
            Low text,
            Close text
        );
        """,
    )

    # TASK 2 (If the table is there, then this will be the second task) -> Wait for the file using the FileSensor
    file_sensing_task = FileSensor(
        trigger_rule=TriggerRule.ONE_SUCCESS,
        task_id="sense_the_csv",
        filepath="sample1.csv",
        fs_conn_id="my_file_system",
        poke_interval=10,
    )
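The callable given to BranchPythonOperator returns the task_id of the path to follow ("sense_the_csv" here) and tasks on the other path are skipped; the trigger_rule=TriggerRule.ONE_SUCCESS on the FileSensor lets it run once either upstream path succeeds, so the skipped create-table branch does not block it (the task ordering itself is not shown in this fragment).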
Example #8
    'retries': 3,
    'email_on_retry': False,
    'email_on_failure': False,
    'retry_delay': timedelta(minutes=5),
    'depends_on_past': False
}

with DAG('avocado_dag',
         default_args=default_args,
         description='Forecasting avocado prices',
         schedule_interval='*/10 * * * *',
         start_date=datetime(2020, 1, 1),
         catchup=False) as dag:

    creating_accuray_table = PostgresOperator(
        task_id='creating_accuray_table',
        sql='sql/CREATE_TABLE_ACCURACIES.sql',
        postgres_conn_id='postgres')

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    sanity_check = PythonOperator(task_id='sanity_check',
                                  python_callable=check_dataset,
                                  provide_context=True)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    n_estimators = [100, 150]
Example #9
# - Set "Login" and "Password"
# - Set "Port" to 5439


def load_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook("aws_credentials", client_type="s3")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")
    redshift_hook.run(
        ss.COPY_ALL_TRIPS_SQL.format(credentials.access_key,
                                     credentials.secret_key))


dag = DAG('aws_s3_to_redshift', start_date=datetime.datetime.now())

create_table = PostgresOperator(task_id="create_table",
                                dag=dag,
                                postgres_conn_id="redshift",
                                sql=ss.CREATE_TRIPS_TABLE_SQL)

copy_task = PythonOperator(task_id='load_from_s3_to_redshift',
                           dag=dag,
                           python_callable=load_data_to_redshift)

location_traffic_task = PostgresOperator(task_id="calculate_location_traffic",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=ss.LOCATION_TRAFFIC_SQL)

create_table >> copy_task
copy_task >> location_traffic_task
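`ss.COPY_ALL_TRIPS_SQL` lives in the imported helper module and is not shown here; a hypothetical Redshift COPY template matching the two positional `.format()` arguments (access key, then secret key) could look like the following, with the bucket path being an assumption:

COPY_ALL_TRIPS_SQL = """
COPY trips
FROM 's3://some-bucket/trips.csv'
ACCESS_KEY_ID '{}'
SECRET_ACCESS_KEY '{}'
IGNOREHEADER 1
DELIMITER ','
"""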
Example #10
JOIN asamoilov.dm_payment_report_dim_billing_year dby ON s.billing_year_key = dby.billing_year_key
JOIN asamoilov.dm_payment_report_dim_legal_type dlt ON s.legal_type_key = dlt.legal_type_key
JOIN asamoilov.dm_payment_report_dim_district dd ON s.district_key = dd.district_key
JOIN asamoilov.dm_payment_report_dim_billing_mode dbm ON s.billing_mode_key = dbm.billing_mode_key
JOIN asamoilov.dm_payment_report_dim_registration_year dry ON s.registration_year_key = dry.registration_year_key;
"""

SQL_DROP_DM_PAYMENT_REPORT_TMP = """
DROP TABLE IF EXISTS asamoilov.dm_payment_report_tmp_{{ execution_date.year }};
"""

start_load = DummyOperator(task_id="start_load", dag=dag)

dm_payment_report_tmp = PostgresOperator(
    task_id="dm_payment_report_tmp",
    dag=dag,
    # postgres_conn_id="postgres_default",
    sql=SQL_DM_PAYMENT_REPORT_TMP)

dm_payment_report_dim_billing_year = PostgresOperator(
    task_id="dm_payment_report_dim_billing_year",
    dag=dag,
    # postgres_conn_id="postgres_default",
    sql=SQL_DM_PAYMENT_REPORT_DIM_BILLING_YEAR)

dm_payment_report_dim_legal_type = PostgresOperator(
    task_id="dm_payment_report_dim_legal_type",
    dag=dag,
    # postgres_conn_id="postgres_default",
    sql=SQL_DM_PAYMENT_REPORT_DIM_LEGAL_TYPE)
Example #11
        credentials.secret_key,
    )
    redshift_hook.run(sql_stmt)


dag = DAG(
    'lesson2.exercise3',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_STATIONS_TABLE_SQL,
Example #12
    provenance_translation = ProvenanceRenameOperator(
        task_id="rename_columns",
        dataset_name=f"{dag_id}",
        prefix_table_name=f"{dag_id}_",
        postfix_table_name="_new",
        rename_indexes=False,
        pg_schema="public",
    )

    # 8. Revalidate invalid geometry records
    # the source has some invalid records
    # to do: inform the source maintainer
    revalidate_geometry_records = [
        PostgresOperator(
            task_id=f"revalidate_geometry_{key}",
            sql=[
                f"UPDATE {dag_id}_{key}_new SET geometry = ST_CollectionExtract((st_makevalid(geometry)),3) WHERE 1=1 AND ST_IsValid(geometry) is false; COMMIT;",
            ],
        )
        for key in tables_to_create.keys()
    ]

    # Prepare the checks and add them per source to a dictionary
    for key in tables_to_create.keys():

        total_checks.clear()
        count_checks.clear()
        geo_checks.clear()

        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{key}",
        task_id="slack_at_start",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
        username="******",
    )

    check_table_exists = PostgresXcomOperator(task_id="check_table_exists",
                                              sql=SQL_EXISTS_CHECK,
                                              do_xcom_push=True)

    branch_task = BranchPythonOperator(task_id="branch_task",
                                       provide_context=True,
                                       python_callable=choose_branch)

    update_oplaadpalen = PostgresOperator(
        task_id="update_oplaadpalen", sql=f"{sql_path}/oplaadpalen_copy.sql")

    create_oplaadpalen = PostgresOperator(
        task_id="create_oplaadpalen", sql=f"{sql_path}/oplaadpalen_create.sql")

    # The trigger_rule is essential, otherwise the skipped path blocks progress
    import_allego = PythonOperator(
        task_id="import_allego",
        python_callable=import_oplaadpalen,
        trigger_rule="none_failed_or_skipped",
        op_args=[
            PostgresHook(postgres_conn_id=dag.default_args["postgres_conn_id"]
                         ).get_conn()
        ],
    )
Example #14
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('transfer_data',
          catchup=False,
          default_args=default_args,
          schedule_interval=None,
          max_active_runs=1)

task_create_tables = PostgresOperator(
    task_id='task_create_tables',
    sql=SQL_PATH + 'create_transfer_and_aggregate_tables.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

task_create_views = PostgresOperator(task_id='task_create_views',
                                     sql=SQL_PATH + 'create_views.sql',
                                     postgres_conn_id='my_local_db',
                                     dag=dag)

task_load_transfer_table = PythonOperator(task_id='task_load_transfer_table',
                                          python_callable=load_transfer,
                                          provide_context=True,
                                          dag=dag)

task_transfer_to_aggregate_table = PostgresOperator(
    task_id='task_transfer_to_aggregate_table',
Example #15
        ) as dag:
        
        task_1 = BashOperator(
            task_id = 'copy_csv_file',
            bash_command= 'cp "/mnt/c/Users/sivkumar/Documents/Project Refference/VNPT/Data/network data/DIM_PM_PROVINCE.csv" /home/siva/files/',
            dag = dag
        )
        
        task_2 = PostgresOperator(
            task_id = 'create_table',
            database = 'postgres',
            postgres_conn_id = "postgres_localhost",
            sql = """
                DROP TABLE IF EXISTS prestage.dim_province;

                CREATE TABLE prestage.dim_province(
                    province_id numeric(10),
                    code varchar(10),
                    province_name varchar(50),
                    status numeric(5)
                );
            """,
            dag = dag
        )

        task_3 = PythonOperator(
            task_id = 'insert_into_postgres',
            provide_context=True,
            python_callable = load_to_dwh,
            dag=dag
        )
Example #16
    # retrieve credentials with "get_credentials()" method
    AWSCredentials = awsHookInstance.get_credentials()
    # instantiate PostgresHook Class
    derivedRedshiftHook = PostgresHook("redshift")
    # retrieve Station data bulk load SQL statement
    bulkLoadStatement = sql_statements.COPY_STATIONS_SQL.format(
         AWSCredentials.access_key
        ,AWSCredentials.secret_key
    )
    # execute SQL statement
    derivedRedshiftHook.run(bulkLoadStatement)

# create Tasks by instantiating Operator Classes
createTripsTable = PostgresOperator(
     task_id='createTripsTable'
    ,postgres_conn_id='redshift'
    ,sql=sql_statements.CREATE_TRIPS_TABLE_SQL
    ,dag=bulkLoadDag
)

createStationsTable = PostgresOperator(
     task_id='createStationsTable'
    ,postgres_conn_id='redshift'
    ,sql=sql_statements.CREATE_STATIONS_TABLE_SQL
    ,dag=bulkLoadDag
)

loadTripsData = PythonOperator(
     task_id='loadTripsData'
    ,python_callable=loadTripDataFromS3ToRedshift
    ,dag=bulkLoadDag
)
Example #17
        )
        if route.post_process:
            hook = PostgresHook(postgres_conn_id=postgres_conn_id)
            hook.run(route.post_process)


with DAG(dag_id, default_args=default_args) as dag:

    count_checks = []
    colname_checks = []
    geo_checks = []
    renames = []

    drop_old_tables = PostgresOperator(
        task_id="drop_old_tables",
        sql=DROP_TMPL,
        params=dict(tablenames=TABLES_TO_DROP),
    )

    import_geojson = PythonOperator(
        task_id="import_geojson",
        python_callable=_load_geojson,
        op_args=[default_args.get("postgres_conn_id", "postgres_default")],
    )

    for route in ROUTES:
        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{route.name}",
                pass_value=3,
                params=dict(table_name=route.tmp_db_table_name),
Example #18
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False,
    'catchup': False,
}

dag = DAG('data_pipeline_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id='create_tables',
                                 dag=dag,
                                 postgres_conn_id='redshift',
                                 sql='create_tables.sql')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    aws_credentials_id='aws_credentials',
    redshift_conn_id='redshift',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    json_parameter='s3://udacity-dend/log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

dag = DAG("user_behaviour",
          default_args=default_args,
          schedule_interval="0 0 * * *",
          max_active_runs=1)

end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline', dag=dag)

pg_unload = PostgresOperator(
    dag=dag,
    task_id='pg_unload',
    sql=unload_user_purchase,
    postgres_conn_id='postgres_default',
    params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
    depends_on_past=True,
    wait_for_downstream=True)

user_purchase_to_s3_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_s3_stage',
    python_callable=_local_to_s3,
    op_kwargs={
        'filename': temp_filtered_user_purchase,
        'key': temp_filtered_user_purchase_key,
    },
)

# remove_local_user_purchase_file = PythonOperator(
Example #20
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Youngest rider was born in {records[0][0]}")


dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow())

create_oldest_task = PostgresOperator(task_id="create_oldest",
                                      dag=dag,
                                      sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
                                      postgres_conn_id="redshift")

log_oldest_task = PythonOperator(task_id="log_oldest",
                                 dag=dag,
                                 python_callable=log_oldest)

create_youngest_task = PostgresOperator(task_id="create_youngest",
                                        dag=dag,
                                        sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
Example #21
with open(movie_clean_emr_steps) as json_file:
    emr_steps = json.load(json_file)

last_step = len(emr_steps) - 1

with DAG(dag_id="user_behaviour",
         default_args=default_args,
         schedule_interval="0 0 * * *",
         max_active_runs=1) as dag:

    end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline')
    # this task loads the user purchase data we want into a local CSV file
    pg_unload = PostgresOperator(
        task_id='pg_unload',
        sql=unload_user_purchase,
        postgres_conn_id='postgres_default',
        params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
        depends_on_past=True,
        wait_for_downstream=True)
    # this task deploys the local file onto S3
    user_purchase_to_s3_stage = PythonOperator(
        task_id='user_purchase_to_s3_stage',
        python_callable=_local_to_s3,
        op_kwargs={
            'filename': temp_filtered_user_purchase,
            'key': temp_filtered_user_purchase_key,
        },
    )
    # after deployment, remove the local file
    remove_local_user_purchase_file = PythonOperator(
        task_id='remove_local_user_purchase_file',
            task_id="update_SQL_data",
            python_callable=convert_biz_data,
            op_args=[
                f"{dag_id}_{dag_id}_new",
                f"{tmp_dir}/{dag_id}.utf8.sql",
                f"{tmp_dir}/{file}",
                f"{tmp_dir}/{dag_id}_updated_data_insert.sql",
            ],
        ) for files in files_to_download.values() for file in files
        if "xlsx" in file
    ]

    # 8. CREATE target TABLE (the ogr2ogr output is not used to create the table)
    create_table = PostgresOperator(
        task_id=f"create_target_table",
        sql=CREATE_TABLE,
        params=dict(tablename=f"{dag_id}_{dag_id}_new"),
    )

    # 9. Import data
    import_data = BashOperator(
        task_id="import_data",
        bash_command=
        f"psql {pg_params()} < {tmp_dir}/{dag_id}_updated_data_insert.sql",
    )

    # 10. UPDATE target TABLE (add display field content)
    update_table = PostgresOperator(
        task_id=f"update_target_table",
        sql=UPDATE_TABLE,
        params=dict(tablename=f"{dag_id}_{dag_id}_new"),
Example #23
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
    'depends_on_past': False,
    'schedule_interval': '@hourly'
}

dag = DAG('etl_task',
          default_args=default_args,
          description='Data Pipelines with Airflow')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id="create_tables",
                                 postgres_conn_id="redshift",
                                 sql="create_tables.sql",
                                 dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id="stage_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket='udacity-dend',
    s3_key="log_data/",
    extra_params="format as json 's3://udacity-dend/log_json_path.json'",
    dag=dag)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    redshift_conn_id="redshift",
Example #24
    'start_date': datetime(2021, 4, 12),
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'amvisitors_dbschema_dag',
    default_args=default_args,
    description='Clean DB and prepare Lookup Tables in Redshift with Airflow',
    schedule_interval=None,
    catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

drop_db_schema = PostgresOperator(task_id=f"drop_db_schema",
                                  dag=dag,
                                  postgres_conn_id='redshift',
                                  sql=SqlQueries.db_schema_drop)

lt_mode_to_redshift = LoadLookupToRedshiftOperator(
    task_id='Load_Mode_Lookup_Tables',
    provide_context=False,
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="dim_mode",
    sql_init_command=SqlQueries.mode_table_create,
    copy_options="IGNOREHEADER 0 FORMAT AS JSON 'auto'",
    s3_path="s3://lzalewsk-capstone/lookup_data/_i94model.json",
    region="us-west-2")

lt_location_to_redshift = LoadLookupToRedshiftOperator(
Example #25
importing_zones_from_S3 = PythonOperator(dag=dag,
                                         task_id='importing_zones_from_S3',
                                         provide_context=False,
                                         python_callable=dim_table)

# popular_destinations=PythonOperator(dag=dag,
#                             task_id='popular_destinations',
#                             provide_context=False,
#                             python_callable=run_sql,
#                             op_kwargs={'file_path': './sql/DML/popular_destinations_monthly.sql'})

postgres_conn_id = 'postgres_airflow'

create_stg_taxis_data = PostgresOperator(dag=dag,
                                         postgres_conn_id=postgres_conn_id,
                                         task_id='create_stg_taxis_data',
                                         sql='DDL/create_stg_taxis_data.sql')

staging_taxis_data = PostgresOperator(dag=dag,
                                      postgres_conn_id=postgres_conn_id,
                                      task_id='staging_taxis_data',
                                      sql='DML/staging_taxis_data.sql')

create_table_popular_destination_zones = PostgresOperator(
    dag=dag,
    postgres_conn_id=postgres_conn_id,
    task_id='create_table_popular_destination_zones',
    sql='DDL/create_table_popular_destination_zones.sql')

popular_destinations_zones_passengers = PostgresOperator(
    dag=dag,
Example #26
def standardize_task(task_id):
    return PostgresOperator(
        task_id=task_id,
        postgres_conn_id='facdb',
        sql="/facdb_2_assembly/standardize/{0}.sql".format(task_id),
        dag=facdb_2_assembly)
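Example #6 above shows this helper in use: each call yields one PostgresOperator whose SQL file path is derived from its task_id, e.g.

standardize_borough = standardize_task('standardize_borough')
standardize_address = standardize_task('standardize_address')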
Example #27
    Add IMDB ratings and metadata to shows
    '''


def promote_staging(**kwargs):
    '''
    Replace the live schema with the staging schema
    '''


default_args = {'owner': 'airflow', 'retries': 0}

with DAG('syncing_movie_and_tv_data',
         default_args=default_args,
         start_date=datetime.now(),
         schedule_interval=timedelta(minutes=10)) as dag:

    # TODO: use alembic for this?
    create_my_ratings = PostgresOperator(task_id='create_my_ratings',
                                         postgres_conn_id='postgres_movies',
                                         sql='''
        create table if not exists my_ratings(imdb_url text unique primary key, stars integer not null);
        truncate table my_ratings;
        ''',
                                         dag=dag)

    extract_airtable_shows = PythonOperator(
        task_id='extract_airtable_shows',
        python_callable=extract_airtable_shows,
        dag=dag)
Example #28
        task_id="insert_into_table_markering",
        sql_files=[f"{sql_path}/cmsa_data_insert_markering.sql"],
    )

    # 9. RENAME columns based on PROVENANCE
    provenance_translation = ProvenanceRenameOperator(
        task_id="provenance_rename",
        dataset_name=f"{dag_id}",
        prefix_table_name=f"{dag_id}_",
        postfix_table_name="_new",
        rename_indexes=False,
        pg_schema="public",
    )

    # 10. Rename temp named tables to final names
    rename_tables = PostgresOperator(task_id="rename_tables",
                                     sql=SQL_TABLE_RENAMES)

(slack_at_start >> mkdir >> download_geojson >> fetch_files >> proces_cmsa >>
 create_tables >> import_data >> fill_markering >> provenance_translation >>
 rename_tables)

dag.doc_md = """
    #### DAG summary
    This DAG contains crowd monitoring sensor data,
    the source is the CMSA (Crowd Monitoring Systeem Amsterdam)
    #### Mission Critical
    Classified as 2 (availability [range: 1,2,3])
    #### On Failure Actions
    Fix issues and rerun dag on working days
    #### Point of Contact
    Inform the businessowner at [businessowner]@amsterdam.nl
Example #29
    dag=dag,
)

#
# TODO: Consolidate check_trips and check_stations into a single check in the subdag
#       as we did with the create and copy in the demo
#
check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

check_stations = HasRowsOperator(task_id="check_stations_data",
                                 dag=dag,
                                 redshift_conn_id="redshift",
                                 table="stations")

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL)

#
# TODO: Reorder the Graph once you have moved the checks
#
trips_subdag_task >> check_trips
stations_subdag_task >> check_stations
check_stations >> location_traffic_task
check_trips >> location_traffic_task
Example #30
default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"}

with DAG(dag_id="twitter_dag",
         schedule_interval="@daily",
         default_args=default_args,
         catchup=False) as dag:
    waiting_for_tweets = FileSensor(task_id="waiting_for_tweets",
                                    fs_conn_id="fs_tweet",
                                    filepath="data.csv",
                                    poke_interval=5)

    fetching_tweets = PythonOperator(task_id="fetching_tweets",
                                     python_callable=fetching_tweets.main)
    cleaning_tweets = PythonOperator(task_id="cleaning_tweets",
                                     python_callable=cleaning_tweets.main)

    storing_tweets = PostgresOperator(
        task_id='storing_tweets',
        postgres_conn_id="postgres_default",
        sql=
        '''CREATE TABLE IF NOT EXISTS tweets(Tweet varchar(250), Date varchar(50),  Retweet_from varchar(50), T_User varchar(50));'''
    )

    update_tweets = PostgresOperator(
        task_id='update_tweets',
        postgres_conn_id="postgres_default",
        sql=
        '''COPY tweets(Tweet, Date, Retweet_from, T_User) FROM '/tmp/data_cleaned.csv' DELIMITER ',' CSV HEADER;'''
    )

    waiting_for_tweets >> fetching_tweets >> cleaning_tweets >> storing_tweets >> update_tweets
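Note that the COPY ... FROM '/tmp/data_cleaned.csv' in update_tweets is a server-side COPY: the path is read by the Postgres server process, so this only works when the cleaned file is visible to the database host (for example, Airflow and Postgres sharing a machine or a mounted volume); otherwise the data would have to be streamed from the client, e.g. via PostgresHook.copy_expert.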
Example #31
    'owner': 'udacity',
    'start_date': datetime(2020, 4, 14),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@daily'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

#Table Creation task
create_tables_task = PostgresOperator(
    task_id="create_tables",
    dag=dag,
    sql='create_tables.sql',
    postgres_conn_id="redshift"
  )

#Load task: S3 to staging table-staging_events 
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data" 
)

#Load task: S3 to staging table-staging_songs
Example #32
    'retry_delay':300,
    'email_on_retry': False,
    'catchup': False
}

dag = DAG('Sparkify_Data_Pipelines_DAG',
          default_args=default_args,
          description='Extract, load and transform data in Redshift with Airflow',
          schedule_interval='@hourly'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create
)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table', 
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create
)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create
Example #33
                              filename='tinydesk_api_data.json',
                              aws_conn_id='tinydesk_aws',
                              bucket_name=os.environ['S3_BUCKET'],
                              dag=dag)

t11 = ScrapeYoutubeAPIOperator(
    task_id='get_tinydesk_at_home_video_data',
    playlist_id='PLy2PCKGkKRVYPm1tBwoX45ocAzuhVyvJX',
    youtube_api_key=os.environ['YOUTUBE_API_KEY'],
    filename='tinydesk_at_home_api_data.json',
    aws_conn_id='tinydesk_aws',
    bucket_name=os.environ['S3_BUCKET'],
    dag=dag)

t2 = PostgresOperator(task_id='empty_staging_table',
                      sql='DELETE FROM VIDEO_STAGING',
                      postgres_conn_id='tinydesk_postgres',
                      dag=dag)

t13 = PostgresOperator(task_id='empty_staging_table_at_home',
                       sql='DELETE FROM VIDEO_STAGING_AT_HOME',
                       postgres_conn_id='tinydesk_postgres',
                       dag=dag)

t3 = FileToPostgresOperator(task_id='load_to_staging',
                            filename='tinydesk_api_data.json',
                            postgres_conn_id='tinydesk_postgres',
                            table='video_staging',
                            aws_conn_id='tinydesk_aws',
                            bucket_name=os.environ['S3_BUCKET'],
                            dag=dag)
Example #34
# Initialize subdag variables
parent_task_id = 'udac_example_dag'
start_date = datetime.utcnow()

dag = DAG(
    parent_task_id,
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create Staging tables
create_staging_events = PostgresOperator(
    task_id="create_staging_events",
    dag=dag,
    postgres_conn_id="redshift",
    sql=CreateTables.create_staging_events)

create_songs_table = PostgresOperator(task_id="create_staging_songs",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=CreateTables.create_staging_songs)

# Populate staging tables
stage_events_to_redshift = StageToRedshiftOperator(
    task_id="Stage_events",
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",