Example #1
        "db_user": "******",
    }

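    # Download the raw source files via the dataset's npm "get" script, passing the FTP credentials and download directory as params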
    get = BashOperator(
        task_id='get_' + source,
        bash_command=
        'npm run get {{ params.source }} --prefix=~/airflow/dags/facdb_1_download -- --ftp_user={{ params.ftp_user }} --ftp_pass={{ params.ftp_pass }} --download_dir={{ params.download_dir }}',
        params=params,
        dag=facdb_1_download)

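    # Load the downloaded files into the database via the npm "push" script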
    push = BashOperator(
        task_id='push_' + source,
        bash_command=
        "npm run push {{ params.source }} --prefix=~/airflow/dags/facdb_1_download -- --db={{ params.db }} --db_user={{ params.db_user }} --download_dir={{ params.download_dir }}",
        params=params,
        dag=facdb_1_download)

    get >> push

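    # If the dataset ships an after.sql post-processing script, run it before triggering the assembly DAG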
    if os.path.isfile(
            "/home/airflow/airflow/dags/facdb_1_download/datasets/{0}/after.sql"
            .format(source)):
        after = PostgresOperator(
            task_id='after_' + source,
            postgres_conn_id='facdb',
            sql="/facdb_1_download/datasets/{0}/after.sql".format(source),
            dag=facdb_1_download)
        push >> after >> trigger_facdb_2_assembly
    else:
        push >> trigger_facdb_2_assembly
Example #2
    dag=dag
)

transform_population_task = PythonOperator(
    task_id="transform_population",
    python_callable=transform_population_func,
    op_kwargs={
        "input_csv_file_name": f"{project_dir}/data/17100009.csv",
        "spark_output_dir": f"{project_dir}/population"
    },
    dag=dag
)

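# Create the target tables in Postgres before loading the transformed data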
create_contributions_in_postgres = PostgresOperator(
    task_id="create_contributions_in_postgres",
    sql=sql_queries.create_contributions,
    postgres_conn_id="postgres",
    dag=dag
)

create_population_in_postgres = PostgresOperator(
    task_id="create_population_in_postgres",
    sql=sql_queries.create_population,
    postgres_conn_id="postgres",
    dag=dag
)

load_contributions_to_postgres = PythonOperator(
    task_id="load_contributions_to_postgres",
    python_callable=load_spark_csv_to_postgres,
    op_kwargs={
        "spark_csv_dir": f"{project_dir}/contributions",
            t_srs="EPSG:28992",
            input_file_sep="SEMICOLON",
            auto_detect_type="YES",
            geometry_name="geometrie",
            mode="PostgreSQL",
            db_conn=db_conn,
        )
        for key, file in files_to_download.items()
    ]

    # 6. RE-define GEOM type (because ogr2ogr cannot set geom with .csv import)
    # except themas itself, which is a dimension table (parent) of veiligeafstanden table
    redefine_geoms = [
        PostgresOperator(
            task_id=f"re-define_geom_{key}",
            sql=SET_GEOM,
            params=dict(tablename=f"{dag_id}_{key}_new"),
        )
        for key in files_to_download.keys()
        if key == "veiligeafstanden"
    ]

    # 7. Add thema-context to child tables from parent table (themas)
    # except themas itself, which is a dimension table (parent) of veiligeafstanden table
    add_thema_contexts = [
        PostgresOperator(
            task_id=f"add_context_{key}",
            sql=ADD_THEMA_CONTEXT,
            params=dict(tablename=f"{dag_id}_{key}_new", parent_table=f"{dag_id}_themas_new"),
        )
        for key in files_to_download.keys()
AWS_KEY = os.environ.get('AWS_KEY')
AWS_SECRET = os.environ.get('AWS_SECRET')

default_args = {
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

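# Create all target tables in Redshift at the start of the run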
start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql=create_tables.create_table_queries)

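# Stage the raw log data from S3 into the staging_events table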
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="/udacity-dend",
    s3_key="log_data")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
Example #5
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_categories',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly added menu categories daily.')

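# Create the categories table if it does not already exist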
t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS categories (
      id          SERIAL PRIMARY KEY,
      name        VARCHAR(64) NOT NULL,
      menu_id     INTEGER REFERENCES menus(id),
      description TEXT,
      UNIQUE (name, menu_id)
    );''',
                      dag=dag)

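# Copy the newly staged rows from tmp_categories into categories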
t2 = PostgresOperator(task_id='insert',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    INSERT INTO categories (id, name, menu_id, description)
      SELECT id, name, menu_id, description
        FROM tmp_categories;
    ''',
                      dag=dag)
Example #6
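# Build the COPY statement with credentials from the AwsHook and run it against Redshift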
def load_trip_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook("aws_credentials")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")
    sql_stmt = sql.COPY_ALL_TRIPS_SQL.format(
        credentials.access_key,
        credentials.secret_key,
    )
    redshift_hook.run(sql_stmt)


dag = DAG('lesson2.demo1', start_date=datetime.datetime.now())

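# Create the trips table in Redshift before copying data into it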
create_trips_table = PostgresOperator(task_id="create_trips_table",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=sql.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
)

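# Aggregate location traffic on Redshift using LOCATION_TRAFFIC_SQL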
location_traffic_task = PostgresOperator(task_id="calculate_location_traffic",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql.LOCATION_TRAFFIC_SQL)

create_trips_table >> copy_trips_task
# TODO: First, load the Airflow UI and run this DAG once.
Example #7
    'email': '*****@*****.**',
    'email_on_failure': False,
    'retries': 1, 
    'retry_delay': timedelta(minutes=5)
    }

# initiating the DAG
dag = airflow.DAG(
    dag_id='outlier_detector',
    schedule_interval="@weekly",
    default_args=args,
    max_active_runs=1)

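# Pull the data to analyze from Postgres; the query is formatted with the output filename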
task0 = PostgresOperator(
    task_id='pull_data_from_postgres',
    sql=sql_pull_data.format(filename),
    postgres_conn_id='postgres_default',
    dag=dag)

task1 = PythonOperator(
    task_id='detect_outliers',
    provide_context=True,
    op_args=[filename],
    python_callable=detect_outliers,
    dag=dag)

task2 = PythonOperator(
    task_id='send_email_if_outliers',
    provide_context=True,
    # all the variables used below should be set up as environment variables
    op_args=[email_to, email_cc, subject, message],
Example #8
        python_callable=process_osm.modify_tables,
    )
    osm_add_metadata = PythonOperator(task_id="add-osm-metadata",
                                      python_callable=import_osm.add_metadata)
    setup >> osm_download >> osm_import >> osm_migrate >> osm_add_metadata

    # VG250 (Verwaltungsgebiete 250) data import
    vg250_download = PythonOperator(
        task_id="download-vg250",
        python_callable=import_vg250.download_vg250_files,
    )
    vg250_import = PythonOperator(task_id="import-vg250",
                                  python_callable=import_vg250.to_postgres)
    vg250_nuts_mview = PostgresOperator(
        task_id="vg250_nuts_mview",
        sql="vg250_lan_nuts_id_mview.sql",
        postgres_conn_id="egon_data",
        autocommit=True,
    )
    vg250_metadata = PythonOperator(
        task_id="add-vg250-metadata",
        python_callable=import_vg250.add_metadata,
    )
    vg250_clean_and_prepare = PostgresOperator(
        task_id="vg250_clean_and_prepare",
        sql="cleaning_and_preparation.sql",
        postgres_conn_id="egon_data",
        autocommit=True,
    )
    setup >> vg250_download >> vg250_import >> vg250_nuts_mview
    vg250_nuts_mview >> vg250_metadata >> vg250_clean_and_prepare
Example #9
    project_root=project_root,
    repos=[
        {
            'name': 'flights_meta',
            'zenodo_id': flights_repo
        },
        #{'name': 'tweets_meta', 'zenodo_id': tweets_repo }, # TODO: out of scope of this version
    ])
covid_data_task = RawDataHandler(task_id="covid_data_downloader",
                                 dag=dag,
                                 destination_folder=output_path,
                                 s3_bucket='udacity-awss',
                                 aws_credentials_id="s3_credentials")

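# Create the Redshift target tables from the DDL in SqlQueries.create_sttmts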
create_tables_task = PostgresOperator(task_id="create_tables",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=SqlQueries.create_sttmts)

create_emr_task = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=EmrHandler.JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_credentials",
    emr_conn_id="emr_connection",
    dag=dag)

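# Add the mount step to the EMR cluster created above; the job flow id is pulled from XCom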
add_emr_mount_task = EmrAddStepsOperator(
    task_id='add_emr_mount',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_credentials',
    steps=EmrHandler.SPARK_STEP_MOUNT,
Example #10
                bash_command=f"psql {pg_params()} < {path}",
            )
        )
    for path in (
        f"{tmp_dir}/hior_properties_new.sql",
        f"{tmp_dir}/hior_attributes_new.sql",
    ):
        name = pathlib.Path(path).stem
        import_linked_tables.append(
            BashOperator(
                task_id=f"create_{name}",
                bash_command=f"psql {pg_params()} < {path}",
            )
        )

    rename_table = PostgresOperator(task_id="rename_table", sql=SQL_TABLE_RENAME)

    # Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)


(
    slack_at_start
    >> fetch_xls
    >> convert_data
    >> create_table
    >> import_tables[1:]
    >> rename_table
    >> grant_db_permissions
)
                                       table='reopening_tier',
                                       source_table='staging_reopening_tier',
                                       sql=LoadFactQueries.load_open_tiers)

# transform the staging tables and load nationwide cases fact table on redshift
loadfact_nationwide = LoadFactOperator(task_id='Loadfact_nationwide',
                                       dag=dag,
                                       redshift_conn_id='redshift',
                                       table='other_states_cases',
                                       source_table='nationwide_cases',
                                       sql=LoadFactQueries.load_nationwide)

# transform the staging tables and load healthcare facilities dimension table on redshift
loaddim_healthcare = PostgresOperator(
    task_id='Loaddim_healthcare',
    dag=dag,
    sql=LoadDimensionQueries.load_healthcare_facs,
    postgres_conn_id='redshift')

# transform the staging tables and load county dimension table on redshift
loaddim_county = PostgresOperator(task_id='Loaddim_county',
                                  dag=dag,
                                  sql=LoadDimensionQueries.load_county,
                                  postgres_conn_id='redshift')

# transform the staging tables and load prison dimension table on redshift
loaddim_prison = PostgresOperator(task_id='Loaddim_prison',
                                  dag=dag,
                                  sql=LoadDimensionQueries.load_prison,
                                  postgres_conn_id='redshift')
# Run this DAG only once to create the tables
dag = DAG('01_sparkify_create_tables_dag',
          default_args=default_args,
          description=
          'Create staging, fact, and dimension tables in Redshift with Airflow',
          schedule_interval='@once',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Define tasks to create the tables in Redshift

create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create)
    'owner': '211dashboard',
    'start_date': datetime(2020, 6, 1),
    'concurrency': 1,
    'retries': 0,
    'depends_on_past': False,
    'catchup': False
}

dag = DAG(dag_id='211dash_manual_update',
          schedule_interval='@once',
          template_searchpath=SEARCH_PATH,
          default_args=args)
''' Define manual update operators. '''
''' 1. Census data operators '''

truncate_core_census_tables = PostgresOperator(
    task_id='truncate_core_census_tables', sql='trnctTbls_census.sql', dag=dag)

transform_census_county_files = PythonOperator(
    task_id='transform_census_county_files',
    python_callable=transform_static_s3,
    op_kwargs={
        'data': 'census_county',
        'filename': 'census_data_by_county.csv',
        'resource_path': RESOURCE_PATH,
        'transformer': transform_census_data,
        'sep': '|'
    },
    dag=dag)

transform_census_tract_files = PythonOperator(
    task_id='transform_census_tract_files',
        "lesson3.exercise3",
        stations_task_id,
        "redshift",
        "aws_default",
        "stations",
        sql_statements.CREATE_STATIONS_TABLE_SQL,
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv",
        start_date=start_date,
    ),
    task_id=stations_task_id,
    dag=dag,
)

#
# TODO: Consolidate the multiple HasRowsOperator tasks into a subdag
#

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL
)

#
# TODO: Reorder the Graph once you have moved the checks
#
trips_subdag_task >> location_traffic_task
stations_subdag_task >> location_traffic_task
Example #15
    'retry_delay': timedelta(minutes=5),
}

# Defining the DAG
dag = DAG('udac_capstone_dag',
          default_args=default_args,
          description='Transform data in S3 and load to Redshift with Airflow',
          schedule_interval='@monthly',
          max_active_runs=1,
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Task to create tables in Redshift
create_tables_task = PostgresOperator(task_id="create_tables",
                                      sql='capstone_create_tables.sql',
                                      postgres_conn_id="redshift",
                                      dag=dag)

# Initial processing of datasets and loading cleansed data to S3

S3_immig_task = BashOperator(
    task_id='load_to_S3_immig',
    bash_command='python /home/workspace/airflow/dags/script/etl_immig.py',
    dag=dag)

S3_temp_task = BashOperator(
    task_id='load_to_S3_temp',
    bash_command='python /home/workspace/airflow/dags/script/etl_temp.py',
    dag=dag)

S3_us_demog_task = BashOperator(
# Tasks definition:
# -----------------

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Drop tables:
# -----------
redshift_conn_id = "redshift"

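# Drop any existing tables so each run starts from a clean schema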
drop_tables_subtask = PostgresOperator(
    task_id="dropping_staging_events_table",
    dag=dag,
    postgres_conn_id=redshift_conn_id,
    sql=("""DROP TABLE IF EXISTS staging_events;
   DROP TABLE IF EXISTS staging_songs;
   DROP TABLE IF EXISTS songplays;
   DROP TABLE IF EXISTS artists;
   DROP TABLE IF EXISTS songs;
   DROP TABLE IF EXISTS time;
   DROP TABLE IF EXISTS users;
   """))
#

# Create tables:
#--------------

create_staging_events_table_task = PostgresOperator(
    task_id="create_staging_events_table",
    dag=dag,
    postgres_conn_id=redshift_conn_id,
    sql=SqlQueries.create_staging_events_table.format("staging_events"))
with DAG("prepare_code_postal",
         default_args=default_args,
         schedule_interval=None) as dag:

    cmd = "mkdir -p $DIR_PATH & wget \"$URL\" -O $DIR_PATH/code_postal.csv"

    download = BashOperator(
        env={
            "URL": "https://www.data.gouv.fr/fr/datasets/r/554590ab-ae62-40ac-8353-ee75162c05ee",
            "DIR_PATH": "{data_dir}/communes".format(data_dir=DATA_DIR)},
        task_id="download",
        bash_command=cmd)

    load = EmbulkOperator(
        task_id="load",
        embulk_config="code_postal.yml.liquid")

    prepare = PythonOperator(
        task_id="prepare",
        python_callable=recipes.prepare_code_postal)

    create_index = PostgresOperator(
        task_id="create_index",
        sql=textwrap.dedent("""
            CREATE INDEX code_postal_code_insee_idx
            ON etl.code_postal (code_insee)"""),
        postgres_conn_id=CONN_ID)

    download >> load >> prepare >> create_index
Example #18
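# Create the staging and analytics tables if they do not already exist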
create_tables = PostgresOperator(
    task_id="create_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql="""
        CREATE TABLE IF NOT EXISTS public.artists (
            artistid varchar(256) NOT NULL,
            name varchar(256),
            location varchar(256),
            lattitude numeric(18,0),
            longitude numeric(18,0)
        );
        CREATE TABLE IF NOT EXISTS public.songplays (
            playid varchar(32) NOT NULL,
            start_time timestamp NOT NULL,
            userid int4 NOT NULL,
            "level" varchar(256),
            songid varchar(256),
            artistid varchar(256),
            sessionid int4,
            location varchar(256),
            user_agent varchar(256),
            CONSTRAINT songplays_pkey PRIMARY KEY (playid)
        );
        CREATE TABLE IF NOT EXISTS public.songs (
            songid varchar(256) NOT NULL,
            title varchar(256),
            artistid varchar(256),
            "year" int4,
            duration numeric(18,0),
            CONSTRAINT songs_pkey PRIMARY KEY (songid)
        );
        CREATE TABLE IF NOT EXISTS public.staging_events (
            artist varchar(256),
            auth varchar(256),
            firstname varchar(256),
            gender varchar(256),
            iteminsession int4,
            lastname varchar(256),
            length numeric(18,0),
            "level" varchar(256),
            location varchar(256),
            "method" varchar(256),
            page varchar(256),
            registration numeric(18,0),
            sessionid int4,
            song varchar(256),
            status int4,
            ts int8,
            useragent varchar(256),
            userid int4
        );
        CREATE TABLE IF NOT EXISTS public.staging_songs (
            num_songs int4,
            artist_id varchar(256),
            artist_name varchar(256),
            artist_latitude numeric(18,0),
            artist_longitude numeric(18,0),
            artist_location varchar(256),
            song_id varchar(256),
            title varchar(256),
            duration numeric(18,0),
            "year" int4
        );
        CREATE TABLE IF NOT EXISTS public.users (
            userid int4 NOT NULL,
            first_name varchar(256),
            last_name varchar(256),
            gender varchar(256),
            "level" varchar(256),
            CONSTRAINT users_pkey PRIMARY KEY (userid)
        );
        CREATE TABLE IF NOT EXISTS time (
            start_time timestamp PRIMARY KEY,
            hour integer,
            day integer,
            week integer,
            month integer,
            year integer,
            weekday integer
        );
    """
)
Example #19
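# Sub-DAG factory: drops and recreates every dimension table, then returns the sub-DAG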
def init_dims_sub_dag(parent_dag_name, child_dag_name, start_date,
                      redshift_conn_id):
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              start_date=start_date)

    drop_dim_vehicles_task = PostgresOperator(
        task_id='drop_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLES)

    drop_dim_vehicle_models_task = PostgresOperator(
        task_id='drop_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLE_MODELS)

    drop_dim_rental_zones_task = PostgresOperator(
        task_id='drop_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_RENTAL_ZONES)

    drop_dim_companies_task = PostgresOperator(
        task_id='drop_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_COMPANIES)

    drop_dim_categories_task = PostgresOperator(
        task_id='drop_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_CATEGORIES)

    drop_dim_date_task = PostgresOperator(
        task_id='drop_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_DATE)

    drop_dim_weather_task = PostgresOperator(
        task_id='drop_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_WEATHER)

    create_dim_vehicles_task = PostgresOperator(
        task_id='create_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLES)

    create_dim_vehicle_models_task = PostgresOperator(
        task_id='create_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLE_MODELS)

    create_dim_rental_zones_task = PostgresOperator(
        task_id='create_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_RENTAL_ZONES)

    create_dim_companies_task = PostgresOperator(
        task_id='create_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_COMPANIES)

    create_dim_categories_task = PostgresOperator(
        task_id='create_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_CATEGORIES)

    create_dim_date_task = PostgresOperator(
        task_id='create_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_DATE)

    create_dim_weather_task = PostgresOperator(
        task_id='create_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_WEATHER)

    drop_dim_vehicles_task >> create_dim_vehicles_task
    drop_dim_vehicle_models_task >> create_dim_vehicle_models_task
    drop_dim_rental_zones_task >> create_dim_rental_zones_task
    drop_dim_companies_task >> create_dim_companies_task
    drop_dim_categories_task >> create_dim_categories_task
    drop_dim_date_task >> create_dim_date_task
    drop_dim_weather_task >> create_dim_weather_task

    return dag
Example #20
          description='Loads newly registered restaurants daily.')

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_new_food_deliveries',
                        external_dag_id='new_food_deliveries',
                        mode='reschedule',
                        dag=dag)

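# Create the restaurants table if it does not already exist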
t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS restaurants (
      id                SERIAL PRIMARY KEY,
      created_at        TIMESTAMP NOT NULL,
      updated_at        TIMESTAMP NOT NULL,
      name              VARCHAR(64) NOT NULL,
      email             VARCHAR(64) UNIQUE NOT NULL,
      address           VARCHAR(64) NOT NULL,
      phone             VARCHAR(64) NOT NULL,
      city_id           INTEGER REFERENCES cities(id),
      business_hours_id INTEGER REFERENCES business_hours(id),
      description       TEXT
    );''',
                      dag=dag)

t3 = PostgresOperator(task_id='etl',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    INSERT INTO restaurants (id, created_at, updated_at, name, email, address, phone, city_id, business_hours_id, description)
      SELECT id, created_at, updated_at, name, email, address, phone, city_id, business_hours_id, description
        FROM tmp_restaurants;
    ''',
Example #21
)

load_and_analyze = PythonOperator(
    task_id='load_and_analyze',
    dag=dag,
    python_callable=load_and_analyze,
    provide_context=True,
)

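# Rebuild the older_riders table from trips (riders born in or before 1945)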
create_oldest_task = PostgresOperator(
    task_id="create_oldest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
    postgres_conn_id="redshift"
)

log_oldest_task = PythonOperator(
    task_id="log_oldest",
    dag=dag,
    python_callable=log_oldest
)

create_youngest_task = PostgresOperator(
    task_id="create_youngest",
Example #22
from has_rows import HasRowsOperator

import sql_statements


dag = DAG(
    'bicycle_sharing_example',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL
)

# https://airflow.apache.org/docs/apache-airflow/stable/_modules/airflow/models/baseoperator.html
copy_trips_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    redshift_conn_id="redshift",
    table="trips",
    s3_path="s3://udacity-dend/data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    sla=datetime.timedelta(hours=1)
)

check_trips = HasRowsOperator(
    'start_date': datetime(2013, 1, 1, 0, 0, 0),
    'depends_on_past': True
}

dag = DAG(
    USERNAME + '_final_project_dwh_etl_traffic',
    default_args=default_args,
    description='Final project DWH ETL traffic',
    schedule_interval="0 0 1 1 *",
    max_active_runs=1,
)

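# Idempotent reload: delete the execution year's rows from the ODS table, then re-insert them from staging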
clear_ods = PostgresOperator(
    task_id="clear_ods",
    dag=dag,
    sql="""
        DELETE FROM ygladkikh.project_ods_traffic WHERE EXTRACT(YEAR FROM time_stamp) = {{ execution_date.year }}
    """
)

fill_ods = PostgresOperator(
    task_id="fill_ods",
    dag=dag,
    sql="""
        INSERT INTO ygladkikh.project_ods_traffic
        SELECT user_id, to_timestamp("timestamp"/1000), device_id, device_ip_addr, bytes_sent, bytes_received
        FROM ygladkikh.project_stg_traffic 
        WHERE EXTRACT(YEAR FROM to_timestamp("timestamp"/1000)) = {{ execution_date.year }}
    """
)
Example #24
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Youngest rider was born in {records[0][0]}")


dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow())

create_oldest_task = PostgresOperator(task_id="create_oldest",
                                      dag=dag,
                                      sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
                                      postgres_conn_id="redshift")

create_youngest_task = PostgresOperator(task_id="create_youngest",
                                        dag=dag,
                                        sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
        CREATE TABLE younger_riders AS (
            SELECT * FROM trips WHERE birthyear > 2000
        );
        COMMIT;
        for line in f:
            domain_code, page_title, view_counts, _ = line.split(" ")
            if domain_code == "en" and page_title in pagenames:
                result[page_title] = view_counts

    with open("/tmp/postgres_query.sql", "w") as f:
        for pagename, pageviewcount in result.items():
            f.write("INSERT INTO pageview_counts VALUES ("
                    f"'{pagename}', {pageviewcount}, '{execution_date}'"
                    ");\n")


fetch_pageviews = PythonOperator(
    task_id="fetch_pageviews",
    python_callable=_fetch_pageviews,
    op_kwargs={
        "pagenames": {"Google", "Amazon", "Apple", "Microsoft", "Facebook"}
    },
    provide_context=True,
    dag=dag,
)

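# Run the generated postgres_query.sql against the my_postgres connection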
write_to_postgres = PostgresOperator(
    task_id="write_to_postgres",
    postgres_conn_id="my_postgres",
    sql="postgres_query.sql",
    dag=dag,
)

get_data >> extract_gz >> fetch_pageviews >> write_to_postgres