Example #1
        "db_user": "******",
    }

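    # Download the raw source files via the dataset's npm "get" script, passing the FTP credentials and download directory as params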
    get = BashOperator(
        task_id='get_' + source,
        bash_command=
        'npm run get {{ params.source }} --prefix=~/airflow/dags/facdb_1_download -- --ftp_user={{ params.ftp_user }} --ftp_pass={{ params.ftp_pass }} --download_dir={{ params.download_dir }}',
        params=params,
        dag=facdb_1_download)

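    # Load the downloaded files into the database via the npm "push" script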
    push = BashOperator(
        task_id='push_' + source,
        bash_command=
        "npm run push {{ params.source }} --prefix=~/airflow/dags/facdb_1_download -- --db={{ params.db }} --db_user={{ params.db_user }} --download_dir={{ params.download_dir }}",
        params=params,
        dag=facdb_1_download)

    get >> push

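    # If the dataset ships an after.sql post-processing script, run it before triggering the assembly DAG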
    if os.path.isfile(
            "/home/airflow/airflow/dags/facdb_1_download/datasets/{0}/after.sql"
            .format(source)):
        after = PostgresOperator(
            task_id='after_' + source,
            postgres_conn_id='facdb',
            sql="/facdb_1_download/datasets/{0}/after.sql".format(source),
            dag=facdb_1_download)
        push >> after >> trigger_facdb_2_assembly
    else:
        push >> trigger_facdb_2_assembly
Example #2
    dag=dag
)

transform_population_task = PythonOperator(
    task_id="transform_population",
    python_callable=transform_population_func,
    op_kwargs={
        "input_csv_file_name": f"{project_dir}/data/17100009.csv",
        "spark_output_dir": f"{project_dir}/population"
    },
    dag=dag
)

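# Create the target tables in Postgres before loading the transformed data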
create_contributions_in_postgres = PostgresOperator(
    task_id="create_contributions_in_postgres",
    sql=sql_queries.create_contributions,
    postgres_conn_id="postgres",
    dag=dag
)

create_population_in_postgres = PostgresOperator(
    task_id="create_population_in_postgres",
    sql=sql_queries.create_population,
    postgres_conn_id="postgres",
    dag=dag
)

load_contributions_to_postgres = PythonOperator(
    task_id="load_contributions_to_postgres",
    python_callable=load_spark_csv_to_postgres,
    op_kwargs={
        "spark_csv_dir": f"{project_dir}/contributions",
            t_srs="EPSG:28992",
            input_file_sep="SEMICOLON",
            auto_detect_type="YES",
            geometry_name="geometrie",
            mode="PostgreSQL",
            db_conn=db_conn,
        )
        for key, file in files_to_download.items()
    ]

    # 6. RE-define GEOM type (because ogr2ogr cannot set geom with .csv import)
    # except themas itself, which is a dimension table (parent) of veiligeafstanden table
    redefine_geoms = [
        PostgresOperator(
            task_id=f"re-define_geom_{key}",
            sql=SET_GEOM,
            params=dict(tablename=f"{dag_id}_{key}_new"),
        )
        for key in files_to_download.keys()
        if key == "veiligeafstanden"
    ]

    # 7. Add thema-context to child tables from parent table (themas)
    # except themas itself, which is a dimension table (parent) of veiligeafstanden table
    add_thema_contexts = [
        PostgresOperator(
            task_id=f"add_context_{key}",
            sql=ADD_THEMA_CONTEXT,
            params=dict(tablename=f"{dag_id}_{key}_new", parent_table=f"{dag_id}_themas_new"),
        )
        for key in files_to_download.keys()
AWS_KEY = os.environ.get('AWS_KEY')
AWS_SECRET = os.environ.get('AWS_SECRET')

default_args = {
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

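# Create all target tables in Redshift at the start of the run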
start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql=create_tables.create_table_queries)

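# Stage the raw log data from S3 into the staging_events table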
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="/udacity-dend",
    s3_key="log_data")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
Example #5
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_categories',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly added menu categories daily.')

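# Create the categories table if it does not already exist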
t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS categories (
      id          SERIAL PRIMARY KEY,
      name        VARCHAR(64) NOT NULL,
      menu_id     INTEGER REFERENCES menus(id),
      description TEXT,
      UNIQUE (name, menu_id)
    );''',
                      dag=dag)

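# Copy the newly staged rows from tmp_categories into categories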
t2 = PostgresOperator(task_id='insert',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    INSERT INTO categories (id, name, menu_id, description)
      SELECT id, name, menu_id, description
        FROM tmp_categories;
    ''',
                      dag=dag)
Example #6
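# Build the COPY statement with credentials from the AwsHook and run it against Redshift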
def load_trip_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook("aws_credentials")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")
    sql_stmt = sql.COPY_ALL_TRIPS_SQL.format(
        credentials.access_key,
        credentials.secret_key,
    )
    redshift_hook.run(sql_stmt)


dag = DAG('lesson2.demo1', start_date=datetime.datetime.now())

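# Create the trips table in Redshift before copying data into it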
create_trips_table = PostgresOperator(task_id="create_trips_table",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=sql.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
)

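# Aggregate location traffic on Redshift using LOCATION_TRAFFIC_SQL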
location_traffic_task = PostgresOperator(task_id="calculate_location_traffic",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql.LOCATION_TRAFFIC_SQL)

create_trips_table >> copy_trips_task
# TODO: First, load the Airflow UI and run this DAG once.
Example #7
    'email': '*****@*****.**',
    'email_on_failure': False,
    'retries': 1, 
    'retry_delay': timedelta(minutes=5)
    }

# initiating the DAG
dag = airflow.DAG(
    dag_id='outlier_detector',
    schedule_interval="@weekly",
    default_args=args,
    max_active_runs=1)

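# Pull the data to analyze from Postgres; the query is formatted with the output filename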
task0 = PostgresOperator(
    task_id='pull_data_from_postgres',
    sql=sql_pull_data.format(filename),
    postgres_conn_id='postgres_default',
    dag=dag)

task1 = PythonOperator(
    task_id='detect_outliers',
    provide_context=True,
    op_args=[filename],
    python_callable=detect_outliers,
    dag=dag)

task2 = PythonOperator(
    task_id='send_email_if_outliers',
    provide_context=True,
    # all the variables used below should be set up as environment variables
    op_args=[email_to, email_cc, subject, message],
Example #8
        python_callable=process_osm.modify_tables,
    )
    osm_add_metadata = PythonOperator(task_id="add-osm-metadata",
                                      python_callable=import_osm.add_metadata)
    setup >> osm_download >> osm_import >> osm_migrate >> osm_add_metadata

    # VG250 (Verwaltungsgebiete 250) data import
    vg250_download = PythonOperator(
        task_id="download-vg250",
        python_callable=import_vg250.download_vg250_files,
    )
    vg250_import = PythonOperator(task_id="import-vg250",
                                  python_callable=import_vg250.to_postgres)
    vg250_nuts_mview = PostgresOperator(
        task_id="vg250_nuts_mview",
        sql="vg250_lan_nuts_id_mview.sql",
        postgres_conn_id="egon_data",
        autocommit=True,
    )
    vg250_metadata = PythonOperator(
        task_id="add-vg250-metadata",
        python_callable=import_vg250.add_metadata,
    )
    vg250_clean_and_prepare = PostgresOperator(
        task_id="vg250_clean_and_prepare",
        sql="cleaning_and_preparation.sql",
        postgres_conn_id="egon_data",
        autocommit=True,
    )
    setup >> vg250_download >> vg250_import >> vg250_nuts_mview
    vg250_nuts_mview >> vg250_metadata >> vg250_clean_and_prepare
Example #9
    project_root=project_root,
    repos=[
        {
            'name': 'flights_meta',
            'zenodo_id': flights_repo
        },
        #{'name': 'tweets_meta', 'zenodo_id': tweets_repo }, # TODO: out of scope of this version
    ])
covid_data_task = RawDataHandler(task_id="covid_data_downloader",
                                 dag=dag,
                                 destination_folder=output_path,
                                 s3_bucket='udacity-awss',
                                 aws_credentials_id="s3_credentials")

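# Create the Redshift target tables from the DDL in SqlQueries.create_sttmts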
create_tables_task = PostgresOperator(task_id="create_tables",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=SqlQueries.create_sttmts)

create_emr_task = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=EmrHandler.JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_credentials",
    emr_conn_id="emr_connection",
    dag=dag)

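# Add the mount step to the EMR cluster created above; the job flow id is pulled from XCom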
add_emr_mount_task = EmrAddStepsOperator(
    task_id='add_emr_mount',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_credentials',
    steps=EmrHandler.SPARK_STEP_MOUNT,
Example #10
                bash_command=f"psql {pg_params()} < {path}",
            )
        )
    for path in (
        f"{tmp_dir}/hior_properties_new.sql",
        f"{tmp_dir}/hior_attributes_new.sql",
    ):
        name = pathlib.Path(path).stem
        import_linked_tables.append(
            BashOperator(
                task_id=f"create_{name}",
                bash_command=f"psql {pg_params()} < {path}",
            )
        )

    rename_table = PostgresOperator(task_id="rename_table", sql=SQL_TABLE_RENAME)

    # Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)


(
    slack_at_start
    >> fetch_xls
    >> convert_data
    >> create_table
    >> import_tables[1:]
    >> rename_table
    >> grant_db_permissions
)
                                       table='reopening_tier',
                                       source_table='staging_reopening_tier',
                                       sql=LoadFactQueries.load_open_tiers)

# transform the staging tables and load nationwide cases fact table on redshift
loadfact_nationwide = LoadFactOperator(task_id='Loadfact_nationwide',
                                       dag=dag,
                                       redshift_conn_id='redshift',
                                       table='other_states_cases',
                                       source_table='nationwide_cases',
                                       sql=LoadFactQueries.load_nationwide)

# transform the staging tables and load healthcare facilities dimension table on redshift
loaddim_healthcare = PostgresOperator(
    task_id='Loaddim_healthcare',
    dag=dag,
    sql=LoadDimensionQueries.load_healthcare_facs,
    postgres_conn_id='redshift')

# transform the staging tables and load county dimension table on redshift
loaddim_county = PostgresOperator(task_id='Loaddim_county',
                                  dag=dag,
                                  sql=LoadDimensionQueries.load_county,
                                  postgres_conn_id='redshift')

# transform the staging tables and load prison dimension table on redshift
loaddim_prison = PostgresOperator(task_id='Loaddim_prison',
                                  dag=dag,
                                  sql=LoadDimensionQueries.load_prison,
                                  postgres_conn_id='redshift')
# Run this DAG only once to create the tables
dag = DAG('01_sparkify_create_tables_dag',
          default_args=default_args,
          description=
          'Create staging, fact, and dimension tables in Redshift with Airflow',
          schedule_interval='@once',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Define tasks to create the tables in Redshift

create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create)
    'owner': '211dashboard',
    'start_date': datetime(2020, 6, 1),
    'concurrency': 1,
    'retries': 0,
    'depends_on_past': False,
    'catchup': False
}

dag = DAG(dag_id='211dash_manual_update',
          schedule_interval='@once',
          template_searchpath=SEARCH_PATH,
          default_args=args)
''' Define manual update operators. '''
''' 1. Census data operators '''

truncate_core_census_tables = PostgresOperator(
    task_id='truncate_core_census_tables', sql='trnctTbls_census.sql', dag=dag)

transform_census_county_files = PythonOperator(
    task_id='transform_census_county_files',
    python_callable=transform_static_s3,
    op_kwargs={
        'data': 'census_county',
        'filename': 'census_data_by_county.csv',
        'resource_path': RESOURCE_PATH,
        'transformer': transform_census_data,
        'sep': '|'
    },
    dag=dag)

transform_census_tract_files = PythonOperator(
    task_id='transform_census_tract_files',
        "lesson3.exercise3",
        stations_task_id,
        "redshift",
        "aws_default",
        "stations",
        sql_statements.CREATE_STATIONS_TABLE_SQL,
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv",
        start_date=start_date,
    ),
    task_id=stations_task_id,
    dag=dag,
)

#
# TODO: Consolidate the multiple HasRowsOperator tasks into a subdag
#

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL
)

#
# TODO: Reorder the Graph once you have moved the checks
#
trips_subdag_task >> location_traffic_task
stations_subdag_task >> location_traffic_task
Example #15
    'retry_delay': timedelta(minutes=5),
}

# Defining the DAG
dag = DAG('udac_capstone_dag',
          default_args=default_args,
          description='Transform data in S3 and load to Redshift with Airflow',
          schedule_interval='@monthly',
          max_active_runs=1,
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Task to create tables in Redshift
create_tables_task = PostgresOperator(task_id="create_tables",
                                      sql='capstone_create_tables.sql',
                                      postgres_conn_id="redshift",
                                      dag=dag)

# Initial processing of datasets and loading cleansed data to S3

S3_immig_task = BashOperator(
    task_id='load_to_S3_immig',
    bash_command='python /home/workspace/airflow/dags/script/etl_immig.py',
    dag=dag)

S3_temp_task = BashOperator(
    task_id='load_to_S3_temp',
    bash_command='python /home/workspace/airflow/dags/script/etl_temp.py',
    dag=dag)

S3_us_demog_task = BashOperator(
# Tasks definition:
# -----------------

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Drop tables:
# -----------
redshift_conn_id = "redshift"

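# Drop any existing tables so each run starts from a clean schema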
drop_tables_subtask = PostgresOperator(
    task_id="dropping_staging_events_table",
    dag=dag,
    postgres_conn_id=redshift_conn_id,
    sql=("""DROP TABLE IF EXISTS staging_events;
   DROP TABLE IF EXISTS staging_songs;
   DROP TABLE IF EXISTS songplays;
   DROP TABLE IF EXISTS artists;
   DROP TABLE IF EXISTS songs;
   DROP TABLE IF EXISTS time;
   DROP TABLE IF EXISTS users;
   """))
#

# Create tables:
#--------------

create_staging_events_table_task = PostgresOperator(
    task_id="create_staging_events_table",
    dag=dag,
    postgres_conn_id=redshift_conn_id,
    sql=SqlQueries.create_staging_events_table.format("staging_events"))
with DAG("prepare_code_postal",
         default_args=default_args,
         schedule_interval=None) as dag:

    cmd = "mkdir -p $DIR_PATH & wget \"$URL\" -O $DIR_PATH/code_postal.csv"

    download = BashOperator(
        env={
            "URL": "https://www.data.gouv.fr/fr/datasets/r/554590ab-ae62-40ac-8353-ee75162c05ee",
            "DIR_PATH": "{data_dir}/communes".format(data_dir=DATA_DIR)},
        task_id="download",
        bash_command=cmd)

    load = EmbulkOperator(
        task_id="load",
        embulk_config="code_postal.yml.liquid")

    prepare = PythonOperator(
        task_id="prepare",
        python_callable=recipes.prepare_code_postal)

    create_index = PostgresOperator(
        task_id="create_index",
        sql=textwrap.dedent("""
            CREATE INDEX code_postal_code_insee_idx
            ON etl.code_postal (code_insee)"""),
        postgres_conn_id=CONN_ID)

    download >> load >> prepare >> create_index
Example #18
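# Create the staging and analytics tables if they do not already exist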
create_tables = PostgresOperator(
    task_id="create_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql="""
        CREATE TABLE IF NOT EXISTS public.artists (
            artistid varchar(256) NOT NULL,
            name varchar(256),
            location varchar(256),
            lattitude numeric(18,0),
            longitude numeric(18,0)
        );
        CREATE TABLE IF NOT EXISTS public.songplays (
            playid varchar(32) NOT NULL,
            start_time timestamp NOT NULL,
            userid int4 NOT NULL,
            "level" varchar(256),
            songid varchar(256),
            artistid varchar(256),
            sessionid int4,
            location varchar(256),
            user_agent varchar(256),
            CONSTRAINT songplays_pkey PRIMARY KEY (playid)
        );
        CREATE TABLE IF NOT EXISTS public.songs (
            songid varchar(256) NOT NULL,
            title varchar(256),
            artistid varchar(256),
            "year" int4,
            duration numeric(18,0),
            CONSTRAINT songs_pkey PRIMARY KEY (songid)
        );
        CREATE TABLE IF NOT EXISTS public.staging_events (
            artist varchar(256),
            auth varchar(256),
            firstname varchar(256),
            gender varchar(256),
            iteminsession int4,
            lastname varchar(256),
            length numeric(18,0),
            "level" varchar(256),
            location varchar(256),
            "method" varchar(256),
            page varchar(256),
            registration numeric(18,0),
            sessionid int4,
            song varchar(256),
            status int4,
            ts int8,
            useragent varchar(256),
            userid int4
        );
        CREATE TABLE IF NOT EXISTS public.staging_songs (
            num_songs int4,
            artist_id varchar(256),
            artist_name varchar(256),
            artist_latitude numeric(18,0),
            artist_longitude numeric(18,0),
            artist_location varchar(256),
            song_id varchar(256),
            title varchar(256),
            duration numeric(18,0),
            "year" int4
        );
        CREATE TABLE IF NOT EXISTS public.users (
            userid int4 NOT NULL,
            first_name varchar(256),
            last_name varchar(256),
            gender varchar(256),
            "level" varchar(256),
            CONSTRAINT users_pkey PRIMARY KEY (userid)
        );
        CREATE TABLE IF NOT EXISTS time (
            start_time timestamp PRIMARY KEY,
            hour integer,
            day integer,
            week integer,
            month integer,
            year integer,
            weekday integer
        );
    """
)
Example #19
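# Sub-DAG factory: drops and recreates every dimension table, then returns the sub-DAG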
def init_dims_sub_dag(parent_dag_name, child_dag_name, start_date,
                      redshift_conn_id):
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              start_date=start_date)

    drop_dim_vehicles_task = PostgresOperator(
        task_id='drop_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLES)

    drop_dim_vehicle_models_task = PostgresOperator(
        task_id='drop_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLE_MODELS)

    drop_dim_rental_zones_task = PostgresOperator(
        task_id='drop_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_RENTAL_ZONES)

    drop_dim_companies_task = PostgresOperator(
        task_id='drop_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_COMPANIES)

    drop_dim_categories_task = PostgresOperator(
        task_id='drop_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_CATEGORIES)

    drop_dim_date_task = PostgresOperator(
        task_id='drop_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_DATE)

    drop_dim_weather_task = PostgresOperator(
        task_id='drop_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_WEATHER)

    create_dim_vehicles_task = PostgresOperator(
        task_id='create_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLES)

    create_dim_vehicle_models_task = PostgresOperator(
        task_id='create_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLE_MODELS)

    create_dim_rental_zones_task = PostgresOperator(
        task_id='create_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_RENTAL_ZONES)

    create_dim_companies_task = PostgresOperator(
        task_id='create_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_COMPANIES)

    create_dim_categories_task = PostgresOperator(
        task_id='create_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_CATEGORIES)

    create_dim_date_task = PostgresOperator(
        task_id='create_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_DATE)

    create_dim_weather_task = PostgresOperator(
        task_id='create_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_WEATHER)

    drop_dim_vehicles_task >> create_dim_vehicles_task
    drop_dim_vehicle_models_task >> create_dim_vehicle_models_task
    drop_dim_rental_zones_task >> create_dim_rental_zones_task
    drop_dim_companies_task >> create_dim_companies_task
    drop_dim_categories_task >> create_dim_categories_task
    drop_dim_date_task >> create_dim_date_task
    drop_dim_weather_task >> create_dim_weather_task

    return dag
Example #20
          description='Loads newly registered restaurants daily.')

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_new_food_deliveries',
                        external_dag_id='new_food_deliveries',
                        mode='reschedule',
                        dag=dag)

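# Create the restaurants table if it does not already exist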
t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS restaurants (
      id                SERIAL PRIMARY KEY,
      created_at        TIMESTAMP NOT NULL,
      updated_at        TIMESTAMP NOT NULL,
      name              VARCHAR(64) NOT NULL,
      email             VARCHAR(64) UNIQUE NOT NULL,
      address           VARCHAR(64) NOT NULL,
      phone             VARCHAR(64) NOT NULL,
      city_id           INTEGER REFERENCES cities(id),
      business_hours_id INTEGER REFERENCES business_hours(id),
      description       TEXT
    );''',
                      dag=dag)

t3 = PostgresOperator(task_id='etl',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    INSERT INTO restaurants (id, created_at, updated_at, name, email, address, phone, city_id, business_hours_id, description)
      SELECT id, created_at, updated_at, name, email, address, phone, city_id, business_hours_id, description
        FROM tmp_restaurants;
    ''',
Example #21
)

load_and_analyze = PythonOperator(
    task_id='load_and_analyze',
    dag=dag,
    python_callable=load_and_analyze,
    provide_context=True,
)

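# Rebuild the older_riders table from trips (riders born in or before 1945)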
create_oldest_task = PostgresOperator(
    task_id="create_oldest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
    postgres_conn_id="redshift"
)

log_oldest_task = PythonOperator(
    task_id="log_oldest",
    dag=dag,
    python_callable=log_oldest
)

create_youngest_task = PostgresOperator(
    task_id="create_youngest",
Example #22
from has_rows import HasRowsOperator

import sql_statements


dag = DAG(
    'bicycle_sharing_example',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL
)

# https://airflow.apache.org/docs/apache-airflow/stable/_modules/airflow/models/baseoperator.html
copy_trips_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    redshift_conn_id="redshift",
    table="trips",
    s3_path="s3://udacity-dend/data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    sla=datetime.timedelta(hours=1)
)

check_trips = HasRowsOperator(
    'start_date': datetime(2013, 1, 1, 0, 0, 0),
    'depends_on_past': True
}

dag = DAG(
    USERNAME + '_final_project_dwh_etl_traffic',
    default_args=default_args,
    description='Final project DWH ETL traffic',
    schedule_interval="0 0 1 1 *",
    max_active_runs=1,
)

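# Idempotent reload: delete the execution year's rows from the ODS table, then re-insert them from staging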
clear_ods = PostgresOperator(
    task_id="clear_ods",
    dag=dag,
    sql="""
        DELETE FROM ygladkikh.project_ods_traffic WHERE EXTRACT(YEAR FROM time_stamp) = {{ execution_date.year }}
    """
)

fill_ods = PostgresOperator(
    task_id="fill_ods",
    dag=dag,
    sql="""
        INSERT INTO ygladkikh.project_ods_traffic
        SELECT user_id, to_timestamp("timestamp"/1000), device_id, device_ip_addr, bytes_sent, bytes_received
        FROM ygladkikh.project_stg_traffic 
        WHERE EXTRACT(YEAR FROM to_timestamp("timestamp"/1000)) = {{ execution_date.year }}
    """
)
Example #24
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Youngest rider was born in {records[0][0]}")


dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow())

create_oldest_task = PostgresOperator(task_id="create_oldest",
                                      dag=dag,
                                      sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
                                      postgres_conn_id="redshift")

create_youngest_task = PostgresOperator(task_id="create_youngest",
                                        dag=dag,
                                        sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
        CREATE TABLE younger_riders AS (
            SELECT * FROM trips WHERE birthyear > 2000
        );
        COMMIT;
        for line in f:
            domain_code, page_title, view_counts, _ = line.split(" ")
            if domain_code == "en" and page_title in pagenames:
                result[page_title] = view_counts

    with open("/tmp/postgres_query.sql", "w") as f:
        for pagename, pageviewcount in result.items():
            f.write("INSERT INTO pageview_counts VALUES ("
                    f"'{pagename}', {pageviewcount}, '{execution_date}'"
                    ");\n")


fetch_pageviews = PythonOperator(
    task_id="fetch_pageviews",
    python_callable=_fetch_pageviews,
    op_kwargs={
        "pagenames": {"Google", "Amazon", "Apple", "Microsoft", "Facebook"}
    },
    provide_context=True,
    dag=dag,
)

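# Run the generated postgres_query.sql against the my_postgres connection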
write_to_postgres = PostgresOperator(
    task_id="write_to_postgres",
    postgres_conn_id="my_postgres",
    sql="postgres_query.sql",
    dag=dag,
)

get_data >> extract_gz >> fetch_pageviews >> write_to_postgres