"db_user": "******", } get = BashOperator( task_id='get_' + source, bash_command= 'npm run get {{ params.source }} --prefix=~/airflow/dags/facdb_1_download -- --ftp_user={{ params.ftp_user }} --ftp_pass={{ params.ftp_pass }} --download_dir={{ params.download_dir }}', params=params, dag=facdb_1_download) push = BashOperator( task_id='push_' + source, bash_command= "npm run push {{ params.source }} --prefix=~/airflow/dags/facdb_1_download -- --db={{ params.db }} --db_user={{ params.db_user }} --download_dir={{ params.download_dir }}", params=params, dag=facdb_1_download) get >> push if os.path.isfile( "/home/airflow/airflow/dags/facdb_1_download/datasets/{0}/after.sql" .format(source)): after = PostgresOperator( task_id='after_' + source, postgres_conn_id='facdb', sql="/facdb_1_download/datasets/{0}/after.sql".format(source), dag=facdb_1_download) push >> after >> trigger_facdb_2_assembly else: push >> trigger_facdb_2_assembly
    dag=dag
)

transform_population_task = PythonOperator(
    task_id="transform_population",
    python_callable=transform_population_func,
    op_kwargs={
        "input_csv_file_name": f"{project_dir}/data/17100009.csv",
        "spark_output_dir": f"{project_dir}/population"
    },
    dag=dag
)

create_contributions_in_postgres = PostgresOperator(
    task_id="create_contributions_in_postgres",
    sql=sql_queries.create_contributions,
    postgres_conn_id="postgres",
    dag=dag
)

create_population_in_postgres = PostgresOperator(
    task_id="create_population_in_postgres",
    sql=sql_queries.create_population,
    postgres_conn_id="postgres",
    dag=dag
)

load_contributions_to_postgres = PythonOperator(
    task_id="load_contributions_to_postgres",
    python_callable=load_spark_csv_to_postgres,
    op_kwargs={
        "spark_csv_dir": f"{project_dir}/contributions",
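# The load_spark_csv_to_postgres callable referenced above is not shown in this
# excerpt. Below is a minimal sketch of what such a loader might look like,
# assuming Spark wrote standard part-*.csv files with headers and that a
# "postgres" connection is configured; the table_name default is hypothetical.
import glob

from airflow.hooks.postgres_hook import PostgresHook


def load_spark_csv_to_postgres(spark_csv_dir, table_name="contributions"):
    """Load every Spark-produced CSV part file into a Postgres table via COPY."""
    hook = PostgresHook(postgres_conn_id="postgres")
    for part_file in sorted(glob.glob(f"{spark_csv_dir}/part-*.csv")):
        hook.copy_expert(f"COPY {table_name} FROM STDIN WITH CSV HEADER", part_file)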
        t_srs="EPSG:28992",
        input_file_sep="SEMICOLON",
        auto_detect_type="YES",
        geometry_name="geometrie",
        mode="PostgreSQL",
        db_conn=db_conn,
    )
    for key, file in files_to_download.items()
]

# 6. Re-define the geometry column type (ogr2ogr cannot set it on a .csv import),
#    except for themas itself, which is a dimension (parent) table of the
#    veiligeafstanden table.
redefine_geoms = [
    PostgresOperator(
        task_id=f"re-define_geom_{key}",
        sql=SET_GEOM,
        params=dict(tablename=f"{dag_id}_{key}_new"),
    )
    for key in files_to_download.keys()
    if key == "veiligeafstanden"
]

# 7. Add thema context to the child tables from the parent table (themas),
#    except for themas itself, which is a dimension (parent) table of the
#    veiligeafstanden table.
add_thema_contexts = [
    PostgresOperator(
        task_id=f"add_context_{key}",
        sql=ADD_THEMA_CONTEXT,
        params=dict(tablename=f"{dag_id}_{key}_new",
                    parent_table=f"{dag_id}_themas_new"),
    )
    for key in files_to_download.keys()
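# SET_GEOM and ADD_THEMA_CONTEXT are SQL templates defined elsewhere in the
# project. As a hedged illustration only, a geometry redefinition like SET_GEOM
# might look like the sketch below; the column name follows geometry_name above
# and the SRID follows t_srs="EPSG:28992", but the exact statement is an
# assumption, not the project's definition.
SET_GEOM = """
    ALTER TABLE {{ params.tablename }}
    ALTER COLUMN geometrie TYPE geometry(Geometry, 28992)
    USING ST_SetSRID(geometrie::geometry, 28992);
"""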
AWS_KEY = os.environ.get('AWS_KEY')
AWS_SECRET = os.environ.get('AWS_SECRET')

default_args = {
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql=create_tables.create_table_queries)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",  # bucket name only; a leading slash here would break the s3:// path
    s3_key="log_data")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
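# StageToRedshiftOperator is a project-specific custom operator not shown in this
# excerpt. Below is a minimal sketch of how such an operator is commonly written
# for this pipeline, assuming a JSON COPY from S3; the COPY options and field
# names are assumptions mirroring the call site above, not the project's code.
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class StageToRedshiftOperator(BaseOperator):
    @apply_defaults
    def __init__(self, table="", redshift_conn_id="", aws_credentials_id="",
                 s3_bucket="", s3_key="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.table = table
        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key

    def execute(self, context):
        # Resolve AWS credentials and run a Redshift COPY from the S3 path.
        credentials = AwsHook(self.aws_credentials_id).get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"
        redshift.run(f"""
            COPY {self.table}
            FROM '{s3_path}'
            ACCESS_KEY_ID '{credentials.access_key}'
            SECRET_ACCESS_KEY '{credentials.secret_key}'
            FORMAT AS JSON 'auto'
        """)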
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_categories',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly added menu categories.')

t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS categories (
        id          SERIAL PRIMARY KEY,
        name        VARCHAR(64) NOT NULL,
        menu_id     INTEGER REFERENCES menus(id),
        description TEXT,
        UNIQUE (name, menu_id)
    );''',
                      dag=dag)

t2 = PostgresOperator(task_id='insert',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    INSERT INTO categories (id, name, menu_id, description)
    SELECT id, name, menu_id, description
    FROM tmp_categories;
    ''',
                      dag=dag)
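# The excerpt stops before any dependencies are declared. The create-if-not-exists
# step must run before the insert, so the implied wiring is the sketch below
# (not shown in the original excerpt):
t1 >> t2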
def load_trip_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook("aws_credentials")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")
    sql_stmt = sql.COPY_ALL_TRIPS_SQL.format(
        credentials.access_key,
        credentials.secret_key,
    )
    redshift_hook.run(sql_stmt)


dag = DAG('lesson2.demo1', start_date=datetime.datetime.now())

create_trips_table = PostgresOperator(task_id="create_trips_table",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=sql.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
)

location_traffic_task = PostgresOperator(task_id="calculate_location_traffic",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql.LOCATION_TRAFFIC_SQL)

create_trips_table >> copy_trips_task
# TODO: First, load the Airflow UI and run this DAG once.
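# sql.COPY_ALL_TRIPS_SQL above is formatted with the AWS access key and secret
# before being run. A hedged sketch of what such a statement typically looks
# like is below; the bucket/key and CSV options are assumptions, not the
# exercise's exact definition.
COPY_ALL_TRIPS_SQL = """
COPY trips
FROM 's3://udacity-dend/data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv'
ACCESS_KEY_ID '{}'
SECRET_ACCESS_KEY '{}'
IGNOREHEADER 1
DELIMITER ','
"""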
    'email': '*****@*****.**',
    'email_on_failure': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# Initiating the DAG.
dag = airflow.DAG(
    dag_id='outlier_detector',
    schedule_interval="@weekly",
    default_args=args,
    max_active_runs=1)

task0 = PostgresOperator(
    task_id='pull_data_from_postgres',
    sql=sql_pull_data.format(filename),
    postgres_conn_id='postgres_default',
    dag=dag)

task1 = PythonOperator(
    task_id='detect_outliers',
    provide_context=True,
    op_args=[filename],
    python_callable=detect_outliers,
    dag=dag)

task2 = PythonOperator(
    task_id='send_email_if_outliers',
    provide_context=True,
    # All the variables used below should be set up as environment variables.
    op_args=[email_to, email_cc, subject, message],
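# detect_outliers and send_email_if_outliers are project callables not shown in
# this excerpt. A minimal sketch of the XCom handoff they imply is below; the
# column name and the z-score rule are illustrative assumptions.
import pandas as pd


def detect_outliers(filename, **context):
    """Flag rows more than 3 standard deviations from the mean and push to XCom."""
    df = pd.read_csv(filename)
    zscores = (df["value"] - df["value"].mean()) / df["value"].std()
    outliers = df[zscores.abs() > 3]
    # The downstream email task (provide_context=True) can pull this from XCom.
    context["ti"].xcom_push(key="outliers", value=outliers.to_json())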
    python_callable=process_osm.modify_tables,
)

osm_add_metadata = PythonOperator(task_id="add-osm-metadata",
                                  python_callable=import_osm.add_metadata)

setup >> osm_download >> osm_import >> osm_migrate >> osm_add_metadata

# VG250 (Verwaltungsgebiete 250) data import
vg250_download = PythonOperator(
    task_id="download-vg250",
    python_callable=import_vg250.download_vg250_files,
)

vg250_import = PythonOperator(task_id="import-vg250",
                              python_callable=import_vg250.to_postgres)

vg250_nuts_mview = PostgresOperator(
    task_id="vg250_nuts_mview",
    sql="vg250_lan_nuts_id_mview.sql",
    postgres_conn_id="egon_data",
    autocommit=True,
)

vg250_metadata = PythonOperator(
    task_id="add-vg250-metadata",
    python_callable=import_vg250.add_metadata,
)

vg250_clean_and_prepare = PostgresOperator(
    task_id="vg250_clean_and_prepare",
    sql="cleaning_and_preparation.sql",
    postgres_conn_id="egon_data",
    autocommit=True,
)

setup >> vg250_download >> vg250_import >> vg250_nuts_mview
vg250_nuts_mview >> vg250_metadata >> vg250_clean_and_prepare
    project_root=project_root,
    repos=[
        {'name': 'flights_meta', 'zenodo_id': flights_repo},
        # {'name': 'tweets_meta', 'zenodo_id': tweets_repo},  # TODO: out of scope of this version
    ])

covid_data_task = RawDataHandler(task_id="covid_data_downloader",
                                 dag=dag,
                                 destination_folder=output_path,
                                 s3_bucket='udacity-awss',
                                 aws_credentials_id="s3_credentials")

create_tables_task = PostgresOperator(task_id="create_tables",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=SqlQueries.create_sttmts)

create_emr_task = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=EmrHandler.JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_credentials",
    emr_conn_id="emr_connection",
    dag=dag)

add_emr_mount_task = EmrAddStepsOperator(
    task_id='add_emr_mount',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id='aws_credentials',
    steps=EmrHandler.SPARK_STEP_MOUNT,
bash_command=f"psql {pg_params()} < {path}", ) ) for path in ( f"{tmp_dir}/hior_properties_new.sql", f"{tmp_dir}/hior_attributes_new.sql", ): name = pathlib.Path(path).stem import_linked_tables.append( BashOperator( task_id=f"create_{name}", bash_command=f"psql {pg_params()} < {path}", ) ) rename_table = PostgresOperator(task_id="rename_table", sql=SQL_TABLE_RENAME) # Grant database permissions grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id) ( slack_at_start >> fetch_xls >> convert_data >> create_table >> import_tables[1:] >> rename_table >> grant_db_permissions )
    table='reopening_tier',
    source_table='staging_reopening_tier',
    sql=LoadFactQueries.load_open_tiers)

# Transform the staging tables and load the nationwide cases fact table on Redshift.
loadfact_nationwide = LoadFactOperator(task_id='Loadfact_nationwide',
                                       dag=dag,
                                       redshift_conn_id='redshift',
                                       table='other_states_cases',
                                       source_table='nationwide_cases',
                                       sql=LoadFactQueries.load_nationwide)

# Transform the staging tables and load the healthcare facilities dimension table on Redshift.
loaddim_healthcare = PostgresOperator(
    task_id='Loaddim_healthcare',
    dag=dag,
    sql=LoadDimensionQueries.load_healthcare_facs,
    postgres_conn_id='redshift')

# Transform the staging tables and load the county dimension table on Redshift.
loaddim_county = PostgresOperator(task_id='Loaddim_county',
                                  dag=dag,
                                  sql=LoadDimensionQueries.load_county,
                                  postgres_conn_id='redshift')

# Transform the staging tables and load the prison dimension table on Redshift.
loaddim_prison = PostgresOperator(task_id='Loaddim_prison',
                                  dag=dag,
                                  sql=LoadDimensionQueries.load_prison,
                                  postgres_conn_id='redshift')
# Run this DAG only once, to create the tables.
dag = DAG('01_sparkify_create_tables_dag',
          default_args=default_args,
          description='Create staging, fact, and dimension tables in Redshift with Airflow',
          schedule_interval='@once',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Define tasks to create tables in Redshift.
create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create)
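# The excerpt stops before the dependency wiring. A minimal sketch of how these
# create-table tasks are typically fanned out from the start operator (an
# assumption; the original wiring is not shown):
start_operator >> [
    create_staging_events_table,
    create_staging_songs_table,
    create_songplays_table,
]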
    'owner': '211dashboard',
    'start_date': datetime(2020, 6, 1),
    'concurrency': 1,
    'retries': 0,
    'depends_on_past': False,
    'catchup': False
}

dag = DAG(dag_id='211dash_manual_update',
          schedule_interval='@once',
          template_searchpath=SEARCH_PATH,
          default_args=args)

''' Define manual update operators. '''

''' 1. Census data operators '''

truncate_core_census_tables = PostgresOperator(
    task_id='truncate_core_census_tables',
    sql='trnctTbls_census.sql',
    dag=dag)

transform_census_county_files = PythonOperator(
    task_id='transform_census_county_files',
    python_callable=transform_static_s3,
    op_kwargs={
        'data': 'census_county',
        'filename': 'census_data_by_county.csv',
        'resource_path': RESOURCE_PATH,
        'transformer': transform_census_data,
        'sep': '|'
    },
    dag=dag)

transform_census_tract_files = PythonOperator(
    task_id='transform_census_tract_files',
"lesson3.exercise3", stations_task_id, "redshift", "aws_default", "stations", sql_statements.CREATE_STATIONS_TABLE_SQL, s3_bucket="udacity-dend", s3_key="data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv", start_date=start_date, ), task_id=stations_task_id, dag=dag, ) # # TODO: Consolidated multiple HasRowsOperator into subdag # location_traffic_task = PostgresOperator( task_id="calculate_location_traffic", dag=dag, postgres_conn_id="redshift", sql=sql_statements.LOCATION_TRAFFIC_SQL ) # # TODO: Reorder the Graph once you have moved the checks # trips_subdag_task >> location_traffic_task stations_subdag_task >> location_traffic_task
    'retry_delay': timedelta(minutes=5),
}

# Defining the DAG
dag = DAG('udac_capstone_dag',
          default_args=default_args,
          description='Transform data in S3 and load to Redshift with Airflow',
          schedule_interval='@monthly',
          max_active_runs=1,
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Task to create tables in Redshift
create_tables_task = PostgresOperator(task_id="create_tables",
                                      sql='capstone_create_tables.sql',
                                      postgres_conn_id="redshift",
                                      dag=dag)

# Initial processing of datasets and loading cleansed data to S3
S3_immig_task = BashOperator(
    task_id='load_to_S3_immig',
    bash_command='python /home/workspace/airflow/dags/script/etl_immig.py',
    dag=dag)

S3_temp_task = BashOperator(
    task_id='load_to_S3_temp',
    bash_command='python /home/workspace/airflow/dags/script/etl_temp.py',
    dag=dag)

S3_us_demog_task = BashOperator(
# Tasks definition:
# -----------------
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Drop tables:
# ------------
redshift_conn_id = "redshift"

drop_tables_subtask = PostgresOperator(
    task_id="dropping_staging_events_table",
    dag=dag,
    postgres_conn_id=redshift_conn_id,
    sql=("""DROP TABLE IF EXISTS staging_events;
            DROP TABLE IF EXISTS staging_songs;
            DROP TABLE IF EXISTS songplays;
            DROP TABLE IF EXISTS artists;
            DROP TABLE IF EXISTS songs;
            DROP TABLE IF EXISTS time;
            DROP TABLE IF EXISTS users;
         """))

# Create tables:
# --------------
create_staging_events_table_task = PostgresOperator(
    task_id="create_staging_events_table",
    dag=dag,
    postgres_conn_id=redshift_conn_id,
    sql=SqlQueries.create_staging_events_table.format("staging_events"))
with DAG("prepare_code_postal", default_args=default_args, schedule_interval=None) as dag: cmd = "mkdir -p $DIR_PATH & wget \"$URL\" -O $DIR_PATH/code_postal.csv" download = BashOperator( env={ "URL": "https://www.data.gouv.fr/fr/datasets/r/554590ab-ae62-40ac-8353-ee75162c05ee", "DIR_PATH": "{data_dir}/communes".format(data_dir=DATA_DIR)}, task_id="download", bash_command=cmd) load = EmbulkOperator( task_id="load", embulk_config="code_postal.yml.liquid") prepare = PythonOperator( task_id="prepare", python_callable=recipes.prepare_code_postal) create_index = PostgresOperator( task_id="create_index", sql=textwrap.dedent(""" CREATE INDEX code_postal_code_insee_idx ON etl.code_postal (code_insee)"""), postgres_conn_id=CONN_ID) download >> load >> prepare >> create_index
create_tables = PostgresOperator(
    task_id="create_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql="""
        CREATE TABLE IF NOT EXISTS public.artists (
            artistid  varchar(256) NOT NULL,
            name      varchar(256),
            location  varchar(256),
            lattitude numeric(18,0),
            longitude numeric(18,0)
        );

        CREATE TABLE IF NOT EXISTS public.songplays (
            playid     varchar(32) NOT NULL,
            start_time timestamp NOT NULL,
            userid     int4 NOT NULL,
            "level"    varchar(256),
            songid     varchar(256),
            artistid   varchar(256),
            sessionid  int4,
            location   varchar(256),
            user_agent varchar(256),
            CONSTRAINT songplays_pkey PRIMARY KEY (playid)
        );

        CREATE TABLE IF NOT EXISTS public.songs (
            songid   varchar(256) NOT NULL,
            title    varchar(256),
            artistid varchar(256),
            "year"   int4,
            duration numeric(18,0),
            CONSTRAINT songs_pkey PRIMARY KEY (songid)
        );

        CREATE TABLE IF NOT EXISTS public.staging_events (
            artist        varchar(256),
            auth          varchar(256),
            firstname     varchar(256),
            gender        varchar(256),
            iteminsession int4,
            lastname      varchar(256),
            length        numeric(18,0),
            "level"       varchar(256),
            location      varchar(256),
            "method"      varchar(256),
            page          varchar(256),
            registration  numeric(18,0),
            sessionid     int4,
            song          varchar(256),
            status        int4,
            ts            int8,
            useragent     varchar(256),
            userid        int4
        );

        CREATE TABLE IF NOT EXISTS public.staging_songs (
            num_songs        int4,
            artist_id        varchar(256),
            artist_name      varchar(256),
            artist_latitude  numeric(18,0),
            artist_longitude numeric(18,0),
            artist_location  varchar(256),
            song_id          varchar(256),
            title            varchar(256),
            duration         numeric(18,0),
            "year"           int4
        );

        CREATE TABLE IF NOT EXISTS public.users (
            userid     int4 NOT NULL,
            first_name varchar(256),
            last_name  varchar(256),
            gender     varchar(256),
            "level"    varchar(256),
            CONSTRAINT users_pkey PRIMARY KEY (userid)
        );

        CREATE TABLE IF NOT EXISTS time (
            start_time timestamp PRIMARY KEY,
            hour       integer,
            day        integer,
            week       integer,
            month      integer,
            year       integer,
            weekday    integer
        )
    """
)
def init_dims_sub_dag(parent_dag_name, child_dag_name, start_date, redshift_conn_id):
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name), start_date=start_date)

    drop_dim_vehicles_task = PostgresOperator(
        task_id='drop_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLES)

    drop_dim_vehicle_models_task = PostgresOperator(
        task_id='drop_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_VEHICLE_MODELS)

    drop_dim_rental_zones_task = PostgresOperator(
        task_id='drop_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_RENTAL_ZONES)

    drop_dim_companies_task = PostgresOperator(
        task_id='drop_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_COMPANIES)

    drop_dim_categories_task = PostgresOperator(
        task_id='drop_dim_categories',  # fixed typo: was 'drop_dim_categroies'
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_CATEGORIES)

    drop_dim_date_task = PostgresOperator(
        task_id='drop_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_DATE)

    drop_dim_weather_task = PostgresOperator(
        task_id='drop_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.DROP_TABLE_DIM_WEATHER)

    create_dim_vehicles_task = PostgresOperator(
        task_id='create_dim_vehicles',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLES)

    create_dim_vehicle_models_task = PostgresOperator(
        task_id='create_dim_vehicle_models',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_VEHICLE_MODELS)

    create_dim_rental_zones_task = PostgresOperator(
        task_id='create_dim_rental_zones',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_RENTAL_ZONES)

    create_dim_companies_task = PostgresOperator(
        task_id='create_dim_companies',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_COMPANIES)

    create_dim_categories_task = PostgresOperator(
        task_id='create_dim_categories',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_CATEGORIES)

    create_dim_date_task = PostgresOperator(
        task_id='create_dim_date',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_DATE)

    create_dim_weather_task = PostgresOperator(
        task_id='create_dim_weather',
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=init_statements.CREATE_TABLE_DIM_WEATHER)

    # Each dimension table is dropped before being re-created.
    drop_dim_vehicles_task >> create_dim_vehicles_task
    drop_dim_vehicle_models_task >> create_dim_vehicle_models_task
    drop_dim_rental_zones_task >> create_dim_rental_zones_task
    drop_dim_companies_task >> create_dim_companies_task
    drop_dim_categories_task >> create_dim_categories_task
    drop_dim_date_task >> create_dim_date_task
    drop_dim_weather_task >> create_dim_weather_task

    return dag
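# A factory like init_dims_sub_dag is typically wired into a parent DAG with a
# SubDagOperator; a minimal usage sketch is below. The parent DAG name and dates
# are assumptions, and the SubDagOperator task_id must match child_dag_name.
import datetime

from airflow.operators.subdag_operator import SubDagOperator

init_dims_task = SubDagOperator(
    subdag=init_dims_sub_dag(
        parent_dag_name="car_sharing_dwh",  # hypothetical parent dag_id
        child_dag_name="init_dims",
        start_date=datetime.datetime(2019, 1, 1),
        redshift_conn_id="redshift",
    ),
    task_id="init_dims",
    dag=dag,
)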
    description='Loads newly registered restaurants daily.')

# Wait for the new_food_deliveries DAG to complete.
t1 = ExternalTaskSensor(task_id='wait_for_new_food_deliveries',
                        external_dag_id='new_food_deliveries',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS restaurants (
        id                SERIAL PRIMARY KEY,
        created_at        TIMESTAMP NOT NULL,
        updated_at        TIMESTAMP NOT NULL,
        name              VARCHAR(64) NOT NULL,
        email             VARCHAR(64) UNIQUE NOT NULL,
        address           VARCHAR(64) NOT NULL,
        phone             VARCHAR(64) NOT NULL,
        city_id           INTEGER REFERENCES cities(id),
        business_hours_id INTEGER REFERENCES business_hours(id),
        description       TEXT
    );''',
                      dag=dag)

t3 = PostgresOperator(task_id='etl',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    INSERT INTO restaurants (id, created_at, updated_at, name, email, address,
                             phone, city_id, business_hours_id, description)
    SELECT id, created_at, updated_at, name, email, address,
           phone, city_id, business_hours_id, description
    FROM tmp_restaurants;
    ''',
)

load_and_analyze = PythonOperator(
    task_id='load_and_analyze',
    dag=dag,
    python_callable=load_and_analyze,
    provide_context=True,
)

create_oldest_task = PostgresOperator(
    task_id="create_oldest",
    dag=dag,
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
    postgres_conn_id="redshift"
)

log_oldest_task = PythonOperator(
    task_id="log_oldest",
    dag=dag,
    python_callable=log_oldest
)

create_youngest_task = PostgresOperator(
    task_id="create_youngest",
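# log_oldest above mirrors the log_youngest pattern that appears elsewhere in
# these exercises (see the younger_riders query later in this section). A
# minimal sketch, assuming the older_riders table created by create_oldest_task:
import logging

from airflow.hooks.postgres_hook import PostgresHook


def log_oldest():
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Oldest rider was born in {records[0][0]}")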
from has_rows import HasRowsOperator

import sql_statements

dag = DAG(
    'bicycle_sharing_example',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL
)

# https://airflow.apache.org/docs/apache-airflow/stable/_modules/airflow/models/baseoperator.html
copy_trips_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    redshift_conn_id="redshift",
    table="trips",
    s3_path="s3://udacity-dend/data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    sla=datetime.timedelta(hours=1)
)

check_trips = HasRowsOperator(
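# HasRowsOperator comes from a local has_rows module not shown in this excerpt.
# A minimal sketch of how such a data-quality operator is commonly written (the
# field names follow the usage above; the exact module code is an assumption):
import logging

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        # Fail the task if the target table is missing rows.
        redshift = PostgresHook(self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {self.table} has no rows")
        logging.info(f"Data quality check passed: {self.table} has {records[0][0]} rows")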
    'start_date': datetime(2013, 1, 1, 0, 0, 0),
    'depends_on_past': True
}

dag = DAG(
    USERNAME + '_final_project_dwh_etl_traffic',
    default_args=default_args,
    description='Final project DWH ETL traffic',
    schedule_interval="0 0 1 1 *",
    max_active_runs=1,
)

clear_ods = PostgresOperator(
    task_id="clear_ods",
    dag=dag,
    sql="""
        DELETE FROM ygladkikh.project_ods_traffic
        WHERE EXTRACT(YEAR FROM time_stamp) = {{ execution_date.year }}
    """
)

fill_ods = PostgresOperator(
    task_id="fill_ods",
    dag=dag,
    sql="""
        INSERT INTO ygladkikh.project_ods_traffic
        SELECT user_id,
               to_timestamp("timestamp" / 1000),
               device_id,
               device_ip_addr,
               bytes_sent,
               bytes_received
        FROM ygladkikh.project_stg_traffic
        WHERE EXTRACT(YEAR FROM to_timestamp("timestamp" / 1000)) = {{ execution_date.year }}
    """
)
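# The excerpt ends before dependencies are declared. The delete-then-insert
# pattern above implies this ordering (a sketch; not shown in the original):
clear_ods >> fill_ods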
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Youngest rider was born in {records[0][0]}")


dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow())

create_oldest_task = PostgresOperator(task_id="create_oldest",
                                      dag=dag,
                                      sql="""
    BEGIN;
    DROP TABLE IF EXISTS older_riders;
    CREATE TABLE older_riders AS (
        SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
    );
    COMMIT;
    """,
                                      postgres_conn_id="redshift")

create_youngest_task = PostgresOperator(task_id="create_youngest",
                                        dag=dag,
                                        sql="""
    BEGIN;
    DROP TABLE IF EXISTS younger_riders;
    CREATE TABLE younger_riders AS (
        SELECT * FROM trips WHERE birthyear > 2000
    );
    COMMIT;
        for line in f:
            domain_code, page_title, view_counts, _ = line.split(" ")
            if domain_code == "en" and page_title in pagenames:
                result[page_title] = view_counts

    with open("/tmp/postgres_query.sql", "w") as f:
        for pagename, pageviewcount in result.items():
            f.write(
                "INSERT INTO pageview_counts VALUES ("
                f"'{pagename}', {pageviewcount}, '{execution_date}'"
                ");\n"
            )


fetch_pageviews = PythonOperator(
    task_id="fetch_pageviews",
    python_callable=_fetch_pageviews,
    op_kwargs={
        "pagenames": {"Google", "Amazon", "Apple", "Microsoft", "Facebook"}
    },
    provide_context=True,
    dag=dag,
)

write_to_postgres = PostgresOperator(
    task_id="write_to_postgres",
    postgres_conn_id="my_postgres",
    sql="postgres_query.sql",
    dag=dag,
)

get_data >> extract_gz >> fetch_pageviews >> write_to_postgres
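# For write_to_postgres above to resolve the bare filename "postgres_query.sql"
# written to /tmp by _fetch_pageviews, the file's directory must be on the DAG's
# template search path. A minimal sketch of the DAG definition this excerpt
# implies (the dag_id, schedule, and start date are assumptions):
import airflow.utils.dates
from airflow import DAG

dag = DAG(
    dag_id="wikipedia_pageviews",  # hypothetical dag_id
    start_date=airflow.utils.dates.days_ago(1),
    schedule_interval="@hourly",
    template_searchpath="/tmp",  # lets sql="postgres_query.sql" be found
)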