def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    #
    check_task = HasRowsOperator(
        task_id=f"check_{table}_has_rows",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table)

    create_task >> copy_task
    copy_task >> check_task

    #
    # TODO: Use DAG ordering to place the check task
    #
    return dag
def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        staging_table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        create_table_option=True,
        redshift_copy_params=[],
        *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table_task = PythonOperator(
        task_id=f"create_{staging_table}_table",
        dag=dag,
        python_callable=create_table,
        provide_context=True,
        params={
            'create_table_option': create_table_option,
            'redshift_conn_id': redshift_conn_id,
            'create_sql_stmt': create_sql_stmt,
            'table': staging_table
        })

    stage_table_from_s3_task = S3ToRedshiftOperator(
        dag=dag,
        task_id=f"load_{staging_table}_from_s3_to_redshift",
        redshift_conn_id=redshift_conn_id,
        aws_conn_id=aws_credentials_id,
        table=staging_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        overwrite=True,
        copy_params=redshift_copy_params)

    verify_staged_table_task = PostgresHasRowsOperator(
        dag=dag,
        task_id=f"verify_{staging_table}_count",
        table=staging_table,
        connection_id=redshift_conn_id)

    create_table_task >> stage_table_from_s3_task >> verify_staged_table_task

    return dag
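# The PythonOperator above points at a `create_table` callable that is not part
# of this snippet. A minimal sketch of what it might look like, assuming it
# reads its settings from the task's params and issues the DDL through
# PostgresHook (names and logic here are illustrative, not the original code):
from airflow.hooks.postgres_hook import PostgresHook


def create_table(**context):
    params = context['params']
    if not params.get('create_table_option', True):
        return  # table creation was switched off for this run
    hook = PostgresHook(postgres_conn_id=params['redshift_conn_id'])
    hook.run(params['create_sql_stmt'])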
def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs):
    dag = DAG(
        dag_id=f"{parent_dag_name}.{task_id}",  # subdags' dag_id must follow this naming pattern
        **kwargs
    )

    # TODO: Move the create table code here
    create_table_task = PostgresOperator(
        task_id=f"create_{table}_table",
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt,
        dag=dag
    )

    # TODO: Move the S3ToRedshiftOperator code here
    copy_table_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        dag=dag
    )

    # TODO: Move the check table code here
    check_table_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        redshift_conn_id=redshift_conn_id,
        table=table,
        dag=dag
    )

    # TODO: Define ordering of tasks within this subdag
    create_table_task >> copy_table_task
    copy_table_task >> check_table_task

    return dag
def create_task(table):
    """Returns an operator for copying the table into Redshift"""
    return S3ToRedshiftOperator(
        dag=dag,
        task_id=f"copy_{table.get('table_name', None)}_to_redshift",
        redshift_conn_id=redshift_conn_id,
        redshift_schema=redshift_schema,
        table=table.get('table_name', None),
        s3_conn_id=s3_conn_id,
        s3_bucket=s3_bucket,
        s3_key=table.get('s3_key', None),
        load_type=load_type,
        copy_params=table.get('copy_params', None),
        origin_schema=table.get('origin_schema', None),
        primary_key=table.get('primary_key', None),
        foreign_key=table.get('foreign_key', {}),
        schema_location=schema_location)
def get_s3_to_redshift(parent_dag_name,
                       task_id,
                       tables_definition,
                       redshift_conn_id,
                       redshift_schema,
                       s3_conn_id,
                       s3_bucket,
                       load_type,
                       schema_location,
                       *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    for table in tables_definition:
        S3ToRedshiftOperator(
            dag=dag,
            task_id=f"copy_{table.get('table_name', None)}_to_redshift",
            redshift_conn_id=redshift_conn_id,
            redshift_schema=redshift_schema,
            table=f"staging_{table.get('table_name', None)}",
            s3_conn_id=s3_conn_id,
            s3_bucket=s3_bucket,
            s3_key=table.get('s3_key', None),
            load_type=load_type,
            copy_params=table.get('copy_params', None),
            origin_schema=table.get('origin_schema', None),
            schema_location=schema_location)

    return dag
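# The two snippets above expect `tables_definition` to be a list of dicts whose
# keys match the `table.get(...)` lookups. A hypothetical entry, purely for
# illustration (the field values are not taken from the original code):
tables_definition = [
    {
        'table_name': 'orders',
        's3_key': 'exports/orders/{{ ds }}/orders.json',
        'copy_params': ["JSON 'auto'", "TRUNCATECOLUMNS"],
        'origin_schema': 'orders_schema.json',
        'primary_key': 'id',   # read by create_task(); optional here
        'foreign_key': {},
    },
]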
S3_KEY = 'github/{0}/{1}.json'.format(org['name'], endpoint['name'])
TI_PREFIX = 'github_{0}_from_{1}'.format(endpoint['name'], org['name'])
GITHUB_TASK_ID = '{0}_to_s3'.format(TI_PREFIX)
REDSHIFT_TASK_ID = '{0}_to_redshift'.format(TI_PREFIX)

github = GithubToS3Operator(
    task_id=GITHUB_TASK_ID,
    github_conn_id=org['github_conn_id'],
    github_org=org['name'],
    github_repo='all',
    github_object=endpoint['name'],
    payload=endpoint['payload'],
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY)

redshift = S3ToRedshiftOperator(
    task_id=REDSHIFT_TASK_ID,
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY,
    origin_schema=ORIGIN_SCHEMA,
    schema_location=SCHEMA_LOCATION,
    load_type=LOAD_TYPE,
    copy_params=COPY_PARAMS,
    redshift_schema=REDSHIFT_SCHEMA,
    table='{0}_{1}'.format(org['name'], endpoint['name']),
    redshift_conn_id=REDSHIFT_CONN_ID,
    primary_key='id')

kick_off_dag >> github >> redshift
    'catchup': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = S3ToRedshiftOperator(
    task_id='stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
)

stage_songs_to_redshift = S3ToRedshiftOperator(
    task_id='staging_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data",
)
    dag=dag,
    provide_context=True)

load_sas_file = PythonOperator(
    task_id='clean_and_load_i94',
    python_callable=load_sas_data,
    dag=dag,
    provide_context=True)

##
##clean_data_check = PythonOperator(task_id='clean_data_query', python_callable=clean_data, dag=dag, provide_context=True)

load_s3_to_redshift = S3ToRedshiftOperator(
    task_id='s3_to_redshift',
    dag=dag,
    schema='public',
    table='us_city_demography',
    redshift_conn_id='pipeline_redshift',
    aws_conn_id='aws_credentials',
    copy_options=tuple(['csv', 'Delimiter', '\';\'', 'IGNOREHEADER 1']),
    s3_bucket='udacitycapstonedata',
    s3_key='us-cities-demographics.csv',
)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    tables=['us_city_demography', 'immigration'],
    redshift_conn_id='pipeline_redshift')

##
analyze_data = PythonOperator(
    task_id='analyze_data_from_redshift',
    python_callable=analyze_query,
    dag=dag,
    provide_context=True)

##
hello_operator = PythonOperator(
    task_id='hello_task',
    python_callable=print_hello,
    dag=dag,
    provide_context=True)

dummy_operator >> create_redshift
schedule_interval="@monthly", max_active_runs=1 ) create_trips_table = PostgresOperator( task_id="create_trips_table", dag=dag, postgres_conn_id="redshift", sql=sql_statements.CREATE_TRIPS_TABLE_SQL ) copy_trips_task = S3ToRedshiftOperator( task_id="load_trips_from_s3_to_redshift", dag=dag, table="trips", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="udac-data-pipelines", s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv" ) # # TODO: Replace this data quality check with the HasRowsOperator # check_trips = HasRowsOperator( task_id='check_trips_data', dag=dag, redshitft_conn_id = "redshift", table = "trips" )
for table in tables:
    S3_KEY = 'salesforce/{}/{}.json'.format('{{ execution_date }}', table['name'].lower())

    salesforce_to_s3 = SalesforceToS3Operator(
        task_id='{0}_to_S3'.format(table['name']),
        sf_conn_id=SF_CONN_ID,
        obj=table,
        output=S3_KEY,
        fmt='ndjson',
        s3_conn_id=S3_CONN_ID,
        s3_bucket=S3_BUCKET,
        record_time_added=True,
        coerce_to_timestamp=True,
        dag=dag)

    s3_to_redshift = S3ToRedshiftOperator(
        task_id='{0}_to_Redshift'.format(table['name']),
        redshift_conn_id=REDSHIFT_CONN_ID,
        redshift_schema=REDSHIFT_SCHEMA_NAME,
        table=table,
        s3_conn_id=S3_CONN_ID,
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
        origin_schema=ORIGIN_SCHEMA,
        copy_params=COPY_PARAMS,
        schema_location=SCHEMA_LOCATION,
        load_type=table['load_type'],
        dag=dag)

    kick_off_dag >> salesforce_to_s3 >> s3_to_redshift
from datetime import datetime

from airflow import DAG
from airflow.operators import S3ToRedshiftOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow import configuration

args = {'owner': 'scopeworker', 'provide_context': True}

dag = DAG('s3_to_redshift_dag',
          description='REDSHIFT DAG',
          schedule_interval='*/1 * * * *',
          start_date=datetime(2017, 3, 20),
          catchup=False,
          default_args=args)

redshift_operator = S3ToRedshiftOperator(
    task_id="s3_to_redshift",
    redshift_conn_id=configuration.get("postgresql", "postgresql_conn_id"),
    s3_bucket="scopeworkerproduction",
    aws_conn_id=configuration.get("s3", "s3_conn_id"),
    s3_access_key_id=configuration.get("s3", "s3_access_key_id"),
    s3_secret_access_key=configuration.get("s3", "s3_secret_access_key"),
    delimiter='|',
    region="us-west-2",
    dag=dag)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

dummy_operator >> redshift_operator
    start_date=datetime.datetime(2020, 6, 21),
    max_active_runs=1)

# create trips table
createTripsTable = PostgresOperator(
    task_id='createTripsTable',
    postgres_conn_id='redshift',
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
    dag=customDag)

# load data from S3 to Redshift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(
    task_id='checkDataQuality',
    redshift_conn_id='redshift',
    table='trips',
    dag=customDag)

# create fact table
createFactTable = FactsCalculatorOperator(
    task_id='createFactTable',
    redshift_conn_id='redshift',
def create_dag(dag_id,
               schedule,
               marketo_conn_id,
               redshift_conn_id,
               redshift_schema,
               s3_conn_id,
               s3_bucket,
               default_args,
               catchup=False):

    dag = DAG(dag_id,
              schedule_interval=schedule,
              default_args=default_args,
              catchup=catchup)

    if 'backfill' in dag_id:
        endpoints = ['leads']
    else:
        endpoints = [
            'activities',
            'campaigns',
            'leads',
            'programs',
            'lead_lists'
        ]

    COPY_PARAMS = [
        "COMPUPDATE OFF",
        "STATUPDATE OFF",
        "JSON 'auto'",
        "TIMEFORMAT 'auto'",
        "TRUNCATECOLUMNS",
        "region as 'us-east-1'"
    ]

    with dag:
        d = DummyOperator(task_id='kick_off_dag')

        l = RateLimitOperator(task_id='check_rate_limit',
                              service='marketo',
                              service_conn_id=marketo_conn_id,
                              threshold=RATE_LIMIT_THRESHOLD,
                              threshold_type=RATE_LIMIT_THRESHOLD_TYPE)

        d >> l

        for endpoint in endpoints:
            MARKETO_SCHEMA = schema[endpoint]
            TABLE_NAME = 'mkto_{0}'.format(endpoint)
            S3_KEY = 'marketo/{0}/{1}_{2}.json'.format(redshift_schema, endpoint, "{{ ts_nodash }}")
            MARKETO_TASK_ID = 'get_{0}_marketo_data'.format(endpoint)
            REDSHIFT_TASK_ID = 'marketo_{0}_to_redshift'.format(endpoint)
            START_AT = "{{ execution_date.isoformat() }}"
            END_AT = "{{ next_execution_date.isoformat() }}"

            m = MarketoToS3Operator(task_id=MARKETO_TASK_ID,
                                    marketo_conn_id=marketo_conn_id,
                                    endpoint=endpoint,
                                    s3_conn_id=s3_conn_id,
                                    s3_bucket=s3_bucket,
                                    s3_key=S3_KEY,
                                    output_format='json',
                                    start_at=START_AT,
                                    end_at=END_AT)

            l >> m

            if endpoint != 'leads':
                r = S3ToRedshiftOperator(task_id=REDSHIFT_TASK_ID,
                                         s3_conn_id=s3_conn_id,
                                         s3_bucket=s3_bucket,
                                         s3_key=S3_KEY,
                                         load_type='rebuild',
                                         load_format='JSON',
                                         schema_location='local',
                                         origin_schema=MARKETO_SCHEMA,
                                         redshift_schema=redshift_schema,
                                         table=TABLE_NAME,
                                         copy_params=COPY_PARAMS,
                                         redshift_conn_id=redshift_conn_id)
                m >> r
            else:
                rl = S3ToRedshiftOperator(task_id=REDSHIFT_TASK_ID,
                                          s3_conn_id=s3_conn_id,
                                          s3_bucket=s3_bucket,
                                          s3_key=S3_KEY,
                                          load_type='upsert',
                                          load_format='JSON',
                                          schema_location='local',
                                          origin_schema=MARKETO_SCHEMA,
                                          redshift_schema=redshift_schema,
                                          table=TABLE_NAME,
                                          primary_key='id',
                                          copy_params=COPY_PARAMS,
                                          incremental_key='updated_at',
                                          redshift_conn_id=redshift_conn_id)
                m >> rl

    return dag
def create_dag(dag_id,
               schedule,
               hubspot_conn_id,
               redshift_conn_id,
               redshift_schema,
               default_args,
               catchup=False,
               max_active_runs=3):
    try:
        h = HttpHook(method='GET', http_conn_id=hubspot_conn_id)
        pw = h.get_connection(conn_id=hubspot_conn_id).password
        response = h.run('oauth/v1/access-tokens/{0}'.format(pw))
        scopes = response.json()['scopes']

        dag = DAG(dag_id,
                  default_args=default_args,
                  schedule_interval=schedule,
                  catchup=catchup,
                  max_active_runs=max_active_runs)

        with dag:
            kick_off_dag = DummyOperator(task_id='kick_off_dag')

            tables_to_build = []
            for endpoint in endpoints:
                if endpoint['scope'] in scopes:
                    if 'backfill' in dag_id and 'startTimestamp':
                        if endpoint['hubspot_args'].keys():
                            tables_to_build.append(endpoint)
                    else:
                        tables_to_build.append(endpoint)

            for table in tables_to_build:
                HUBSPOT_ARGS = table.get('hubspot_args', {})
                TABLE_NAME = table.get('name', '')
                LOAD_TYPE = table.get('load_type', '')
                PRIMARY_KEY = None
                INCREMENTAL_KEY = None

                if 'primary_key' in table.keys():
                    PRIMARY_KEY = table['primary_key']
                if 'incremental_key' in table.keys():
                    INCREMENTAL_KEY = table['incremental_key']

                S3_KEY = ('hubspot/{0}/{1}_{2}.json'.format(
                    redshift_schema, TABLE_NAME, "{{ ts_nodash }}"))
                split_key = path.splitext(S3_KEY)
                LOAD_KEY = '{0}_core'.format(split_key[0])

                h = HubspotToS3Operator(
                    task_id='hubspot_{0}_data_to_s3'.format(TABLE_NAME),
                    hubspot_conn_id=hubspot_conn_id,
                    hubspot_object=TABLE_NAME,
                    hubspot_args=HUBSPOT_ARGS,
                    s3_conn_id=S3_CONN_ID,
                    s3_bucket=S3_BUCKET,
                    s3_key=S3_KEY)

                kick_off_dag >> h

                if table['name'] == 'timeline':
                    pass
                else:
                    r = S3ToRedshiftOperator(
                        task_id='hubspot_{0}_to_redshift'.format(TABLE_NAME),
                        s3_conn_id=S3_CONN_ID,
                        s3_bucket=S3_BUCKET,
                        s3_key=LOAD_KEY,
                        origin_schema=getattr(hubspot_schema, TABLE_NAME),
                        origin_datatype='json',
                        copy_params=COPY_PARAMS,
                        load_type=LOAD_TYPE,
                        primary_key=PRIMARY_KEY,
                        incremental_key=INCREMENTAL_KEY,
                        schema_location='local',
                        redshift_schema=redshift_schema,
                        table=TABLE_NAME,
                        redshift_conn_id=redshift_conn_id)

                    h >> r

                if table['subtables']:
                    for subtable in table['subtables']:
                        SUBTABLE_LOAD_KEY = '{0}_{1}'.format(split_key[0], subtable)
                        SUBTABLE_NAME = '{0}_{1}'.format(TABLE_NAME, subtable)
                        if SUBTABLE_NAME == 'timeline':
                            SUBTABLE_NAME = TABLE_NAME

                        s = S3ToRedshiftOperator(
                            task_id='hubspot_{0}_{1}_to_redshift'.format(TABLE_NAME, subtable),
                            s3_conn_id=S3_CONN_ID,
                            s3_bucket=S3_BUCKET,
                            s3_key=SUBTABLE_LOAD_KEY,
                            origin_schema=getattr(hubspot_schema, '{0}_{1}'.format(TABLE_NAME, subtable)),
                            origin_datatype='json',
                            load_type=LOAD_TYPE,
                            schema_location='local',
                            copy_params=COPY_PARAMS,
                            redshift_schema=redshift_schema,
                            table=SUBTABLE_NAME,
                            redshift_conn_id=REDSHIFT_CONN_ID)

                        h >> s

        return dag
    except:
        pass
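# The Hubspot factory above reads a module-level `endpoints` list whose dicts
# carry the keys it looks up ('scope', 'name', 'load_type', 'hubspot_args',
# 'subtables', and optionally 'primary_key'/'incremental_key'). A hypothetical
# entry, for illustration only; none of these values come from the original code:
endpoints = [
    {
        'name': 'contacts',
        'scope': 'contacts',
        'load_type': 'upsert',
        'primary_key': 'vid',
        'incremental_key': 'lastmodifieddate',
        'hubspot_args': {'startTimestamp': '{{ execution_date.isoformat() }}'},
        'subtables': ['form_submissions'],
    },
]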
# 2. Performs a data quality check on the Trips table in RedShift
# 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#    a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#       skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

#
# TODO: Load trips data from S3 to RedShift. Use the s3_key
#       "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id='copy_trips_task',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv')

#
# TODO: Perform a data quality check on the Trips table
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    table='trips',
    redshift_conn_id='redshift',
)

#
init_tables = PostgresOperator(dag=dag,
                               task_id="init_tables",
                               postgres_conn_id="redshift_songs",
                               sql=dag_sql.init_tables)

verify_staged_songs = PostgresHasRowsOperator(
    task_id="verify_imported_records",
    table="staging_songs",
    connection_id="redshift_songs")

stage_songs_from_s3 = S3ToRedshiftOperator(
    task_id="stage-songs-to-redshift",
    redshift_conn_id="redshift_songs",
    aws_conn_id="s3_songs",
    table="staging_songs",
    s3_bucket=input_bucket,
    s3_key=song_data,
    overwrite=True,
    copy_params=stage_songs_params)

extract_artists = PostgresOperator(task_id="load_artists",
                                   postgres_conn_id="redshift_songs",
                                   sql=dag_sql.load_artists)

extract_songs = PostgresOperator(task_id="load_songs",
                                 postgres_conn_id="redshift_songs",
                                 sql=dag_sql.load_songs)

init_tables >> stage_songs_from_s3
stage_songs_from_s3 >> verify_staged_songs
verify_staged_songs >> [extract_artists, extract_songs]
type = workflow.get('id', None)
name = workflow.get('name', None)
fields = workflow.get('fields', None)

S3_KEY = '{type}_{time}.csv'.format(type=workflow['id'], time=TIME)

imap_to_s3 = ImapToS3Operator(
    task_id='{}_to_s3'.format(type),
    imap_conn_id=IMAP_CONN_ID,
    imap_email=IMAP_EMAIL,
    imap_subject=name,
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY,
)

s3_to_redshift = S3ToRedshiftOperator(
    task_id='{}_to_redshift'.format(type),
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY,
    redshift_conn_id=REDSHIFT_CONN_ID,
    redshift_schema=REDSHIFT_SCHEMA,
    table=type,
    origin_schema=fields,
    schema_location='local',
)

kick_off_dag >> imap_to_s3 >> s3_to_redshift
# create Tasks by instantiating Operator Classes
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=bulkLoadDag)

# IMPORTANT:
# Because the "s3_key" parameter is marked as templatable (it is listed in the
# operator class's "template_fields" variable), it can use Jinja template
# variables such as {{ ds }}, {{ execution_date }}, and so on.
loadTripsData = S3ToRedshiftOperator(
    task_id="loadTripsData",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="trips",
    truncate=False,
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    delimiter=",",
    ignore_headers=1,
    dag=bulkLoadDag)

checkTripsData = HasRowsOperator(task_id='checkTripsData',
                                 redshift_conn_id='redshift',
                                 table='trips',
                                 dag=bulkLoadDag)

createStationsTable = PostgresOperator(
    task_id="createStationsTable",
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
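# The templating behaviour described in the comment above comes from the
# operator declaring "s3_key" in template_fields. A minimal sketch of how a
# custom operator typically opts a parameter into template rendering; this is
# illustrative, not the source of the plugin used in these snippets:
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class TemplatedS3KeyOperator(BaseOperator):
    # any attribute named here is rendered with the task's template context
    template_fields = ("s3_key",)

    @apply_defaults
    def __init__(self, s3_key="", *args, **kwargs):
        super(TemplatedS3KeyOperator, self).__init__(*args, **kwargs)
        self.s3_key = s3_key

    def execute(self, context):
        # by the time execute() runs, self.s3_key has been rendered;
        # some plugins additionally call self.s3_key.format(**context) here
        self.log.info("Rendered s3_key: %s", self.s3_key)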
    default_args=default_args,
    description='Extract, load, and transform covid and google mobility data into redshift',
    schedule_interval='@once',
    catchup=True)

# task definition to create tables in redshift
create_tables = CreateTablesInRedshiftOperator(
    task_id="create_redshift_tables",
    dag=dag,
    redshift_conn_id="redshift")

# task definition to copy location lookup data into staging table
copy_location_lookup_data_task = S3ToRedshiftOperator(
    task_id="copy_location_lookup_data_from_s3_to_redshift",
    dag=dag,
    table="location_lookup_staging",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="btrivedi-udacityproject",
    s3_key="location_lookup/location_lookup_11-24-2020.psv.gz",
    provide_context=True,
)

# task definition to copy covid data into staging table
copy_covid_data_task = S3ToRedshiftOperator(
    task_id="copy_covid_data_from_s3_to_redshift",
    dag=dag,
    table="covid_data_staging",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="btrivedi-udacityproject",
    s3_key="covid/covid",
    file_format="json",
def redshift_table_ddl_plus_bulk_load(parent_dag_id,
                                      parent_task_id,
                                      redshift_conn_id,
                                      aws_credentials_id,
                                      table,
                                      create_table_statement,
                                      s3_bucket,
                                      s3_key,
                                      *args, **kwargs):
    """
    This Python Factory Function returns an Airflow DAG Class instance to be
    used as a SubDag within the (outer) DAG that calls it. The DAG in turn
    executes three Tasks within a Redshift database instance:

    - execute a CREATE TABLE (DDL) statement through PostgresOperator;
    - execute a COPY (bulk load) to pull data from S3 into the newly created
      table through the custom S3ToRedshiftOperator;
    - perform a data quality check through the custom HasRowsOperator.

    Args:
        parent_dag_id (str): dag_id of the outer DAG calling this factory.
        parent_task_id (str): task_id of the SubDagOperator in the outer DAG.
        redshift_conn_id (str): Airflow connection id for Redshift.
        aws_credentials_id (str): Airflow connection id for AWS credentials.
        table (str): target table name.
        create_table_statement (str): CREATE TABLE SQL statement.
        s3_bucket (str): source S3 bucket.
        s3_key (str): source S3 key.
    """
    # instantiate DAG Class
    subDagInstance = DAG(
        #----------------------------------------------------------------------
        #
        # ATTENTION! "parent_dag_id.parent_task_id" IS MANDATORY
        #
        # It's an Airflow convention that subdags must have their "dag_id"
        # attribute set to the combination of:
        #
        # 1. the "dag_id" of the outer DAG instance calling the SubDagOperator,
        #    which in turn calls this Python Factory Function;
        #
        # 2. the "task_id" assigned to the SubDagOperator mentioned in item 1;
        #
        # 3. a dot (".") separating the "dag_id" and "task_id" mentioned in
        #    items 1 and 2, respectively.
        #
        #----------------------------------------------------------------------
        dag_id=f"{parent_dag_id}.{parent_task_id}",
        # make sure keyword arguments are also received
        **kwargs)

    # NOTICE how Task instances are created in a SubDag just as they would be
    # in a "normal" DAG.
    createTable = PostgresOperator(task_id=f'create_{table}_table',
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_table_statement,
                                   dag=subDagInstance)

    bulkLoadTable = S3ToRedshiftOperator(task_id=f'bulk_load_{table}_table',
                                         redshift_conn_id=redshift_conn_id,
                                         aws_credentials_id=aws_credentials_id,
                                         table=table,
                                         s3_bucket=s3_bucket,
                                         s3_key=s3_key,
                                         dag=subDagInstance)

    checkDataQuality = HasRowsOperator(task_id=f'check_{table}_table',
                                       redshift_conn_id=redshift_conn_id,
                                       table=table,
                                       dag=subDagInstance)

    # Define Task dependencies
    createTable >> bulkLoadTable >> checkDataQuality

    return subDagInstance
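# For context, a sketch of how a factory like the one above is typically
# attached to its parent DAG through SubDagOperator. It assumes an `outer_dag`
# DAG object and a `sql_statements` module already exist; the dag_id, bucket,
# and key are placeholders rather than values from the original code. Note that
# the task_id passed to the factory and to SubDagOperator must match so the
# "parent.child" naming convention described above holds.
from airflow.operators.subdag_operator import SubDagOperator

trips_subdag = SubDagOperator(
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id="bulk_load_dag",          # same as the outer dag_id
        parent_task_id="trips_ddl_plus_load",   # same as this SubDagOperator's task_id
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        create_table_statement=sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=outer_dag.start_date,
    ),
    task_id="trips_ddl_plus_load",
    dag=outer_dag,
)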
methods = {'PlayerCareer': 'regular_season_totals',
           'PlayerGameLogs': 'info'}

with dag:
    kick_off_dag = DummyOperator(task_id='basketball_analysis_sideproj')

    for player in players:
        for method in methods.keys():
            to_s3 = NbaToS3Operator(
                task_id='{0}_{1}_to_s3'.format(player, method),
                player_name=player,
                endpoint='player',
                method=method,
                id=players[player],
                stats=methods[method],
                s3_conn_id='astronomer-s3',
                s3_bucket='astronomer-workflows-dev',
                s3_key='{0}_{1}.json'.format(player, method),
            )

            s3_to_redshift = S3ToRedshiftOperator(
                task_id='{0}_{1}_to_redshift'.format(player, method),
                redshift_conn_id='astronomer-redshift-dev',
                redshift_schema='viraj_testing',
                table='{player}_{stats}'.format(player=player, stats=methods[method]),
                s3_conn_id='astronomer-s3',
                s3_bucket='astronomer-workflows-dev',
                s3_key='{0}_{1}.json'.format(player, method),
                origin_schema='{0}_{1}_schema.json'.format(player, method),
                load_type='rebuild')

            kick_off_dag >> to_s3 >> s3_to_redshift
    dag=dag)

# Append the breakdown fields (primary keys) after the
# primary keys that are present in every workflow
output_table_fields = list(fields)
output_table_fields = output_table_fields[:4] + breakdown['fields'] + output_table_fields[4:]

primary_key = [
    'ad_id',
    'adset_id',
    'campaign_id',
    'account_id',
    'date_start'
]
primary_key.extend(breakdown_fields)

s3_to_redshift = S3ToRedshiftOperator(
    task_id='s3_{}_to_redshift'.format(breakdown['name']),
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY,
    redshift_conn_id=REDSHIFT_CONN_ID,
    redshift_schema=REDSHIFT_SCHEMA,
    copy_params=COPY_PARAMS,
    table=breakdown['name'],
    origin_schema=output_table_fields,
    schema_location='local',
    primary_key=primary_key,
    load_type='upsert',
    dag=dag)

start >> facebook_ads >> s3_to_redshift
# 2. Performs a data quality check on the Trips table in RedShift
# 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#    a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#       skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

#
# The following code loads trips data from S3 to RedShift, using the s3_key
# "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
# and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

#
# Data quality check on the Trips table
#
check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

#
# We use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column is
# `tripduration` and the groupby_column is `bikeid`
create_mode_dim_table = PostgresOperator(task_id="create_mode_dim_table",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql_statements.CREATE_MODE_SQL)

# Create Visa Dimension Table
create_visa_dim_table = PostgresOperator(task_id="create_visa_dim_table",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql_statements.CREATE_VISA_SQL)

# Load immigration data to immigration_fact table
copy_immigration_task = S3ToRedshiftOperator(
    task_id="load_immigration_from_s3_to_redshift",
    dag=dag,
    table="immigration_fact",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=bucket,
    s3_key="immigration-data/cleanjob.csv")

# Load happiness data to happiness table
copy_happiness_task = S3ToRedshiftOperator(
    task_id="load_happiness_from_s3_to_redshift",
    dag=dag,
    table="happiness",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=bucket,
    s3_key="happiness-data/2016.csv")

# Load country data to Country table
    task_id='get_google_analytics_data',
    google_analytics_conn_id=GOOGLE_ANALYTICS_CONN_ID,
    view_id=view_id,
    since=SINCE,
    until=UNTIL,
    sampling_level=SAMPLING_LEVEL,
    dimensions=DIMENSIONS,
    metrics=METRICS,
    page_size=PAGE_SIZE,
    include_empty_rows=INCLUDE_EMPTY_ROWS,
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY)

redshift = S3ToRedshiftOperator(
    task_id='sink_to_redshift',
    redshift_conn_id=REDSHIFT_CONN_ID,
    redshift_schema=REDSHIFT_SCHEMA,
    table='report_{}'.format(view_id),
    s3_conn_id=S3_CONN_ID,
    s3_bucket=S3_BUCKET,
    s3_key=S3_KEY,
    origin_schema=google_analytics_reporting_schema,
    schema_location='local',
    copy_params=COPY_PARAMS,
    load_type='upsert',
    primary_key=PRIMARY_KEY,
    incremental_key=INCREMENTAL_KEY)

d >> g >> redshift
    s3_key=S3_KEY,
    replace=True)

flatten_object = PythonOperator(
    task_id='flatten_{0}'.format(collection['name']),
    python_callable=flatten_py,
    templates_dict={
        's3_key': S3_KEY,
        's3_conn': S3_CONN,
        's3_bucket': S3_BUCKET,
        'collection_name': collection['name'],
        'flattened_key': FLATTENED_KEY,
        'origin_schema': collection['schema']
    },
    provide_context=True)

redshift = S3ToRedshiftOperator(
    task_id='{0}_to_redshift'.format(collection['name']),
    s3_conn_id=S3_CONN,
    s3_bucket=S3_BUCKET,
    s3_key=FLATTENED_KEY,
    redshift_conn_id=REDSHIFT_CONN_ID,
    redshift_schema=REDSHIFT_SCHEMA,
    origin_schema=collection['schema'],
    redshift_table=collection['name'],
    primary_key=collection.get('primary_key', None),
    incremental_key=collection.get('incremental_key', None),
    load_type=collection['load_type'])

kick_off_dag >> mongo >> flatten_object >> redshift