def create_and_load_table_dag(parent_dag_name, task_id, redshift_conn_id,
                              create_sql, insert_sql, table, truncate, *args,
                              **kwargs):
    """Return a subDAG that creates `table` and then loads it as a dimension.

    The DAG id is '<parent_dag_name>.<task_id>', the naming convention
    SubDagOperator expects; remaining **kwargs are forwarded to DAG().
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # Create the target table if it does not already exist.
    create_table_task = CreateTableOperator(
        task_id=f'create_{table}_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql,
        table=table)

    # Insert into the dimension table, optionally truncating it first.
    load_dimension_task = LoadDimensionOperator(
        task_id=f'Load_{table}_dim_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        query=insert_sql,
        truncate=truncate)

    create_table_task >> load_dimension_task

    return dag
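For context, here is a minimal sketch of how a factory like this is typically attached to a parent DAG through SubDagOperator. The parent DAG name, task id, SQL strings, and default_args below are illustrative placeholders, not values from the original example.

from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

# Placeholder arguments purely for illustration.
default_args = {'owner': 'udacity', 'start_date': datetime(2019, 1, 1)}

parent_dag = DAG(
    'sparkify_parent_dag',
    default_args=default_args,
    schedule_interval='@hourly')

load_users_subdag = SubDagOperator(
    task_id='load_users_dim_table',
    dag=parent_dag,
    # The factory names its DAG '<parent_dag_name>.<task_id>', so the same two
    # values passed here must match the parent dag_id and this task_id.
    subdag=create_and_load_table_dag(
        'sparkify_parent_dag',
        'load_users_dim_table',
        redshift_conn_id='redshift',
        create_sql='CREATE TABLE IF NOT EXISTS users (userid int)',  # illustrative DDL
        insert_sql='SELECT userid FROM staging_events',  # illustrative query
        table='users',
        truncate=True,
        default_args=default_args,
        schedule_interval='@hourly'))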
Example #2
default_args = {
    'catchup': True,
    'end_date': datetime(2018, 11, 2)
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

### Start operator
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

### Create tables only if not exist
create_staging_events = CreateTableOperator(
    task_id='create_staging_events',
    dag=dag,
    redshift_conn_id='redshift',
    table_name='staging_events',
    sql_command=SqlQueries.create_staging_events)

create_staging_songs = CreateTableOperator(
    task_id='create_staging_songs',
    dag=dag,
    redshift_conn_id='redshift',
    table_name='staging_songs',
    sql_command=SqlQueries.create_staging_songs)

create_songplays = CreateTableOperator(task_id='create_songplays',
                                       dag=dag,
                                       redshift_conn_id='redshift',
                                       table_name='songplays',
                                       sql_command=SqlQueries.create_songplays)
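The snippet above ends before any dependencies are declared; a typical wiring for these three tasks, shown purely as an illustration rather than the original author's ordering, would be:

# Illustrative only: run the three CREATE TABLE tasks in parallel after the
# start marker.
start_operator >> [create_staging_events, create_staging_songs, create_songplays]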
Example #3
default_args = {
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False
}

dag = DAG('udac_sparkify_airflow',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_table = CreateTableOperator(
    task_id = "Create_tables",
    redshift_conn_id = "redshift",
    dag = dag
)
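This variant receives only a connection id, which suggests the project-local CreateTableOperator carries its own DDL. None of these plugin operators are shown on this page, so the following is only a rough sketch of what such an operator can look like; the constructor arguments and SQL handling are assumptions, not the original implementation.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class CreateTableOperator(BaseOperator):
    """Run a list of CREATE TABLE IF NOT EXISTS statements on Redshift (sketch)."""

    ui_color = '#89DA59'

    @apply_defaults
    def __init__(self, redshift_conn_id='redshift', create_sql_statements=None,
                 *args, **kwargs):
        super(CreateTableOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        # Assumed: the DDL arrives as a list of statements; real projects often
        # read it from a bundled .sql file instead.
        self.create_sql_statements = create_sql_statements or []

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        for statement in self.create_sql_statements:
            self.log.info('Running DDL statement against Redshift')
            redshift.run(statement)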

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    redshift_conn_id = "redshift",
    aws_credentials_id = "aws_credentials",
    table= "event",
    s3_bucket=default_args["s3_bucket"],
    format="JSON",
    dag=dag
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
Example #4
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Extract data from the SAS files and save it to S3 in CSV format.
# redshift_conn_id is used only when loading data into Redshift.
generate_csv_operator = GenerateCsvOperator(
    task_id="Extract_sas",
    dag=dag,
    aws_credential_id="aws_credentials",
    s3_bucket=s3_bucket,
    s3_sas_key=s3_sas_key)
create_tables_operator = CreateTableOperator(task_id="Create_All_Tables",
                                             dag=dag,
                                             redshift_conn_id="redshift")

# using subDAG?
# immigrations us_cities_demographics airport i94visas i94port i94mode i94cit i94addr
# airport_codes_csv.csv
load_airport_operator = CopyInsertTableOperator(
    task_id="Load_airport_Tables",
    dag=dag,
    table_name="airport",
    aws_credential_id="aws_credentials",
    redshift_conn_id="redshift",
    s3_bucket=s3_bucket,
    s3_csv_key="airport_codes_csv.csv")
load_immigration_operator = CopyInsertTableOperator(
    task_id="Load_immigration_Tables",
Example #5
dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    ##https://medium.com/intage-analytics/airflow-trick-to-find-the-exact-start-date-via-cron-expression-23b5351007b
    schedule_interval='0 * * * *',  ## the DAG runs hourly
    max_active_runs=1,
    catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = CreateTableOperator(
    task_id='create_tables',
    postgres_conn_id='redshift',
    provide_context=True,
    dag=dag,
    sql_statment1=CREATE_ALL_TABLES_SQL.CREATE_TABLE_artists,
    sql_statment2=CREATE_ALL_TABLES_SQL.CREATE_TABLE_songplays,
    sql_statment3=CREATE_ALL_TABLES_SQL.CREATE_TABLE_songs,
    sql_statment4=CREATE_ALL_TABLES_SQL.CREATE_TABLE_staging_events,
    sql_statment5=CREATE_ALL_TABLES_SQL.CREATE_TABLE_time)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend/',
    s3_key='log_data/{execution_date.year}/{execution_date.month}',
    json_path='s3://udacity-dend/log_json_path.json',
Example #6
default_args = {
    'email_on_retry': False
}

# create DAG udac_pipeline_airflow
dag = DAG('udac_pipeline_airflow',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1)

# dummy task
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# create all tables: staging tables, fact table and dimension tables
create_tables = CreateTableOperator(task_id='Create_tables',
                                    dag=dag,
                                    redshift_conn_id="redshift")

# load log data from s3 to staging_events table in redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    jsonpaths_file="s3://udacity-dend/log_json_path.json")
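For reference, an operator configured like this typically renders a Redshift COPY along the following lines. The template is assembled from the bucket, key, and jsonpaths values above; the credentials are placeholders and the REGION clause follows the value spelled out in a later example on this page.

# Illustrative template of the COPY a StageToRedshiftOperator would issue for
# JSON log data described by a jsonpaths file.
STAGE_EVENTS_COPY_SQL = """
    COPY staging_events
    FROM 's3://udacity-dend/log_data'
    ACCESS_KEY_ID '{access_key}'
    SECRET_ACCESS_KEY '{secret_key}'
    FORMAT AS JSON 's3://udacity-dend/log_json_path.json'
    REGION 'us-west-2'
"""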

# load log data from s3 to staging_songs table in redshift
stage_songs_to_redshift = StageToRedshiftOperator(
Example #7
default_args = {
    'catchup': True
}

dag_name = 'udac_airflow_dag' 
dag = DAG(dag_name,
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs = 1          
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_tables_in_redshift = CreateTableOperator(
    task_id = 'create_tables_in_redshift',
    redshift_conn_id = 'redshift',
    dag = dag
)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    table_name="staging_events",
    s3_bucket = s3_bucket,
    s3_key = log_s3_key,
    file_format="JSON",
    log_json_file = log_json_file,
    redshift_conn_id = "redshift",
    aws_credential_id="aws_credentials",
    dag=dag,
    provide_context=True
)
Example #8
          catchup=False)

extract_sas_data_operator = ExtractionFromSASOperator(
    task_id='Extract_data_from_SAS_save_as_csv_in_s3bucket',
    dag=dag,
    s3_bucket='uda-capstone-data',
    s3_load_prefix='csv_data',
    s3_save_prefix='csv_data',
    file_name='I94_SAS_Labels_Descriptions.SAS')

create_immigration_table = CreateTableOperator(
    task_id='Create_immigration_table',
    dag=dag,
    table='immigration',
    create_sql_stmt=SqlQueries.immigrant_table_create,
    drop_sql_stmt=SqlQueries.drop_table)

load_immigration_table = CopyTableOperator(
    task_id='Load_immigration_table',
    dag=dag,
    table='immigration',
    schema='public',
    s3_bucket='uda-capstone-data',
    s3_load_prefix='sas_data',
    iam_role=Variable.get("IAM_ROLE"))
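CopyTableOperator is handed an IAM role from an Airflow Variable instead of key-pair credentials, so the COPY it presumably issues uses the IAM_ROLE form. The template below is only a sketch built from the bucket and prefix above; the role ARN is a placeholder, and the FORMAT clause is omitted because the layout of the sas_data files is not shown.

# Illustrative IAM-role form of the Redshift COPY for the values above.
IMMIGRATION_COPY_SQL = """
    COPY public.immigration
    FROM 's3://uda-capstone-data/sas_data'
    IAM_ROLE '{iam_role}'
"""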

data_quality_check_on_immigration = CheckQualityOperator(
Example #9
    cur.execute(code)
    result = cur.fetchall()[0][0]
    if result == failvalue:
        print(f"Data quality check failed. {table} has no rows.")
    else:
        print(f"Data quality check passed. {table} has rows")


#Start task definition
start = PythonOperator(task_id="start", dag=dag, python_callable=start)

# Definition of tasks necessary for creating the dimension and fact tables

create_dim_date = CreateTableOperator(task_id="create_dim_date",
                                      dag=dag,
                                      sql=sql.create_dim_date,
                                      table="v_dim_date",
                                      cur=cur)

create_dim_vehicletype = CreateTableOperator(task_id="create_dim_vehicletype",
                                             dag=dag,
                                             sql=sql.create_dim_vehicle_type,
                                             table="dim_vehicletype",
                                             cur=cur)

create_dim_driver = CreateTableOperator(task_id="create_dim_driver",
                                        dag=dag,
                                        sql=sql.create_dim_driver,
                                        table="dim_driver",
                                        cur=cur)
Example #10
default_args = {
    'start_date': datetime(2020, 5, 11),
    'end_date': datetime(2020, 12, 30),
}

#create DAG
dag = DAG('sparkify_etl_dag',
          description='Performs ETL operations from S3 to Redshift',
          max_active_runs=3,
          start_date=datetime(2020, 6, 10, 0, 0, 0, 0))

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_task = CreateTableOperator(task_id='create_tables',
                                         dag=dag,
                                         redshift_conn_id='redshift',
                                         tables=[
                                             "artists", "songplays", "songs",
                                             "staging_events", "staging_songs",
                                             "time", "users"
                                         ])

stage_events_task = StageTablesToRedshiftOperator(
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    region='us-west-2',
    file_format="s3://udacity-dend/log_json_path.json",
    provide_context=True,
    execution_date=None,
    task_id='staging_events_data',
Example #11
default_args = {
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False,
    'catchup': False
}

dag = DAG('sparkify_etl_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_in_redshift = CreateTableOperator(task_id="Create_tables",
                                                dag=dag,
                                                redshift_conn_id="redshift")

stage_events_to_redshift = StageToRedshiftOperator(
    task_id="Stage_events",
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    jsonpath="s3://udacity-dend/log_json_path.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
Example #12
connection_operator = DummyOperator(task_id='connection_operator', dag=dag)
finish_operator = DummyOperator(task_id='finish_execution', dag=dag)

fetch_api_bisq = FetchApiOperator(task_id="fetch_api_bisq",
                                  dag=dag,
                                  aws_con=aws_credentials,
                                  remote_provider="bisq",
                                  aws_bucket_name=s3_bucket)
fetch_api_paxful = FetchApiOperator(task_id="fetch_api_paxful",
                                    dag=dag,
                                    aws_con=aws_credentials,
                                    remote_provider="paxful",
                                    aws_bucket_name=s3_bucket)

create_table = CreateTableOperator(task_id="Create_table",
                                   dag=dag,
                                   conn_id="redshift",
                                   sql_query=SqlQueries.create_table)

stage_paxful_to_redshift = StageToRedshiftOperator(
    task_id='stage_paxful',
    dag=dag,
    table_name="staging_paxful",
    s3_bucket=s3_bucket,
    conn_id="redshift",
    remote_provider="paxful",
    aws_credential_id=aws_credentials,
    provide_context=True)

stage_bisq_to_redshift = StageToRedshiftOperator(
    task_id='stage_bisq',
    dag=dag,
Example #13
    description='Load and transform data in Redshift with Airflow',
    catchup=False,
    max_active_runs=1,
    schedule_interval='0 * * * *',
    start_date=datetime(2019, 9, 9),
)

dummy_operator = DummyOperator(task_id='started_pipeline', retries=3, dag=dag)

create_redshift = CreateRedshiftOperator(task_id='create_cluster',
                                         dag=dag,
                                         aws_credentials_id='aws_credentials')

create_table = CreateTableOperator(
    task_id='create_table',
    dag=dag,
    aws_credentials_id='aws_credentials',
    create_tables=SqlQueries.create_table_queries,
    drop_tables=SqlQueries.drop_table_queries)

load_csv_files = PythonOperator(task_id='load_csv_files',
                                python_callable=load_csv_data,
                                dag=dag,
                                provide_context=True)
load_sas_file = PythonOperator(task_id='clean_and_load_i94',
                               python_callable=load_sas_data,
                               dag=dag,
                               provide_context=True)

##
##clean_data_check = PythonOperator(task_id='clean_data_query', python_callable = clean_data, dag=dag, provide_context=True)