def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(task_id=f"create_{table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    # data quality check on the freshly loaded table
    check_task = HasRowsOperator(task_id=f"check_{table}_has_rows",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    # task ordering: create the table, copy from S3, then check row counts
    create_task >> copy_task >> check_task

    return dag
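A minimal usage sketch, assuming Airflow 1.x import paths; the parent dag_id, task_id, DDL string, and dates below are illustrative rather than taken from the original snippet. It shows how this factory is typically attached to a parent DAG through a SubDagOperator:

import datetime
from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

parent_dag = DAG("trips_pipeline", start_date=datetime.datetime(2018, 1, 1))

# The subdag's dag_id is built inside the factory as "trips_pipeline.load_trips_subdag",
# so parent_dag_name/task_id must match the parent dag_id and this operator's task_id.
load_trips_subdag = SubDagOperator(
    task_id="load_trips_subdag",
    dag=parent_dag,
    subdag=get_s3_to_redshift_dag(
        parent_dag_name="trips_pipeline",
        task_id="load_trips_subdag",
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        create_sql_stmt="CREATE TABLE IF NOT EXISTS trips (trip_id INTEGER);",  # placeholder DDL
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=datetime.datetime(2018, 1, 1),
    ),
)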
Example #2
default_args = {
    'owner': 'sandeep',
    'start_date': datetime.utcnow() - timedelta(hours=5),
    'depends_on_past': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('create_tables_dag',
          default_args=default_args,
          description='Create tables in Redshift using Airflow',
          schedule_interval=None,
          max_active_runs=1,
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_all_tables = PostgresOperator(
    task_id="create_all_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql=create_all_tables
)

end_operator = DummyOperator(task_id='Stop_execution',  dag=dag)

# task dependencies
start_operator >> create_all_tables
create_all_tables >> end_operator
Example #3
default_args = {
    'owner': 'udacity',
    'start_date': datetime.now()
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow'        
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_tables_task = PostgresOperator(
  task_id="create_tables",
  dag=dag,
  sql='create_tables.sql',
  postgres_conn_id="redshift"
)

# Taking a reference from Project 3 - Cloud Data Warehouses

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id = "redshift",
    aws_conn_id = "aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    s3_json_path = "s3://udacity-dend/log_json_path.json",
    table = "staging_events",
    file_type = "json"
Example #4
from airflow.operators import PostgresOperator
from airflow.operators.sensors import TimeDeltaSensor
from helpers import SqlQueries

default_args = {
    'owner': 'gabriel',
    'start_date': datetime(2009, 12, 31, 23, 59, 59),
    'end_date': datetime(2012, 12, 31, 23, 59, 59),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=300),
    'catchup': True
}

dag = DAG('recreate_bi_tables',
          default_args=default_args,
          description='Recreate the business intelligence tables',
          schedule_interval=None,
          max_active_runs=1)

dummy_wait = TimeDeltaSensor(task_id='dummy_wait',
                             dag=dag,
                             delta=timedelta(seconds=1))

recreate_bi_tables = PostgresOperator(task_id="recreate_bi_tables_task",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=SqlQueries.recreate_bi_tables)

dummy_wait >> recreate_bi_tables
Example #5
)

del_suc_task = BashOperator(
    task_id="delete_success_data",
    bash_command='rm -r ' + home_dir +
    '/git_{{ (execution_date - macros.timedelta(days=2)).strftime("%Y-%m-%d") }}.parquet/_SUCCESS',
    dag=dag,
)

upload_to_S3_task = PythonOperator(task_id='upload_parquet_to_S3',
                                   python_callable=upload_file_to_S3_with_hook,
                                   dag=dag,
                                   provide_context=True)

create_main_tables = PostgresOperator(task_id="create_main_tables",
                                      dag=dag,
                                      postgres_conn_id=config.get(
                                          "REDSHIFT", "CONNECTION"),
                                      sql="create_main_tables.sql")

create_stage_tables = PostgresOperator(task_id="create_stage_tables",
                                       dag=dag,
                                       postgres_conn_id=config.get(
                                           "REDSHIFT", "CONNECTION"),
                                       sql="create_stage_tables.sql")

stage_events_dict_to_redshift = StageToRedshiftOperator(
    task_id='staging_events_dict_copy',
    redshift_conn_id=config.get("REDSHIFT", "CONNECTION"),
    aws_credentials_id=config.get("AWS", "CREDENTIALS"),
    table='events_dict_staging',
    s3_bucket=config.get("S3", "BUCKET"),
Example #6
default_args = {
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('sustainify_fact_dag',
          default_args=default_args,
          description='Load and transform fact data in Redshift with Airflow',
          schedule_interval='@monthly')

start_operator = DummyOperator(task_id='Begin_Execution', dag=dag)

create_fact_table = PostgresOperator(
    task_id='Create_Fact_Table',
    dag=dag,
    sql=SqlQueries.create_fact_table.format(table_name=FACT_TABLE),
    postgres_conn_id='redshift',
)

create_staging_table = PostgresOperator(
    task_id='Create_Staging_Table',
    dag=dag,
    sql=SqlQueries.create_fact_table.format(table_name=STAGING_TABLE),
    postgres_conn_id='redshift',
)

drop_staging_table = PostgresOperator(
    task_id='Drop_Staging_Table',
    dag=dag,
    sql=SqlQueries.drop_fact_table.format(table_name=STAGING_TABLE),
    postgres_conn_id='redshift',
Example #7
def make_task_group(dagname, name, path, pdag, trigger_rule):
    args = {
        'owner':
        'deploy',
        'start_date':
        datetime.strptime(
            (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d'),
            '%Y-%m-%d')
    }
    dag = DAG(dag_id=dagname,
              default_args=args,
              schedule_interval=timedelta(hours=24))
    sdOp = SubDagOperator(task_id=name,
                          subdag=dag,
                          dag=pdag,
                          trigger_rule=trigger_rule)
    start = DummyOperator(task_id=name + '_start', dag=dag)
    end = DummyOperator(task_id=name + '_end', dag=dag)
    task_dict = {}
    task_dict[name + '_start'] = start
    task_dict[name + '_end'] = end

    config_str = open(path, 'r').read()
    config_arr = config_str.split(
        '[dependency]'
    )  # "a -> b" means data flows from a to b; not valid INI format, but easier to read (see the example config after this function)
    config_fp = StringIO.StringIO(config_arr[0])
    config = ConfigParser.RawConfigParser()
    config.readfp(config_fp)

    sections = config.sections()
    #print sections
    for section in sections:
        #print section
        if section != 'dependency':
            options = config.options(section)
            #print options
            if 'type' not in options or 'cmd' not in options:
                continue
            operator_type = config.get(section, 'type').strip()
            operator_cmd = config.get(section, 'cmd').strip()
            if operator_type == 'PostgresOperator':
                task = PostgresOperator(task_id=section.strip(),
                                        depends_on_past=False,
                                        postgres_conn_id='postgres_sha2dw03',
                                        sql=operator_cmd,
                                        dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            elif operator_type == 'BashOperator':
                task = BashOperator(task_id=section.strip(),
                                    depends_on_past=False,
                                    bash_command=operator_cmd,
                                    dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            else:
                print "Error: currently not support %s operator type" % operator_type

    if len(config_arr) == 1:
        return (start, end, dag, sdOp, task_dict)

    for line in config_arr[1].split('\n'):
        arr = line.split('->')
        if len(arr) != 2:
            continue
        left_side = arr[0].strip()
        right_side = arr[1].strip()
        if left_side in task_dict.keys() and right_side in task_dict.keys():
            if end in task_dict[left_side].downstream_list:
                task_dict[left_side].downstream_list.remove(end)
            task_dict[left_side].set_downstream(task_dict[right_side])
            if start in task_dict[right_side].upstream_list:
                task_dict[right_side].upstream_list.remove(start)
            if task_dict[right_side] in start.downstream_list:
                start.downstream_list.remove(task_dict[right_side])
            if task_dict[left_side] in end.upstream_list:
                end.upstream_list.remove(task_dict[left_side])
    return (start, end, dag, sdOp, task_dict)
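For reference, a hypothetical config file of the shape make_task_group() expects: INI-style sections carrying type and cmd options, followed by a [dependency] block of "a -> b" lines. The section names and commands below are made up:

# [load_daily_stats]
# type = PostgresOperator
# cmd = INSERT INTO daily_stats SELECT * FROM staging_daily_stats;
#
# [archive_logs]
# type = BashOperator
# cmd = /opt/jobs/archive_logs.sh
#
# [dependency]
# load_daily_stats -> archive_logs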
    'retries': 3,
    'retry_delay': 300,
    'email_on_retry': False
}

dag = DAG('s3_to_redshift_dag',
          default_args=default_args,
          description='Extract Load and Transform data from S3 to Redshift',
          schedule_interval='@hourly',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create)
    'retry_delay': timedelta(minutes=1),
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('sustainify_dimensions_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@once'
        )

start_operator = DummyOperator(task_id='Begin_Execution',  dag=dag)

create_sql_tables = PostgresOperator(
    task_id='Create_Tables',
    dag=dag,
    sql=SqlQueries.create_tables,
    postgres_conn_id='redshift',
)

transport_to_redshift = StageToRedshiftOperator(
    task_id='Transport_to_Redshift',
    dag=dag,
    table='public.transport',
    redshift_conn_id='redshift',
    s3_bucket='xwoe-udacity',
    s3_key='deng_capstone/tables/transport.parquet',
    aws_credentials_id='aws_credentials',
    aws_iam=Variable.get('arn'),
    file_format="FORMAT AS PARQUET"
)
default_args = {
    'owner': 'Gaurav',
    'start_date': datetime(2019, 1, 12),
    'depends_on_past': False
}

with DAG(
        dag_id='sparkify_data_pipeline',
        default_args=default_args,
        schedule_interval='0 0 * * *',
        description='Load and transform data in Redshift with Airflow') as dag:

    start_operator = DummyOperator(task_id='Begin_execution')

    create_tables = PostgresOperator(task_id="create_tables",
                                     postgres_conn_id="redshift",
                                     sql="create_tables.sql")

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id='Stage_events',
        redshift_conn_id='redshift',
        aws_credentials_id='aws_credentials',
        table='staging_events',
        s3_bucket='udacity-dend',
        s3_key='log_data',
        copy_json_option='s3://udacity-dend/log_json_path.json',
        region='us-west-2')

    stage_songs_to_redshift = StageToRedshiftOperator(
        task_id='Stage_songs',
        redshift_conn_id='redshift',
)

from helpers import SqlQueries


dag = DAG(
    "reset_tables_dag",
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    schedule_interval=None,
    max_active_runs=1
)


drop_staging_events = PostgresOperator(
    task_id="staging_events_table_drop",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.staging_events_table_drop
)

drop_staging_songs = PostgresOperator(
    task_id="staging_songs_table_drop",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.staging_songs_table_drop
)

drop_user = PostgresOperator(
    task_id="user_table_drop",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.user_table_drop
import datetime
import sql_statements
from airflow import DAG
from airflow.operators import (PostgresOperator, S3ToRedshiftOperator,
                               HasRowsOperator, FactsCalculatorOperator)

customDag = DAG(dag_id='lesson3Exercise4',
                start_date=datetime.datetime(2020, 6, 21),
                max_active_runs=1)

# create trips table
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=customDag)

# load data from S3 to Resdhift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(task_id='checkDataQuality',
Example #13
dag = DAG(
    'sparkify_etl_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    max_active_runs=1,
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create destination SQL tables
# Dimension tables
create_artists_task = PostgresOperator(
    dag=dag,
    postgres_conn_id="redshift",
    sql=CreateTablesSqlQueries.create_artists,
    task_id="create_artists")

create_songplays_task = PostgresOperator(
    dag=dag,
    postgres_conn_id="redshift",
    sql=CreateTablesSqlQueries.create_songplays,
    task_id="create_songplays")

create_songs_task = PostgresOperator(dag=dag,
                                     postgres_conn_id="redshift",
                                     sql=CreateTablesSqlQueries.create_songs,
                                     task_id="create_songs")

create_time_task = PostgresOperator(dag=dag,
my_aws_credentials = "aws_credentials"

#
# TODO: Create a DAG which performs the following functions:
#
#       1. Loads Trip data from S3 to RedShift
#       2. Performs a data quality check on the Trips table in RedShift
#       3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#           a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#              skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id=my_redshift_conn,
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

#
# TODO: Load trips data from S3 to RedShift. Use the s3_key
#       "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    aws_credentials_id=my_aws_credentials,
    table="trips",
    s3_bucket="udacity-dend",
    'email_on_retry': False
}

dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    catchup=False  # Perform scheduler catchup (or only run latest)? Defaults to True
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id="create_tables",
                                 dag=dag,
                                 postgres_conn_id=redshift_conn_id,
                                 sql=sql_statements.CREATE_TABLES)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id=redshift_conn_id,
    aws_credentials_id=aws_credentials_id,
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    table=log_stage_table,
    json_format=log_json_format,
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
def redshift_table_ddl_plus_bulk_load(parent_dag_id, parent_task_id,
                                      redshift_conn_id, aws_credentials_id,
                                      table, create_table_statement, s3_bucket,
                                      s3_key, *args, **kwargs):
    """
        This Python Factory Function returns an Airflow DAG Class instance to
    be used as a SubDag within the (outer) DAG that calls it.

    The DAG in turn executes three Task within a Redshift database instance:
        - execute CREATE TABLE (DDL) statement through PostgresOperator;
        - execute COPY (bulk load) to pull data from S3 into the newly created
    table through custom S3ToRedshiftOperator
        - perform data quality check through custom HasRowsOperator

    Args:
        parent_dag_id (str): 

    """

    # instantiate the DAG
    subDagInstance = DAG(
        #----------------------------------------------------------------------
        #   ATTENTION! "parent_dag_id.parent_task_id" IS MANDATORY
        #
        #   By Airflow convention, a subdag's "dag_id" must combine:
        #
        #   1. the "dag_id" of the outer DAG instance calling the SubDagOperator
        #      (which in turn calls this factory function), and
        #
        #   2. the "task_id" assigned to that SubDagOperator,
        #
        #   separated by a dot.
        #----------------------------------------------------------------------
        dag_id=f"{parent_dag_id}.{parent_task_id}",
        # make sure keyword arguments are also passed through
        **kwargs)

    # NOTICE how Task instances are created in a SubDag just as they would in
    # a "normal" DAG.
    createTable = PostgresOperator(task_id=f'create_{table}_table',
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_table_statement,
                                   dag=subDagInstance)

    bulkLoadTable = S3ToRedshiftOperator(task_id=f'bulk_load_{table}_table',
                                         redshift_conn_id=redshift_conn_id,
                                         aws_credentials_id=aws_credentials_id,
                                         table=table,
                                         s3_bucket=s3_bucket,
                                         s3_key=s3_key,
                                         dag=subDagInstance)

    checkDataQuality = HasRowsOperator(task_id=f'check_{table}_table',
                                       redshift_conn_id=redshift_conn_id,
                                       table=table,
                                       dag=subDagInstance)

    # Define Task dependencies
    createTable >> bulkLoadTable >> checkDataQuality

    return subDagInstance
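A short, illustrative call site (Airflow 1.x import path assumed; the outer dag_id, task_id, and dates are placeholders) showing how the mandatory "parent_dag_id.parent_task_id" pairing plays out when the factory is called from the outer DAG:

import datetime
import sql_statements  # same helper module the other exercises in this listing import
from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

outer_dag = DAG("bulk_load_pipeline", start_date=datetime.datetime(2018, 1, 1))

load_trips = SubDagOperator(
    task_id="load_trips",  # combined with the outer dag_id this yields "bulk_load_pipeline.load_trips"
    dag=outer_dag,
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id="bulk_load_pipeline",
        parent_task_id="load_trips",
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        create_table_statement=sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=datetime.datetime(2018, 1, 1),
    ),
)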
Example #17
# Credential and Redshift connection parameters
connection_info = {
        'conn_id': "redshift",
        'aws_credentials_id': "aws_credentials",                
        's3_bucket': "udacity-dend",
        'file_format': f"s3://udacity-dend/log_json_path.json"
}

# Operator that starts the pipeline
start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

# Operator that creates the tables
create_tables_task = PostgresOperator(
    task_id="Create_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql='create_tables.sql'
)


# Operator that copies log data from S3 into Redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    conn_id=connection_info["conn_id"],
    aws_credentials_id=connection_info["aws_credentials_id"],
    table="staging_events",                 
    s3_bucket=connection_info["s3_bucket"],
    s3_key="log_data/2018/11/",
    file_format=connection_info["file_format"]  
Example #18
    dag=dag,
)

# Spark Job to Clean Immigration Data
data_cleaning = SSHOperator(
    task_id='data_cleaning_spark_job',
    dag=dag,
    ssh_conn_id='my_ssh_connection',
    command=
    '/usr/bin/spark-submit --jars "spark-sas7bdat-2.1.0-s_2.11.jar,parso-2.0.10.jar" --master yarn clean_job.py',
)

# Create Immigration Fact Table
create_immigration_table = PostgresOperator(
    task_id="create_immigration_fact_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_IMMIGRATION_FACT_SQL)

# Create Happiness Table
create_happiness_table = PostgresOperator(
    task_id="create_happiness_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_HAPPINESS_SQL)

# Create Country Table
create_country_table = PostgresOperator(task_id="create_country_table",
                                        dag=dag,
                                        postgres_conn_id="redshift",
                                        sql=sql_statements.CREATE_COUNTRY_SQL)
# DAG definition
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1
)

# start task definition
start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

# create table task
create_table = PostgresOperator(
    task_id="create_table",
    postgres_conn_id="redshift",
    sql=create_tables.drop_and_create_tables,
    dag=dag,
)

# Task stage events to redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    copy_json_option='s3://udacity-dend/log_json_path.json',
    region='us-west-2',
    dag=dag
)
main_dag = DAG(
    'Project_5',
    description='Load and transform data in Redshift with Airflow',
    start_date=datetime(2018, 11, 1, 0, 0, 0, 0),
    end_date=datetime(2018, 11, 30, 0, 0, 0, 0),
    schedule_interval="@monthly",
    default_args=default_args,
    #catchup=False
)

start_operator = DummyOperator(task_id='Begin_Execution', dag=main_dag)

create_events_in_s3_task = PostgresOperator(
    task_id="Create_Events",
    dag=main_dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_events_table)

copy_events_to_s3_task = StageToRedshiftOperator(
    task_id="Stage_Events",
    dag=main_dag,
    table_name="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data/{execution_date.year}/{execution_date.month}/",
    s3_format="json",
    json_path="s3://udacity-dend/log_json_path.json")

create_songs_in_s3_task = PostgresOperator(
Example #21
        }
    },
    'query': "SELECT * FROM table_four"
}]
with DAG(dag_id='adb_pipeline',
         default_args=args,
         start_date=datetime(2019, 1, 1),
         schedule_interval='30 4 * * *',
         catchup=False) as dag:

    t1 = DummyOperator(task_id='kick_off_dag')

    t2 = S3KeySensor(task_id='check_for_file',
                     bucket_key='globetelecom/copy_*',
                     poke_interval=45,
                     timeout=600,
                     wildcard_match=True,
                     bucket_name=BUCKET,
                     aws_conn_id=S3_CONN_ID)

    for job in job_info:
        spark = DatabricksRunNowOperator(task_id=job['job_id'],
                                         job_id=job['job_id'],
                                         json=job['config'])

        query = PostgresOperator(task_id='post_{0}_query'.format(
            job['job_id']),
                                 sql=job['query'],
                                 postgres_conn_id='prod_postgres')
        t1 >> t2 >> spark >> query
Example #22
    task_id='Copy_immigration_dimensions',
    dag=dag,
    redshift_conn_id='redshift',
    iam_role=Variable.get('iam_role'),
    dimensions=[
        'country_codes', 'port_codes', 'entry_channel_codes', 'state_codes',
        'trip_reason_codes'
    ],
    truncate=True,
    input_s3_bucket='ascfraguas-udacity-deng-capstone',
    input_s3_key='staging/immigration-dimensions')

#### -------> RUN TEMPERATURES SUMMARY

run_temperatures_summary = PostgresOperator(task_id="Run_temperatures_summary",
                                            dag=dag,
                                            postgres_conn_id="redshift",
                                            sql=SqlQueries.run_temps_summary)

#### -------> RUN DATA QUALITY CHECKS

run_table_quality_checks = RunQualityCheckOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    test_tables={
        'immigration.us_entries', 'immigration.country_codes',
        'immigration.port_codes', 'immigration.entry_channel_codes',
        'immigration.state_codes', 'immigration.trip_reason_codes',
        'temperature.full_temperature_data', 'temperature.temp_summary'
    },
    dq_checks=[{
        - demonstrate the process to create custom Airflow plugins
        - generalize repetitive code and Tasks by leveraging custom Operators
----------------------------------------------------------------------------"""

bulkLoadDag = DAG(
    dag_id='lesson3Exercise1',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    # set a date beyond which this DAG won't run
    end_date=datetime.datetime(2018, 2, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1)

# create Tasks by instantiating Operator Classes
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=bulkLoadDag)

# IMPORTANT:
# Because "s3_key" is listed in the operator's "template_fields", it is
# rendered with Jinja and can use template variables such as {{ ds }},
# {{ execution_date }}, and so on (see the rendered-key example further below).
loadTripsData = S3ToRedshiftOperator(
    task_id="loadTripsData",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="trips",
    truncate=False,
    s3_bucket="udacity-dend",
    s3_key=
    "data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
#     if len(records) < 1 or len(records[0]) < 1:
#         raise ValueError(f"Data quality check failed. {table} returned no results")
#     num_records = records[0][0]
#     if num_records < 1:
#         raise ValueError(f"Data quality check failed. {table} contained 0 rows")
#     logging.info(f"Data quality on table {table} check passed with {records[0][0]} records")

dag = DAG("lesson3.exercise1",
          start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
          end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
          schedule_interval="@monthly",
          max_active_runs=1)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key=
    "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)
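As a concrete illustration, and assuming the custom S3ToRedshiftOperator substitutes the task's execution_date into the templated s3_key (whether through template_fields/Jinja or a plain str.format call), a run for March 2018 would resolve the key above roughly as follows:

# execution_date = 2018-03-01
# s3_key -> "divvy/partitioned/2018/3/divvy_trips.csv"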

#
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('udacity_airflow_project',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=3)

# start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  sql=['create_table.sql'],
                                  postgres_conn_id='redshift',
                                  autocommit=True)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    s3_bucket="udacity-dend",
    s3_key="log_data",
    table="public.staging_events",
    #     create_stmt=SqlQueries.create_table_staging_events,
    json="s3://udacity-dend/log_json_path.json")
Example #26
from helpers import SqlQueries

default_args = {'owner': 'Abderrazzak'}

drop_table_sql = "DROP TABLE IF EXISTS public.{};"

dag = DAG('create_dwh_spakify_tables',
          default_args=default_args,
          description='Creates Sparkify Redshift tables',
          start_date=datetime(2020, 6, 11),
          schedule_interval=None)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

drop_songplays = PostgresOperator(task_id="drop_songplays_table",
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql=drop_table_sql.format("songplays"))

create_songplays = PostgresOperator(
    task_id="create_songplays_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=""" CREATE TABLE IF NOT EXISTS songplays (
        songplay_id int not null IDENTITY(0,1) PRIMARY KEY, 
        start_time timestamp NOT NULL , 
        user_id int  , 
        song_id varchar NOT NULL, 
        artist_id varchar NOT NULL, 
        session_id int, 
        location varchar, 
        user_agent varchar
Example #27
default_args = {
    #'end_date': datetime(2018, 11, 30)
}

dag = DAG('sparkify-airflow',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
          #,catchup=True
          #,max_active_runs=1
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

## create all tables in redshift
create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 postgres_conn_id='redshift',
                                 sql='create_tables.sql')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    clear_table=False,
    s3_bucket='udacity-dend',
    s3_key=
    'log-data/{execution_date.year}/{execution_date.month}/{ds}-events.json',
    #s3_key='log-data/{execution_date.year}/{execution_date.month}',
    json_path='log_json_path.json')
Example #28
default_args = {
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          catchup=False)

start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql='create_tables.sql')
# If tables are already created
# start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region="us-west-2",
    extra_params="FORMAT AS JSON 's3://udacity-dend/log_json_path.json'")
    'start_date': datetime(2019, 8, 21),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=300),
    'catchup': False
}

dag = DAG('dice_com_jobs_dag',
          default_args=default_args,
          description='Load the jobs dataset and insert into Redshift',
          schedule_interval='@once'
        )

recreate_staging_dice_com_jobs_table = PostgresOperator(
    task_id="recreate_staging_dice_com_jobs_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.recreate_staging_dice_com_jobs_table
)

stage_dice_com_jobs = StageCsvToRedshiftOperator(
    task_id='stage_dice_com_jobs',
    dag=dag,
    table="staging_dice_com_jobs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="social-wiki-datalake",
    s3_key="capstone/Dice_US_jobs.csv",
    extra_copy_parameters="DATEFORMAT AS 'MM/DD/YYYY' MAXERROR AS 6000"
)

check_staging_dice_com_jobs_table = DataQualityOperator(
Example #30
import sql_statements

#
# TODO: Replace the data quality checks with the HasRowsOperator
#
dag = DAG(
    "lesson3.exercise1",
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval="@monthly",
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
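A minimal sketch of how that TODO could be completed with the HasRowsOperator used elsewhere in this listing; the task name is illustrative and the plugin-style import mirrors the other exercises:

from airflow.operators import HasRowsOperator  # provided by the course plugins, as in the examples above

check_trips = HasRowsOperator(
    task_id="check_trips_has_rows",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips",
)

create_trips_table >> copy_trips_task >> check_trips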