Example #1
)

local_file_sensor = FileSensor(
    task_id='check_for_local_file',
    filepath='/usr/local/airflow/data/sample.txt',
    dag=word_count_dag,
    timeout=30,
    poke_interval=10,
)

s3_file_sensor = S3KeySensor(
    task_id='check_for_s3_file',
    aws_conn_id='my_conn_S3',
    bucket_name='calculator-api',
    bucket_key='twitter-raw/eia-prod/input.txt',
    wildcard_match=True,
    timeout=30,
    poke_interval=10,
    dag=word_count_dag,
    trigger_rule=TriggerRule.ALL_FAILED,
)

s3_file_download = PythonOperator(
    task_id='download_text_file_from_s3',
    python_callable=download_s3_file,
    op_args=[
        'calculator-api', 'twitter-raw/eia-prod/input.txt',
        '/usr/local/airflow/data/sample.txt'
    ],
    dag=word_count_dag,
)
Example #2
default_args = {
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(dag_id='s3_sensor_123',
          schedule_interval='@once',
          default_args=default_args)


def download(key, bucket_name, local_file):
    hook = S3Hook(aws_conn_id='aws_default')
    data = hook.read_key(key, bucket_name)
    with open(local_file, 'w') as file:  # read_key returns a str, so open in text mode
        file.write(data)


bucket_key = 'driver-data/s3_sensor.csv'
bucket_name = 'wdt-datalake'
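# `local_file` is passed to the download task below but is not defined in this
# snippet; an assumed, purely illustrative value:
local_file = '/tmp/s3_sensor.csv'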

t0 = S3KeySensor(task_id='s3_sensor',
                 bucket_key=bucket_key,
                 bucket_name=bucket_name,
                 sla=timedelta(minutes=2),
                 dag=dag)

t1 = PythonOperator(task_id='download',
                    python_callable=download,
                    op_args=[bucket_key, bucket_name, local_file],
                    dag=dag)

t0 >> t1
Example #3
dag = DAG(
    DAG_NAME,
    description='Load data from Procon complaints from S3 to Redshift.',
    start_date=start_date,
    schedule_interval=datetime.timedelta(hours=1),
    catchup=False,
    max_active_runs=1,
)

start_operator = DummyOperator(task_id='begin_execution', dag=dag)

has_file_to_process = S3KeySensor(
    task_id='has_file_to_process',
    dag=dag,
    bucket_name=S3_BUCKET,
    bucket_key=f'{S3_KEY}/*.csv',
    wildcard_match=True,
    aws_conn_id=AWS_CREDENTIALS,
    timeout=31,
    poke_interval=30,
    soft_fail=True,
)

create_procon_stage_table = PostgresOperator(
    task_id='create_procon_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        procon_queries['drop_stage_table'],
        procon_queries['create_stage_table']
    ])
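# `procon_queries` is defined elsewhere in the original project. Purely to
# illustrate the shape the PostgresOperator above expects, it could be a plain
# dict of SQL strings (the table name and columns below are made up, and in the
# original file it would be defined above the operator):
procon_queries = {
    'drop_stage_table': 'DROP TABLE IF EXISTS procon_complaints_stage;',
    'create_stage_table': """
        CREATE TABLE procon_complaints_stage (
            complaint_id   VARCHAR(64),
            company_name   VARCHAR(256),
            complaint_date DATE,
            raw_payload    VARCHAR(MAX)
        );
    """,
}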
Example #4
    #     dag=dag
    #     )

    moment = datetime.now()
    b_name = Variable.get("s3_bucket")
    source = Variable.get("pinpoint")
    year = moment.year
    month = '%02d' % moment.month
    day = '%02d' % moment.day
    hr = moment.hour
    bucket_key_template = f'{source}/{year}/{month}/{day}/ypsource.json'

    get_new_json = S3KeySensor(task_id="get_new_json",
                               poke_interval=60 * 2,
                               timeout=60 * 60 * 3,
                               bucket_key=bucket_key_template,
                               bucket_name=b_name,
                               wildcard_match=False,
                               aws_conn_id="s3_task",
                               dag=dag)

    # get_from_S3 = PythonOperator(
    #     task_id='get_from_S3',
    #     python_callable=get_file_from_s3,
    #     dag=dag
    #     )
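    # `upload_file_to_S3_with_hook` is not shown in this snippet. A minimal
    # sketch, assuming the Airflow 1.x S3Hook.load_file API and the same
    # 's3_task' connection used by the sensor above (the signature, key and
    # bucket handling are assumptions):
    def upload_file_to_S3_with_hook(filename, key, bucket_name, **kwargs):
        from airflow.hooks.S3_hook import S3Hook
        hook = S3Hook(aws_conn_id='s3_task')
        hook.load_file(filename=filename, key=key,
                       bucket_name=bucket_name, replace=True)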

    upload_to_S3_task = PythonOperator(
        task_id='upload_file_to_S3',
        python_callable=upload_file_to_S3_with_hook,
        params={
            'filename': '/home/akorede/Documents/mycsv.csv',
Example #5
with DAG(
        "csv2postgres",
        description="Read CSV files from S3 and load into Postgres db",
        schedule_interval="0 12 * * *",
        start_date=days_ago(2),
        catchup=False,
) as csv2postgres:
    bucket_name = s3_conn.extra_dejson["bucket_name"]
    file_key = "s3://{{params.bucket_name}}/{{ds}}/spire2csv__query_{{params.table}}_to_csv__{{ds_nodash}}/{{params.table}}.csv"
    for table, query in queries.items():
        check_for_files = S3KeySensor(
            task_id=f"check_s3_for_{table}_file",
            bucket_key=file_key,
            poke_interval=60,
            params={
                "table": table,
                "bucket_name": bucket_name
            },
            timeout=60 * 60 * 12,  # timeout after 12 hours of waiting for the file
        )

        csv2postgres_task = PythonOperator(
            task_id=f"{table}_csv_to_db",
            python_callable=csv_to_postgres,
            provide_context=True,
            params={
                "table": table,
                "bucket_name": bucket_name
            },
            op_kwargs={
Example #6
    :return: bool
    """
    with managed_connection() as connection:
        process_source_dir(connection, OUTPUT_DIR, is_batch_mode=True)

    return SUCCESS


dag = DAG('data_pipeline',
          default_args=default_args,
          schedule_interval='@daily')

bucket_watcher = S3KeySensor(task_id='bucket_watcher',
                             poke_interval=5,
                             timeout=300,
                             soft_fail=True,
                             wildcard_match=True,
                             bucket_key='*',
                             bucket_name=INCOMING_BUCKET,
                             dag=dag)
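# `_get_file_keys` is not included in this snippet. A minimal sketch, assuming
# it lists the keys sitting in the incoming bucket and returns them so they are
# pushed to XCom for the download task (the connection id is an assumption):
def _get_file_keys(**context):
    from airflow.hooks.S3_hook import S3Hook
    hook = S3Hook(aws_conn_id='aws_default')
    keys = hook.list_keys(bucket_name=INCOMING_BUCKET) or []
    return keys  # the return value is pushed to XCom automatically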

get_file_keys = PythonOperator(task_id='get_file_keys',
                               provide_context=True,
                               python_callable=_get_file_keys,
                               dag=dag)

download_zip_files = PythonOperator(task_id='download_zip_files',
                                    provide_context=True,
                                    python_callable=_download_zip_files,
                                    dag=dag)

move_zip_files_to_archive = PythonOperator(
Example #7
    "start_date": datetime(2019, 1, 24),
    "email": ["*****@*****.**"],
    "email_on_retry": False,
    "retry_delay": timedelta(minutes=5),
    "retries": 2
}

dag = DAG('EMR_TEST_1',
          default_args=DEFAULT_ARGS,
          catchup=False,
          schedule_interval="0 1 * * *")

with dag:
    file_sensor = S3KeySensor(task_id='file_sensor',
                              poke_interval=600,
                              timeout=1000,
                              soft_fail=False,
                              bucket_name='ds-afarrell',
                              bucket_key='manybla.txt')

    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_benchmarks_connection')

    run_some_pyspark = EmrAddStepsOperator(
        task_id='run_some_pyspark',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=EMR_STEP_1)
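# JOB_FLOW_OVERRIDES and EMR_STEP_1 come from elsewhere in the original file
# (they would be defined above the DAG). Purely as an illustration, EMR_STEP_1
# is a list of EMR step definitions in the boto3 add_job_flow_steps format,
# e.g. a single spark-submit step (the step name and script path are made up):
EMR_STEP_1 = [
    {
        'Name': 'run_some_pyspark',
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['spark-submit', 's3://ds-afarrell/scripts/benchmark_job.py'],
        },
    },
]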
Example #8
from airflow.operators.sensors import S3KeySensor
from airflow.operators.bash_operator import BashOperator
from datetime import datetime as dt

from airflow import DAG

with DAG('s3_event') as dag:

    t1 = S3KeySensor(
        task_id='s3_sensor',
        bucket_name='airflow-input-coke',
        bucket_key='*',
        start_date=dt.now(),
        dag=dag)
    
    t2 = BashOperator(
        task_id='print_key',
        bash_command='echo "I Win"',
        start_date=dt.now(),
        dag=dag
    )

    t1 >> t2
Example #9
default_args = {
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('dms-cassandraV3',
          default_args=default_args,
          schedule_interval='@daily')

s3ready = S3KeySensor(
    task_id='s3_file',
    poke_interval=0,
    timeout=15,
    soft_fail=False,
    bucket_key=
    's3://dms-deploy/flood-monitoring/archive/readings-full-{{ yesterday_ds }}.csv.gz',
    bucket_name=None,
    s3_conn_id=Variable.get("s3_connection"),
    dag=dag)


def downloadDatafile(date, credentials):
    filename = "readings-full-" + date + ".csv.gz"
    s3 = boto3.resource(
        "s3",
        aws_access_key_id=credentials['aws_access_key_id'],
        aws_secret_access_key=credentials['aws_secret_access_key'])
    try:
        print("Downloading File")
        s3.Bucket("dms-deploy").download_file(
Example #10
"""
dag.doc_md =
#### Update DB
Update DB takes in the csv downloaded from oura and loads it into postgres database
as soon as dag is triggered by file landing in s3 bucket
"""

dag = DAG(dag_id='oura_pipeline',
          default_args=default_args,
          description='ETL',
          schedule_interval=timedelta(days=1))

sensor = S3KeySensor(task_id='s3_file_check',
                     bucket_key='oura_*',
                     wildcard_match=True,
                     bucket_name='ouraringbackupdata',
                     s3_conn_id='my_conn_S3',
                     timeout=18 * 60 * 60,
                     poke_interval=120,
                     dag=dag)

t1 = PythonOperator(task_id='read_csv',
                    provide_context=False,
                    python_callable=read_csv,
                    dag=dag)

t2 = PythonOperator(task_id='transform_data',
                    provide_context=False,
                    python_callable=transform,
                    dag=dag)

t3 = PythonOperator(task_id='Load_to_postgre',
Example #11
    os.system(f'spark-submit --conf spark.cores.max={max_cores} --executor-memory=3G ' +\
    '$sparkf ~/eCommerce/data-processing/ingestion.py')


def run_time_window():
    ''' spark-submit pyspark script that maintains 24-hour window for the
     minute-level datatable on PostgreSQL DB
    '''
    os.system(f'spark-submit --conf spark.cores.max=14 --executor-memory=5G ' +\
    '$sparkf ~/eCommerce/data-processing/table_time_window.py')


new_file_sensor = S3KeySensor(
    task_id='new_csv_sensor',
    poke_interval=5,  # check for a new file every 5 seconds
    timeout=30,  # time out after 30 seconds
    bucket_key="s3://maxwell-insight/serverpool/*.csv",
    bucket_name=None,
    wildcard_match=True,
    dag=dag)

spark_ingestion = PythonOperator(task_id='spark_ingestion',
                                 python_callable=run_ingestion,
                                 trigger_rule='none_failed',
                                 dag=dag)

table_time_window = PythonOperator(task_id='table_time_window',
                                   python_callable=run_time_window,
                                   dag=dag)

new_file_sensor >> spark_ingestion >> table_time_window
Example #12
args = {
    'retries': 0,
    # 'retry_delay': timedelta(minutes=5),
    # 'execution_timeout': timedelta(minutes=10),
}

dag = DAG(
    dag_id='deploy_stack_on_file_upload',
    schedule_interval=schedule,
    default_args=args,
    catchup=False,
)

file_sensor = S3KeySensor(
    task_id='s3_key_sensor_task',
    poke_interval=60 * 1,  # seconds
    timeout=60 * 10,  # seconds
    bucket_key="s3://auto-bench/docker-compose.yml",
    bucket_name=None,
    wildcard_match=False,
    dag=dag)

move_file = BashOperator(
    task_id="move_yml",
    bash_command=
    "aws s3 mv s3://auto-bench/docker-compose.yml /home/ec2-user/docker-compose.yml",
    dag=dag)

rm_prev_stack = BashOperator(task_id="rm_prev_stack",
                             bash_command="docker stack rm AutoBench",
                             dag=dag)

docker_prune = BashOperator(task_id="docker_prune",
Example #13
    task_id='s3_ingest', 
    bash_command='aws s3 sync s3://air-flow-lightning s3://airflow-lightning-origin', 
    queue='default',
    dag=dag)

# Spark processing operator 

spark_batch = BashOperator(
    task_id='spark_batch', 
    bash_command='spark-submit ~/code/Duo-flow/spark.py',
    queue='default', 
    dag=dag)

# S3 file sensor operator which senses the newly created file in S3
s3_file_sensor = S3KeySensor(
    task_id='s3_file_sensor',
    queue='default',
    bucket_key='s3://air-flow-lightning-output/lightning_2020output.csv',
    bucket_name=None,
    dag=dag)

# Store to DB operator that stores the result in PostgreSQL
store_db = PythonOperator(
    task_id = 'store_db', 
    provide_context=True,
    python_callable=store_db, 
    queue='default',
    dag=dag)
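# The snippet is cut off before the task dependencies are declared. By analogy
# with the nearly identical pipeline in Example #14, the chain would plausibly
# look like the following (the variable name of the ingest task is not visible
# above, so `s3_ingest` is assumed):
s3_ingest >> spark_batch >> s3_file_sensor >> store_db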

Example #14
srcDir = '/home/ubuntu/fault-tolerant-airflow/src/spark/'

# Command to run remote spark batch processing
cmd = 'ssh [email protected] spark-submit' + ' ' + srcDir + 'PDS.py --master ec2-18-235-191-19.compute-1.amazonaws.com --deploy-mode=cluster'

objectKey = 's3n://de-yk-bucket/PDS/XETR/DailyAverages/' + str(todays_date_str) + '.csv'


# Bash operator that synchronizes the Deutsche XETR Public Dataset with my bucket stored in S3
s3_ingest_opr = BashOperator(task_id='s3_ingest', bash_command='aws s3 sync s3://deutsche-boerse-xetra-pds s3://de-yk-bucket/PDS/XETR/ ', dag=dag)

# Remote batch processing operator that calculates the daily averages of stock prices
spark_batch_opr = BashOperator(task_id='spark_batch', bash_command=cmd, dag=dag)

# S3 file sensor operator that senses the temporarily created csv file in S3
s3_file_sensor_opr = S3KeySensor(
    task_id='s3_file_sensor',
    poke_interval=60,
    timeout=10,
    soft_fail=True,
    bucket_key=objectKey,
    bucket_name=None,
    dag=dag)

# Store to DB operator that stores the calculated daily average prices in PostgreSQL
store_to_db_opr = PythonOperator(task_id = 'store_to_db', provide_context=True, python_callable=store_to_db, dag=dag)


# Create dependencies for the DAG
s3_ingest_opr >> spark_batch_opr >> s3_file_sensor_opr >> store_to_db_opr
Example #15
                              python_callable=print_context,
                              dag=dag)

    send_file = PythonOperator(
        task_id='send_file',
        #trigger_rule='all_success',
        python_callable=sendFile,
        op_kwargs={'filename': filename},
        dag=dag)

    s3_chk = S3KeySensor(task_id='s3_chk',
                         s3_conn_id='dev1_s3',
                         depends_on_past=False,
                         poke_interval=2,
                         timeout=15,
                         soft_fail=False,
                         bucket_key='{}input/{}'.format(
                             Variable.get('s3_buckey'),
                             Variable.get('s3_filename')),
                         bucket_name=None,
                         wildcard_match=False,
                         dag=dag)

    s3_create_project = PythonOperator(task_id='s3_create_project',
                                       depends_on_past=False,
                                       op_kwargs={
                                           'project_id': '',
                                           'bucket': filename
                                       },
                                       python_callable=create_structure,
                                       dag=dag)
Example #16
############################################################
############################################################
# Create S3KeySensor Task:
# 5.Sense_S3_Source
############################################################
############################################################

# Sensor task to verify existence of the source data on S3
# aws_conn_id is set up in the Admin panel on the Web UI (I used the default provided by Airflow)
# In this case, the AWS CLI is getting credentials from ~/.aws, though these could be
# encrypted as variables in Airflow
t5 = S3KeySensor(task_id='5.Sense_S3_Source',
                 aws_conn_id="aws_default",
                 bucket_key=source_data_path,
                 bucket_name=None,
                 poke_interval=5,
                 timeout=5,
                 trigger_rule="one_failed",
                 dag=dag)

############################################################
############################################################
# Create BashOperator Task:
# 6.Generate_Copied.hash
############################################################
############################################################

# This BashOperator copies data from S3 to the local drive
# using the AWS CLI (which needs to be available in the OS)
# In this case, the AWS CLI is getting credentials from ~/.aws, though these could be
# encrypted as variables in Airflow
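# The operator described above is cut off in this snippet. A minimal sketch of
# what it could look like, assuming the AWS CLI is on the PATH and that
# `source_data_path` points at a single S3 object; the local destination path
# and the md5sum-based hash file are illustrative assumptions:
t6 = BashOperator(task_id='6.Generate_Copied.hash',
                  bash_command=('aws s3 cp {{ params.src }} {{ params.dst }} && '
                                'md5sum {{ params.dst }} > {{ params.dst }}.hash'),
                  params={'src': source_data_path, 'dst': '/tmp/source_data'},
                  dag=dag)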
Example #17
#     task_id='redshift_table',
#     dag=dag,
#     postgres_conn_id='redshift',
#     sql='sql/create_tables.sql'
# )

# s3://udacity-dend/log_data/2018/11/
# log_data/2018/11/2018-11-01-events.json
# log_data/{{year}}/{{month}}/{{yyyy-mm-dd}}-events.json
# LOG_JSONPATH='s3://udacity-dend/log_json_path.json'

s3_file = S3KeySensor(
    task_id='s3_file_check',
    bucket_key=
    's3://udacity-dend/log_data/{{macros.ds_format(ds,"%Y-%m-%d","%Y")}}/{{macros.ds_format(ds,"%Y-%m-%d","%m")}}/{{ds}}-events.json',
    bucket_name=None,
    aws_conn_id='aws_credentials',
    poke_interval=2,
    timeout=10,
    soft_fail=True,
    dag=dag)
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='staging_events',
    dag=dag,
    # end_date=datetime(2018, 11, 30, hour=23),
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    copy_sql=SqlQueries.copy_staging_events,
    params={'log_path': 's3://udacity-dend/log_json_path.json'},
    # s3_key='log_data/'
)
Example #18
default_args = {
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("s3_sensor_example",
          default_args=default_args,
          schedule_interval=timedelta(minutes=30),
          catchup=False)

s3_sensor = S3KeySensor(task_id='s3_sensor',
                        bucket_key='sensor_test/*',
                        wildcard_match=True,
                        bucket_name='cdn.getsixthman.com',
                        s3_conn_id='sixthman_airflow_s3',
                        dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

t3 = BashOperator(
    task_id="templated",
    bash_command="echo wooooooooo",
    dag=dag,
)

t2.set_upstream(s3_sensor)
t3.set_upstream(s3_sensor)
Example #19
default_args = {
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=10)
}

# run every day at 12:01 pm (cron: minute 1, hour 12)
my_dag = DAG('s3_spark_mysql',
             default_args=default_args,
             schedule_interval='1 12 * * *')

t1 = S3KeySensor(
    task_id='s3_file_test',
    poke_interval=30,
    timeout=10,
    soft_fail=False,
    bucket_key=data_uri,  #expected file
    bucket_name=None,
    wildcard_match=True,
    dag=my_dag)

t2 = BashOperator(task_id='extract_users',
                  depends_on_past=False,
                  bash_command="""$SPARK_HOME/bin/spark-submit \
    --packages mysql:mysql-connector-java:5.1.40 \
    --master spark://ip-10-0-0-11:7077 \
    --executor-memory 6G \
    /home/ubuntu/venmo/spark/userinfo.py """ + data_file,
                  dag=my_dag)

t3 = BashOperator(task_id='net_spending',
Example #20
    monitor_step_op_1 = EmrStepSensor(
        task_id='watch_step_pi',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    monitor_step_op_2 = EmrStepSensor(
        task_id='watch_step_distcp',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    validate_path_exists = S3KeySensor(
        task_id='validate_pii_exist',
        bucket_name='{{ params.bucket_name }}',
        bucket_key='{{ params.bucket_key }}',
        wildcard_match=True)

    terminate_cluster_op = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )
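    # `handle_failure_task` is not part of this snippet. A minimal sketch: with
    # TriggerRule.ONE_FAILED the task below only runs after an upstream failure,
    # so a simple implementation just surfaces that failure (alerting omitted):
    def handle_failure_task():
        from airflow.exceptions import AirflowException
        raise AirflowException('An upstream task in the EMR pipeline failed.')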

    handle_failure_op = PythonOperator(
        task_id='handle_failure',
        python_callable=handle_failure_task,
        trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

    create_cluster_op >> monitor_cluster_op >> handle_failure_op >> terminate_cluster_op
Example #21
def grab_file():
    s3_conn_id = 'my_conn_S3'
    s3 = S3Hook(s3_conn_id)

    key_label = "file-to-watch-3.txt"
    key = s3.get_key(key_label, 'superconductive-airflow-bucket')
    key_string = key.get_contents_as_string()

    return key_string


dag = DAG('s3_connect_dag',
          default_args=default_args,
          schedule_interval='@once')

file_processor = PythonOperator(task_id='grab_file_from_s3',
                                python_callable=grab_file,
                                dag=dag)

file_trigger = S3KeySensor(task_id='check_s3_for_file_in_s3',
                           bucket_key='file-to-watch-*',
                           wildcard_match=True,
                           bucket_name='superconductive-airflow-bucket',
                           s3_conn_id='my_conn_S3',
                           timeout=18 * 60 * 60,
                           poke_interval=120,
                           dag=dag)

file_processor.set_upstream(file_trigger)
Example #22
default_args = {
    'email_on_retry': False,
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(days=1),
}

dag = DAG('af-dnaseq-align-wgs',
          default_args=default_args,
          schedule_interval=None
          #          schedule_interval='@once'
          )

sensor = S3KeySensor(task_id='check_s3',
                     bucket_name='secure-east2-test-bucket',
                     s3_host='s3-us-east-2.amazonaws.com',
                     bucket_key='jobs*.json',
                     wildcard_match=True,
                     s3_conn_id='fs_default',
                     timeout=0,
                     poke_interval=0,
                     soft_fail=False,
                     dag=dag)


def create_run_jobs(queue_json_file):
    with TemporaryDirectory() as temp_git_dir:
        # create jobs
        job_creation_uuid = str(uuid.uuid4())

        ## get job data
        with open(queue_json_file.name, 'r') as f:
            f.seek(0)
            queue_dict = json.loads(f.read())