Example #1
)

local_file_sensor = FileSensor(
    task_id='check_for_local_file',
    filepath='/usr/local/airflow/data/sample.txt',
    dag=word_count_dag,
    timeout=30,
    poke_interval=10,
)

s3_file_sensor = S3KeySensor(
    task_id='check_for_s3_file',
    aws_conn_id='my_conn_S3',
    bucket_name='calculator-api',
    bucket_key='twitter-raw/eia-prod/input.txt',
    wildcard_match=True,
    timeout=30,
    poke_interval=10,
    dag=word_count_dag,
    trigger_rule=TriggerRule.ALL_FAILED,
)

s3_file_download = PythonOperator(
    task_id='download_text_file_from_s3',
    python_callable=download_s3_file,
    op_args=[
        'calculator-api', 'twitter-raw/eia-prod/input.txt',
        '/usr/local/airflow/data/sample.txt'
    ],
    dag=word_count_dag,
)
Example #2
default_args = {
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(dag_id='s3_sensor_123',
          schedule_interval='@once',
          default_args=default_args)


def download(key, bucket_name, local_file):
    hook = S3Hook(aws_conn_id='aws_default')
    data = hook.read_key(key, bucket_name)
    with open(local_file, 'w') as file:  # read_key returns a str, so open in text mode
        file.write(data)


bucket_key = 'driver-data/s3_sensor.csv'
bucket_name = 'wdt-datalake'
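# `local_file` is passed to the download task below but is not defined in this
# snippet; an assumed, purely illustrative value:
local_file = '/tmp/s3_sensor.csv'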

t0 = S3KeySensor(task_id='s3_sensor',
                 bucket_key=bucket_key,
                 bucket_name=bucket_name,
                 sla=timedelta(minutes=2),
                 dag=dag)

t1 = PythonOperator(task_id='download',
                    python_callable=download,
                    op_args=[bucket_key, bucket_name, local_file],
                    dag=dag)

t0 >> t1
Example #3
dag = DAG(
    DAG_NAME,
    description='Load data from Procon complaints from S3 to Redshift.',
    start_date=start_date,
    schedule_interval=datetime.timedelta(hours=1),
    catchup=False,
    max_active_runs=1,
)

start_operator = DummyOperator(task_id='begin_execution', dag=dag)

has_file_to_process = S3KeySensor(
    task_id='has_file_to_process',
    dag=dag,
    bucket_name=S3_BUCKET,
    bucket_key=f'{S3_KEY}/*.csv',
    wildcard_match=True,
    aws_conn_id=AWS_CREDENTIALS,
    timeout=31,
    poke_interval=30,
    soft_fail=True,
)

create_procon_stage_table = PostgresOperator(
    task_id='create_procon_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        procon_queries['drop_stage_table'],
        procon_queries['create_stage_table']
    ])
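# `procon_queries` is defined elsewhere in the original project. Purely to
# illustrate the shape the PostgresOperator above expects, it could be a plain
# dict of SQL strings (the table name and columns below are made up, and in the
# original file it would be defined above the operator):
procon_queries = {
    'drop_stage_table': 'DROP TABLE IF EXISTS procon_complaints_stage;',
    'create_stage_table': """
        CREATE TABLE procon_complaints_stage (
            complaint_id   VARCHAR(64),
            company_name   VARCHAR(256),
            complaint_date DATE,
            raw_payload    VARCHAR(MAX)
        );
    """,
}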
Example #4
    #     dag=dag
    #     )

    moment = datetime.now()
    b_name = Variable.get("s3_bucket")
    source = Variable.get("pinpoint")
    year = moment.year
    month = '%02d' % moment.month
    day = '%02d' % moment.day
    hr = moment.hour
    bucket_key_template = f'{source}/{year}/{month}/{day}/ypsource.json'

    get_new_json = S3KeySensor(task_id="get_new_json",
                               poke_interval=60 * 2,
                               timeout=60 * 60 * 3,
                               bucket_key=bucket_key_template,
                               bucket_name=b_name,
                               wildcard_match=False,
                               aws_conn_id="s3_task",
                               dag=dag)

    # get_from_S3 = PythonOperator(
    #     task_id='get_from_S3',
    #     python_callable=get_file_from_s3,
    #     dag=dag
    #     )
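    # `upload_file_to_S3_with_hook` is not shown in this snippet. A minimal
    # sketch, assuming the Airflow 1.x S3Hook.load_file API and the same
    # 's3_task' connection used by the sensor above (the signature, key and
    # bucket handling are assumptions):
    def upload_file_to_S3_with_hook(filename, key, bucket_name, **kwargs):
        from airflow.hooks.S3_hook import S3Hook
        hook = S3Hook(aws_conn_id='s3_task')
        hook.load_file(filename=filename, key=key,
                       bucket_name=bucket_name, replace=True)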

    upload_to_S3_task = PythonOperator(
        task_id='upload_file_to_S3',
        python_callable=upload_file_to_S3_with_hook,
        params={
            'filename': '/home/akorede/Documents/mycsv.csv',
Example #5
with DAG(
        "csv2postgres",
        description="Read CSV files from S3 and load into Postgres db",
        schedule_interval="0 12 * * *",
        start_date=days_ago(2),
        catchup=False,
) as csv2postgres:
    bucket_name = s3_conn.extra_dejson["bucket_name"]
    file_key = "s3://{{params.bucket_name}}/{{ds}}/spire2csv__query_{{params.table}}_to_csv__{{ds_nodash}}/{{params.table}}.csv"
    for table, query in queries.items():
        check_for_files = S3KeySensor(
            task_id=f"check_s3_for_{table}_file",
            bucket_key=file_key,
            poke_interval=60,
            params={
                "table": table,
                "bucket_name": bucket_name
            },
            timeout=60 * 60 * 12,  # timeout after 12 hours of waiting for the file
        )

        csv2postgres_task = PythonOperator(
            task_id=f"{table}_csv_to_db",
            python_callable=csv_to_postgres,
            provide_context=True,
            params={
                "table": table,
                "bucket_name": bucket_name
            },
            op_kwargs={
Example #6
    :return: bool
    """
    with managed_connection() as connection:
        process_source_dir(connection, OUTPUT_DIR, is_batch_mode=True)

    return SUCCESS


dag = DAG('data_pipeline',
          default_args=default_args,
          schedule_interval='@daily')

bucket_watcher = S3KeySensor(task_id='bucket_watcher',
                             poke_interval=5,
                             timeout=300,
                             soft_fail=True,
                             wildcard_match=True,
                             bucket_key='*',
                             bucket_name=INCOMING_BUCKET,
                             dag=dag)
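# `_get_file_keys` is not included in this snippet. A minimal sketch, assuming
# it lists the keys sitting in the incoming bucket and returns them so they are
# pushed to XCom for the download task (the connection id is an assumption):
def _get_file_keys(**context):
    from airflow.hooks.S3_hook import S3Hook
    hook = S3Hook(aws_conn_id='aws_default')
    keys = hook.list_keys(bucket_name=INCOMING_BUCKET) or []
    return keys  # the return value is pushed to XCom automatically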

get_file_keys = PythonOperator(task_id='get_file_keys',
                               provide_context=True,
                               python_callable=_get_file_keys,
                               dag=dag)

download_zip_files = PythonOperator(task_id='download_zip_files',
                                    provide_context=True,
                                    python_callable=_download_zip_files,
                                    dag=dag)

move_zip_files_to_archive = PythonOperator(
Example #7
    "start_date": datetime(2019, 1, 24),
    "email": ["*****@*****.**"],
    "email_on_retry": False,
    "retry_delay": timedelta(minutes=5),
    "retries": 2
}

dag = DAG('EMR_TEST_1',
          default_args=DEFAULT_ARGS,
          catchup=False,
          schedule_interval="0 1 * * *")

with dag:
    file_sensor = S3KeySensor(task_id='file_sensor',
                              poke_interval=600,
                              timeout=1000,
                              soft_fail=False,
                              bucket_name='ds-afarrell',
                              bucket_key='manybla.txt')

    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_benchmarks_connection')

    run_some_pyspark = EmrAddStepsOperator(
        task_id='run_some_pyspark',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=EMR_STEP_1)
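# JOB_FLOW_OVERRIDES and EMR_STEP_1 come from elsewhere in the original file
# (they would be defined above the DAG). Purely as an illustration, EMR_STEP_1
# is a list of EMR step definitions in the boto3 add_job_flow_steps format,
# e.g. a single spark-submit step (the step name and script path are made up):
EMR_STEP_1 = [
    {
        'Name': 'run_some_pyspark',
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['spark-submit', 's3://ds-afarrell/scripts/benchmark_job.py'],
        },
    },
]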
Example #8
from airflow.operators.sensors import S3KeySensor
from airflow.operators.bash_operator import BashOperator
from datetime import datetime as dt

from airflow import DAG

with DAG('s3_event') as dag:

    t1 = S3KeySensor(
        task_id='s3_sensor',
        bucket_name='airflow-input-coke',
        bucket_key='*',
        start_date=dt.now(),
        dag=dag)
    
    t2 = BashOperator(
        task_id='print_key',
        bash_command='echo "I Win"',
        start_date=dt.now(),
        dag=dag
    )

    t1 >> t2
Example #9
default_args = {
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('dms-cassandraV3',
          default_args=default_args,
          schedule_interval='@daily')

s3ready = S3KeySensor(
    task_id='s3_file',
    poke_interval=0,
    timeout=15,
    soft_fail=False,
    bucket_key=
    's3://dms-deploy/flood-monitoring/archive/readings-full-{{ yesterday_ds }}.csv.gz',
    bucket_name=None,
    s3_conn_id=Variable.get("s3_connection"),
    dag=dag)


def downloadDatafile(date, credentials):
    filename = "readings-full-" + date + ".csv.gz"
    s3 = boto3.resource(
        "s3",
        aws_access_key_id=credentials['aws_access_key_id'],
        aws_secret_access_key=credentials['aws_secret_access_key'])
    try:
        print("Downloading File")
        s3.Bucket("dms-deploy").download_file(
Example #10
"""
dag.doc_md =
#### Update DB
Update DB takes in the csv downloaded from oura and loads it into postgres database
as soon as dag is triggered by file landing in s3 bucket
"""

dag = DAG(dag_id='oura_pipeline',
          default_args=default_args,
          description='ETL',
          schedule_interval=timedelta(days=1))

sensor = S3KeySensor(task_id='s3_file_check',
                     bucket_key='oura_*',
                     wildcard_match=True,
                     bucket_name='ouraringbackupdata',
                     s3_conn_id='my_conn_S3',
                     timeout=18 * 60 * 60,
                     poke_interval=120,
                     dag=dag)

t1 = PythonOperator(task_id='read_csv',
                    provide_context=False,
                    python_callable=read_csv,
                    dag=dag)

t2 = PythonOperator(task_id='transform_data',
                    provide_context=False,
                    python_callable=transform,
                    dag=dag)

t3 = PythonOperator(task_id='Load_to_postgre',
Example #11
    os.system(f'spark-submit --conf spark.cores.max={max_cores} --executor-memory=3G ' +\
    '$sparkf ~/eCommerce/data-processing/ingestion.py')


def run_time_window():
    ''' spark-submit pyspark script that maintains 24-hour window for the
     minute-level datatable on PostgreSQL DB
    '''
    os.system(f'spark-submit --conf spark.cores.max=14 --executor-memory=5G ' +\
    '$sparkf ~/eCommerce/data-processing/table_time_window.py')


new_file_sensor = S3KeySensor(
    task_id='new_csv_sensor',
    poke_interval=5,  # check for a new file every 5 seconds
    timeout=30,  # time out after 30 seconds
    bucket_key="s3://maxwell-insight/serverpool/*.csv",
    bucket_name=None,
    wildcard_match=True,
    dag=dag)

spark_ingestion = PythonOperator(task_id='spark_ingestion',
                                 python_callable=run_ingestion,
                                 trigger_rule='none_failed',
                                 dag=dag)

table_time_window = PythonOperator(task_id='table_time_window',
                                   python_callable=run_time_window,
                                   dag=dag)

new_file_sensor >> spark_ingestion >> table_time_window
Example #12
args = {
    'retries': 0,
    # 'retry_delay': timedelta(minutes=5),
    # 'execution_timeout': timedelta(minutes=10),
}

dag = DAG(
    dag_id='deploy_stack_on_file_upload',
    schedule_interval=schedule,
    default_args=args,
    catchup=False,
)

file_sensor = S3KeySensor(
    task_id='s3_key_sensor_task',
    poke_interval=60 * 1,  # seconds
    timeout=60 * 10,  # seconds
    bucket_key="s3://auto-bench/docker-compose.yml",
    bucket_name=None,
    wildcard_match=False,
    dag=dag)

move_file = BashOperator(
    task_id="move_yml",
    bash_command=
    "aws s3 mv s3://auto-bench/docker-compose.yml /home/ec2-user/docker-compose.yml",
    dag=dag)

rm_prev_stack = BashOperator(task_id="rm_prev_stack",
                             bash_command="docker stack rm AutoBench",
                             dag=dag)

docker_prune = BashOperator(task_id="docker_prune",
Example #13
    task_id='s3_ingest', 
    bash_command='aws s3 sync s3://air-flow-lightning s3://airflow-lightning-origin', 
    queue='default',
    dag=dag)

# Spark processing operator 

spark_batch = BashOperator(
    task_id='spark_batch', 
    bash_command='spark-submit ~/code/Duo-flow/spark.py',
    queue='default', 
    dag=dag)

# S3 file sensor operator which senses the newly created file in S3
s3_file_sensor = S3KeySensor(
    task_id='s3_file_sensor',
    queue='default',
    bucket_key='s3://air-flow-lightning-output/lightning_2020output.csv',
    bucket_name=None,
    dag=dag)

# Store to DB operator that stores the result in PostgreSQL
store_db = PythonOperator(
    task_id = 'store_db', 
    provide_context=True,
    python_callable=store_db, 
    queue='default',
    dag=dag)
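# The snippet is cut off before the task dependencies are declared. By analogy
# with the nearly identical pipeline in Example #14, the chain would plausibly
# look like the following (the variable name of the ingest task is not visible
# above, so `s3_ingest` is assumed):
s3_ingest >> spark_batch >> s3_file_sensor >> store_db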

Example #14
srcDir = '/home/ubuntu/fault-tolerant-airflow/src/spark/'

# Command to run remote spark batch processing
cmd = 'ssh [email protected] spark-submit' + ' ' + srcDir + 'PDS.py --master ec2-18-235-191-19.compute-1.amazonaws.com --deploy-mode=cluster'

objectKey = 's3n://de-yk-bucket/PDS/XETR/DailyAverages/' + str(todays_date_str) + '.csv'


# Bash operator that synchronizes the Deutsche XETR Public Dataset with my bucket stored in S3
s3_ingest_opr = BashOperator(task_id='s3_ingest', bash_command='aws s3 sync s3://deutsche-boerse-xetra-pds s3://de-yk-bucket/PDS/XETR/ ', dag=dag)

# Remote batch processing operator that calculates the daily averages of stock prices
spark_batch_opr = BashOperator(task_id='spark_batch', bash_command=cmd, dag=dag)

# S3 file sensor operator that senses the temporarily created csv file in S3
s3_file_sensor_opr = S3KeySensor(
    task_id='s3_file_sensor',
    poke_interval=60,
    timeout=10,
    soft_fail=True,
    bucket_key=objectKey,
    bucket_name=None,
    dag=dag)

# Store to DB operator that stores the calculated daily average prices in PostgreSQL
store_to_db_opr = PythonOperator(task_id = 'store_to_db', provide_context=True, python_callable=store_to_db, dag=dag)


# Create dependencies for the DAG
s3_ingest_opr >> spark_batch_opr >> s3_file_sensor_opr >> store_to_db_opr
Example #15
                              python_callable=print_context,
                              dag=dag)

    send_file = PythonOperator(
        task_id='send_file',
        #trigger_rule='all_success',
        python_callable=sendFile,
        op_kwargs={'filename': filename},
        dag=dag)

    s3_chk = S3KeySensor(task_id='s3_chk',
                         s3_conn_id='dev1_s3',
                         depends_on_past=False,
                         poke_interval=2,
                         timeout=15,
                         soft_fail=False,
                         bucket_key='{}input/{}'.format(
                             Variable.get('s3_buckey'),
                             Variable.get('s3_filename')),
                         bucket_name=None,
                         wildcard_match=False,
                         dag=dag)

    s3_create_project = PythonOperator(task_id='s3_create_project',
                                       depends_on_past=False,
                                       op_kwargs={
                                           'project_id': '',
                                           'bucket': filename
                                       },
                                       python_callable=create_structure,
                                       dag=dag)
Example #16
############################################################
############################################################
# Create S3KeySensor Task:
# 5.Sense_S3_Source
############################################################
############################################################

# Sensor task to verify existence of the source data on S3
# aws_conn_id is set up in the Admin panel on the Web UI (I used the default provided by Airflow)
# In this case, the AWS CLI is getting credentials from ~/.aws, though these could be
# encrypted as variables in Airflow
t5 = S3KeySensor(task_id='5.Sense_S3_Source',
                 aws_conn_id="aws_default",
                 bucket_key=source_data_path,
                 bucket_name=None,
                 poke_interval=5,
                 timeout=5,
                 trigger_rule="one_failed",
                 dag=dag)

############################################################
############################################################
# Create BashOperator Task:
# 6.Generate_Copied.hash
############################################################
############################################################

# This BashOperator copies data from S3 to the local drive
# using the AWS CLI (which needs to be available in the OS)
# In this case, the AWS CLI is getting credentials from ~/.aws, though these could be
# encrypted as variables in Airflow
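# The operator described above is cut off in this snippet. A minimal sketch of
# what it could look like, assuming the AWS CLI is on the PATH and that
# `source_data_path` points at a single S3 object; the local destination path
# and the md5sum-based hash file are illustrative assumptions:
t6 = BashOperator(task_id='6.Generate_Copied.hash',
                  bash_command=('aws s3 cp {{ params.src }} {{ params.dst }} && '
                                'md5sum {{ params.dst }} > {{ params.dst }}.hash'),
                  params={'src': source_data_path, 'dst': '/tmp/source_data'},
                  dag=dag)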
Example #17
#     task_id='redshift_table',
#     dag=dag,
#     postgres_conn_id='redshift',
#     sql='sql/create_tables.sql'
# )

# s3://udacity-dend/log_data/2018/11/
# log_data/2018/11/2018-11-01-events.json
# log_data/{{year}}/{{month}}/{{yyyy-mm-dd}}-events.json
# LOG_JSONPATH='s3://udacity-dend/log_json_path.json'

s3_file = S3KeySensor(
    task_id='s3_file_check',
    bucket_key=
    's3://udacity-dend/log_data/{{macros.ds_format(ds,"%Y-%m-%d","%Y")}}/{{macros.ds_format(ds,"%Y-%m-%d","%m")}}/{{ds}}-events.json',
    bucket_name=None,
    aws_conn_id='aws_credentials',
    poke_interval=2,
    timeout=10,
    soft_fail=True,
    dag=dag)
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='staging_events',
    dag=dag,
    # end_date=datetime(2018, 11, 30, hour=23),
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    copy_sql=SqlQueries.copy_staging_events,
    params={'log_path': 's3://udacity-dend/log_json_path.json'},
    # s3_key='log_data/'
)
Example #18
default_args = {
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("s3_sensor_example",
          default_args=default_args,
          schedule_interval=timedelta(minutes=30),
          catchup=False)

s3_sensor = S3KeySensor(task_id='s3_sensor',
                        bucket_key='sensor_test/*',
                        wildcard_match=True,
                        bucket_name='cdn.getsixthman.com',
                        s3_conn_id='sixthman_airflow_s3',
                        dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

t3 = BashOperator(
    task_id="templated",
    bash_command="echo wooooooooo",
    dag=dag,
)

t2.set_upstream(s3_sensor)
t3.set_upstream(s3_sensor)
Example #19
default_args = {
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=10)
}

# run every day at 12:01 pm (cron: minute 1, hour 12)
my_dag = DAG('s3_spark_mysql',
             default_args=default_args,
             schedule_interval='1 12 * * *')

t1 = S3KeySensor(
    task_id='s3_file_test',
    poke_interval=30,
    timeout=10,
    soft_fail=False,
    bucket_key=data_uri,  #expected file
    bucket_name=None,
    wildcard_match=True,
    dag=my_dag)

t2 = BashOperator(task_id='extract_users',
                  depends_on_past=False,
                  bash_command="""$SPARK_HOME/bin/spark-submit \
    --packages mysql:mysql-connector-java:5.1.40 \
    --master spark://ip-10-0-0-11:7077 \
    --executor-memory 6G \
    /home/ubuntu/venmo/spark/userinfo.py """ + data_file,
                  dag=my_dag)

t3 = BashOperator(task_id='net_spending',
Example #20
    monitor_step_op_1 = EmrStepSensor(
        task_id='watch_step_pi',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    monitor_step_op_2 = EmrStepSensor(
        task_id='watch_step_distcp',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    validate_path_exists = S3KeySensor(
        task_id='validate_pii_exist',
        bucket_name='{{ params.bucket_name }}',
        bucket_key='{{ params.bucket_key }}',
        wildcard_match=True)

    terminate_cluster_op = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )
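    # `handle_failure_task` is not part of this snippet. A minimal sketch: with
    # TriggerRule.ONE_FAILED the task below only runs after an upstream failure,
    # so a simple implementation just surfaces that failure (alerting omitted):
    def handle_failure_task():
        from airflow.exceptions import AirflowException
        raise AirflowException('An upstream task in the EMR pipeline failed.')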

    handle_failure_op = PythonOperator(
        task_id='handle_failure',
        python_callable=handle_failure_task,
        trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

    create_cluster_op >> monitor_cluster_op >> handle_failure_op >> terminate_cluster_op
Example #21
def grab_file():
    s3_conn_id = 'my_conn_S3'
    s3 = S3Hook(s3_conn_id)

    key_label = "file-to-watch-3.txt"
    key = s3.get_key(key_label, 'superconductive-airflow-bucket')
    key_string = key.get_contents_as_string()

    return key_string


dag = DAG('s3_connect_dag',
          default_args=default_args,
          schedule_interval='@once')

file_processor = PythonOperator(task_id='grab_file_from_s3',
                                python_callable=grab_file,
                                dag=dag)

file_trigger = S3KeySensor(task_id='check_s3_for_file_in_s3',
                           bucket_key='file-to-watch-*',
                           wildcard_match=True,
                           bucket_name='superconductive-airflow-bucket',
                           s3_conn_id='my_conn_S3',
                           timeout=18 * 60 * 60,
                           poke_interval=120,
                           dag=dag)

file_processor.set_upstream(file_trigger)
Example #22
default_args = {
    'email_on_retry': False,
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(days=1),
}

dag = DAG('af-dnaseq-align-wgs',
          default_args=default_args,
          schedule_interval=None
          #          schedule_interval='@once'
          )

sensor = S3KeySensor(task_id='check_s3',
                     bucket_name='secure-east2-test-bucket',
                     s3_host='s3-us-east-2.amazonaws.com',
                     bucket_key='jobs*.json',
                     wildcard_match=True,
                     s3_conn_id='fs_default',
                     timeout=0,
                     poke_interval=0,
                     soft_fail=False,
                     dag=dag)


def create_run_jobs(queue_json_file):
    with TemporaryDirectory() as temp_git_dir:
        # create jobs
        job_creation_uuid = str(uuid.uuid4())

        ## get job data
        with open(queue_json_file.name, 'r') as f:
            f.seek(0)
            queue_dict = json.loads(f.read())