default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello world" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key='*',
    wildcard_match=True,
    bucket_name='airflow-input-sprite',
    s3_conn_id='aws_default',
    timeout=18*60*60,
    poke_interval=120,
    dag=dag)

t1.set_upstream(sensor)
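
Like several of the snippets below, this one omits its imports. A minimal sketch of what it assumes (legacy Airflow 1.x import paths, mirroring the import style used in Example #6 further down; newer releases moved these classes into provider packages):

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import BashOperator, S3KeySensor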

Example #2
dag = DAG(
    'ml-pipeline',
    default_args=default_args,
    concurrency=1,
    description='A simple ML data pipeline DAG',
    schedule_interval='@daily',
)

t_export_bq_to_s3 = PythonOperator(task_id='export_bq_to_s3',
                                   python_callable=bq_to_s3,
                                   dag=dag,
                                   retries=1)

check_s3_for_key = S3KeySensor(task_id='check_s3_for_key',
                               bucket_key=OUTPUT_FILE_KEY,
                               wildcard_match=True,
                               bucket_name=BUCKET_NAME,
                               s3_conn_id='aws_default',
                               timeout=20,
                               poke_interval=5,
                               dag=dag)

t_check_dataset_group = BranchPythonOperator(
    task_id='check_dataset_group',
    provide_context=True,
    python_callable=check_dataset_group,
    retries=1,
    dag=dag,
)

t_init_personalize = DummyOperator(
    task_id="init_personalize",
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag,
)
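
The fragment above is cut off before the branch callable and the task wiring. A hedged sketch of how the pieces typically fit together (the callable body, the 'skip_init' task id, and the wiring below are hypothetical, not taken from the original DAG):

# Hypothetical sketch: a BranchPythonOperator callable returns the task_id
# (or list of task_ids) that should run next; all other downstream tasks
# are skipped.
def check_dataset_group(**context):
    dataset_group_exists = False  # placeholder for the real lookup
    return 'init_personalize' if not dataset_group_exists else 'skip_init'

# Hypothetical wiring consistent with the operators defined above.
t_export_bq_to_s3 >> check_s3_for_key >> t_check_dataset_group >> t_init_personalize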
Example #3
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello, it should work" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key='XXX/YYY/ZZZ.xml',
    wildcard_match=True,
    bucket_name='{BUCKET_NAME}',
    timeout=18*60*60,
    poke_interval=120,
    aws_conn_id='s3_connection',
    dag=dag)

t1.set_upstream(sensor)
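
The 's3_connection' referenced by aws_conn_id has to exist before the sensor can poke. One common way to supply it (an assumption here, using Airflow's AIRFLOW_CONN_<CONN_ID> environment-variable convention rather than anything shown in the original example) is a URI-style variable:

# Assumption: connection supplied via environment variable instead of the
# Airflow UI; the placeholder credentials below are illustrative only.
#
#   export AIRFLOW_CONN_S3_CONNECTION='aws://<access_key_id>:<secret_access_key>@'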
Example #4
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello, it should work" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(task_id='check_s3_for_file_in_s3',
                     bucket_key='*',
                     wildcard_match=True,
                     bucket_name='uploadonly',
                     s3_conn_id='minio_connection',
                     timeout=18 * 60 * 60,
                     poke_interval=120,
                     dag=dag)

t1.set_upstream(sensor)
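
For intuition, the poke step of this sensor in the 1.10-era codebase boils down to two S3Hook calls; a simplified sketch (not the exact implementation, check the installed version's source for details):

from airflow.hooks.S3_hook import S3Hook

def poke_like(bucket_key, bucket_name, wildcard_match, conn_id='aws_default'):
    # wildcard_match=True treats bucket_key as a glob pattern,
    # otherwise it is checked as a literal object key.
    hook = S3Hook(aws_conn_id=conn_id)
    if wildcard_match:
        return hook.check_for_wildcard_key(bucket_key, bucket_name)
    return hook.check_for_key(bucket_key, bucket_name)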
        "raw-ingester-out",
        "manifests",
        table,
        # tested using specific arguments
        "20190704",
        "15",
        "completed.manifest"
        )

    query_log = SqlUtils.load_query(stage_sql_path).split("---")

    sensor = S3KeySensor(
        task_id="s3_key_sensor_{}_task".format(table),
        #bucket_key="raw-ingester-out/manifests/*",
        bucket_key=KEY_PATH,
        wildcard_match=True,
        bucket_name=BUCKET_NAME,
        aws_conn_id=AWS_CONN_ID,
        timeout=18*60*60,
        poke_interval=120
    )

    stage_adlogs_hourly_job = SnowflakeOperator(
        task_id="stage_logs_{}_hourly".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=query_log,
        params={
            "env": ENV,
            "team_name": TEAM_NAME
        },
    )
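
    # Hypothetical completion: the fragment ends before the tasks are wired,
    # but the usual pattern is to gate the Snowflake staging job on the
    # S3 key sensor for each table.
    sensor >> stage_adlogs_hourly_job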
Example #6
from airflow import DAG
from airflow.operators import (SimpleHttpOperator, HttpSensor,
                               BashOperator, EmailOperator, S3KeySensor)
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 10, 29),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=0.5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(task_id='bash_test',
                  bash_command='echo "Hello, Billionaire!" > s3_conn_test.txt',
                  dag=dag)

sensor = S3KeySensor(task_id='check_for_file_in_s3',
                     bucket_key='file-to-watch-*',
                     wildcard_match=True,
                     bucket_name='intellia-sensor-bucket',
                     timeout=18 * 60 * 60,
                     poke_interval=120,
                     dag=dag)

t1.set_upstream(sensor)
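
No connection id is given here, so the sensor falls back to the default 'aws_default' connection. A quick way to sanity-check it (a sketch, not part of the original example) is to call poke() once from a Python shell:

# Sketch only: returns True as soon as a key matching 'file-to-watch-*'
# exists in the bucket, using the default 'aws_default' connection.
if __name__ == '__main__':
    print(sensor.poke(context={}))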
Example #7
templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

s3sensor = S3KeySensor(
    task_id='new_s3_file_in_fanta-bucket',
    bucket_key='*',
    wildcard_match=True,
    bucket_name='airflow-input-fanta',
    s3_conn_id='My-funky-s3-connector',
    timeout=18*60*60,
    poke_interval=120,
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)

s3sensor >> t1

Example #8
    print(resp)
    return "OK"

with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(2),
    schedule_interval=None,
    tags=['athena','redshift'],
) as dag:
    check_s3_for_key = S3KeySensor(
        task_id='check_s3_for_key',
        bucket_key=s3_key,
        wildcard_match=True,
        bucket_name=s3_bucket_name,
        s3_conn_id='aws_default',
        timeout=20,
        poke_interval=5,
        dag=dag
    )
    files_to_s3 = PythonOperator(
        task_id="files_to_s3",
        python_callable=download_zip
    )
    
    create_athena_movie_table = AWSAthenaOperator(
        task_id="create_athena_movie_table",
        query=create_athena_movie_table_query,
        database=athena_db,
        output_location='s3://' + s3_bucket_name + '/' + athena_results + 'create_athena_movie_table',
    )

    create_athena_ratings_table = AWSAthenaOperator(
        task_id="create_athena_ratings_table",
        query=create_athena_ratings_table_query,
        database=athena_db,
        output_location='s3://' + s3_bucket_name + '/' + athena_results + 'create_athena_ratings_table',
    )

    create_athena_tags_table = AWSAthenaOperator(
        task_id="create_athena_tags_table",
        query=create_athena_tags_table_query,
        database=athena_db,
        output_location='s3://' + s3_bucket_name + '/' + athena_results + 'create_athena_tags_table',
    )
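
    # Hypothetical wiring (the fragment does not show the original ordering):
    # fetch the files, wait for the key, then create the three Athena tables
    # in parallel.
    files_to_s3 >> check_s3_for_key >> [
        create_athena_movie_table,
        create_athena_ratings_table,
        create_athena_tags_table,
    ]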