Example #1
 def test_bucket_name_None_and_bucket_key_as_relative_path(self):
     """
     Test if exception is raised when bucket_name is None
     and bucket_key is provided as relative path rather than s3:// url.
     :return:
     """
     with self.assertRaises(AirflowException):
         S3KeySensor(task_id='s3_key_sensor', bucket_key="file_in_bucket")
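For contrast, the two parameter combinations the sensor does accept (they are exactly the ones exercised by the parse test in Example #2); a minimal sketch, with illustrative bucket and key names:

from airflow.sensors.s3_key_sensor import S3KeySensor

# Full s3:// URL: the bucket name is parsed out of the URL itself.
full_url_sensor = S3KeySensor(task_id='full_url_key',
                              bucket_key='s3://test_bucket/file')

# Relative key: bucket_name must then be passed explicitly.
relative_key_sensor = S3KeySensor(task_id='relative_key',
                                  bucket_key='file_in_bucket',
                                  bucket_name='test_bucket')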
Example #2
 def test_parse_bucket_key(self, key, bucket, parsed_key, parsed_bucket):
     s = S3KeySensor(
         task_id='s3_key_sensor',
         bucket_key=key,
         bucket_name=bucket,
     )
     self.assertEqual(s.bucket_key, parsed_key)
     self.assertEqual(s.bucket_name, parsed_bucket)
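As excerpted, test_parse_bucket_key receives its key/bucket arguments from a parameterization decorator that is not shown; a hedged sketch of the usual setup (the concrete parameter rows are illustrative, chosen to match the parsing behaviour the test asserts):

import unittest

from parameterized import parameterized

from airflow.sensors.s3_key_sensor import S3KeySensor


class TestS3KeySensorParseBucketKey(unittest.TestCase):
    @parameterized.expand([
        # bucket_key,        bucket_name, expected key, expected bucket
        ['s3://bucket/key',  None,        'key',        'bucket'],
        ['key',              'bucket',    'key',        'bucket'],
    ])
    def test_parse_bucket_key(self, key, bucket, parsed_key, parsed_bucket):
        s = S3KeySensor(task_id='s3_key_sensor',
                        bucket_key=key,
                        bucket_name=bucket)
        self.assertEqual(s.bucket_key, parsed_key)
        self.assertEqual(s.bucket_name, parsed_bucket)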
Example #3
 def test_bucket_name_provided_and_bucket_key_is_s3_url(self):
     """
     Test if exception is raised when bucket_name is provided
     while bucket_key is provided as a full s3:// url.
     :return:
     """
     with self.assertRaises(AirflowException):
         S3KeySensor(task_id='s3_key_sensor',
                     bucket_key="s3://test_bucket/file",
                     bucket_name='test_bucket')
Example #4
    def test_poke(self, mock_hook):
        s = S3KeySensor(task_id='s3_key_sensor',
                        bucket_key='s3://test_bucket/file')

        mock_check_for_key = mock_hook.return_value.check_for_key
        mock_check_for_key.return_value = False
        self.assertFalse(s.poke(None))
        mock_check_for_key.assert_called_once_with(s.bucket_key, s.bucket_name)

        mock_hook.return_value.check_for_key.return_value = True
        self.assertTrue(s.poke(None))
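The mock_hook argument in test_poke (and in test_poke_wildcard in Example #6) is injected by a mock.patch decorator that the excerpt leaves out; a minimal sketch of that setup, assuming S3Hook is referenced at module level in the sensor's module (if your Airflow version imports S3Hook inside poke(), patch 'airflow.hooks.S3_hook.S3Hook' instead):

import unittest
from unittest import mock

from airflow.sensors.s3_key_sensor import S3KeySensor


class TestS3KeySensorPoke(unittest.TestCase):
    # The patch target below is an assumption; adjust it to wherever S3Hook
    # is actually imported in your Airflow version.
    @mock.patch('airflow.sensors.s3_key_sensor.S3Hook')
    def test_poke(self, mock_hook):
        s = S3KeySensor(task_id='s3_key_sensor',
                        bucket_key='s3://test_bucket/file')
        mock_hook.return_value.check_for_key.return_value = True
        self.assertTrue(s.poke(None))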
Example #5
def get_check_wat_file_in_s3_sensor(dag, aws_conn_id):
    return S3KeySensor(
        task_id="check_for_wat_file",
        retries=0,
        aws_conn_id=aws_conn_id,
        bucket_name="commoncrawl",
        bucket_key=f"crawl-data/{_get_cc_index_template()}/wat.paths.gz",
        poke_interval=60,
        timeout=60 * 60 * 24 * 3,
        soft_fail=True,
        mode="reschedule",
    )
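The factory above only builds the sensor; a hedged usage sketch of attaching it to a DAG through the context manager (the DAG id, schedule, start date and connection id are illustrative, and the module is assumed to also provide _get_cc_index_template):

from datetime import datetime

from airflow import DAG

with DAG('commoncrawl_wat_pipeline',
         start_date=datetime(2020, 1, 1),
         schedule_interval='@daily') as dag:
    check_wat_file = get_check_wat_file_in_s3_sensor(dag, aws_conn_id='aws_default')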
Example #6
    def test_poke_wildcard(self, mock_hook):
        op = S3KeySensor(task_id='s3_key_sensor',
                         bucket_key='s3://test_bucket/file',
                         wildcard_match=True)

        mock_check_for_wildcard_key = mock_hook.return_value.check_for_wildcard_key
        mock_check_for_wildcard_key.return_value = False
        self.assertFalse(op.poke(None))
        mock_check_for_wildcard_key.assert_called_once_with(
            op.bucket_key, op.bucket_name)

        mock_check_for_wildcard_key.return_value = True
        self.assertTrue(op.poke(None))
Example #7
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}"
    ],
    dag=dag)

create_job_flow_file = PythonOperator(
    task_id="create_job_flow_file",
    python_callable=create_job_flow_file,
    op_args=[
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}"
    ],
    dag=dag)

file_sensor = S3KeySensor(
    task_id='recap_cntrl_file_sensor',
    poke_interval=60,  # (seconds); checking file every 60 seconds
    timeout=60 * 60 * 18,  # timeout in 18 hours
    bucket_key="s3://vivek-mathew/recap-cntrl.txt",
    bucket_name=None,
    wildcard_match=False,
    dag=dag)

terminate_cluster = EmrTerminateJobFlowOperator(
    task_id="terminate_cluster",
    job_flow_id=
    "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

recap_file_delete = S3DeleteObjectsOperator(task_id="delete_recap_cntrl_file",
                                            bucket="vivek-mathew",
                                            keys="recap-cntrl.txt",
                                            dag=dag)
Example #8
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.sensors.s3_key_sensor import S3KeySensor
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 25),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello, it should work" > file-to-watch-1.txt',
    dag=dag)

sensor = S3KeySensor(task_id='check_s3_for_file_in_s3',
                     bucket_key='/file-to-watch-*',
                     wildcard_match=True,
                     bucket_name='airflow-s3log',
                     aws_conn_id='my_conn_S3',
                     timeout=18 * 60 * 60,
                     poke_interval=120,
                     dag=dag)

t1.set_upstream(sensor)
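set_upstream is equivalent to the bitshift syntax used in the other examples; the same dependency can be written as:

sensor >> t1  # the S3 sensor runs first, then the bash task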
Example #9
    options = ['branch_prod', 'branch_test', 'branch_dev']
    return random.choice(options)


args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 4, 8)
}

dag = DAG(dag_id='airflow_demo', default_args=args, schedule_interval=None)

file_sensor = S3KeySensor(task_id='check_s3_for_file',
                          bucket_key=s3_file_trigger,
                          wildcard_match=True,
                          bucket_name=s3_bucket,
                          timeout=18 * 60 * 60,
                          poke_interval=30,
                          dag=dag)

job_dl_file = PythonOperator(task_id='download_file',
                             provide_context=False,
                             python_callable=download_file.run,
                             op_kwargs={
                                 'bucket': s3_bucket,
                                 'file': s3_file
                             },
                             dag=dag)

job_1_move_files = PythonOperator(task_id='move_the_files',
                                  provide_context=False,
Example #10
args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(0),
    'depends_on_past': False,
}

dag = DAG(dag_id='s3_key_sensor_demo_dag',
          schedule_interval=schedule,
          default_args=args)


def new_file_detection(**kwargs):
    print("A new file has arrived in s3 bucket")


file_sensor = S3KeySensor(
    task_id='s3_key_sensor_task',
    poke_interval=60 * 30,  # (seconds); checking file every half an hour
    timeout=60 * 60 * 12,  # timeout in 12 hours
    bucket_key="s3://[bucket_name]/[key]",
    bucket_name=None,
    wildcard_match=False,
    dag=dag)

print_message = PythonOperator(task_id='print_message',
                               provide_context=True,
                               python_callable=new_file_detection,
                               dag=dag)

file_sensor >> print_message
Example #11
from datetime import datetime

from airflow.sensors.s3_key_sensor import S3KeySensor
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
from airflow.models import DAG

args = {
    'owner': 'Adil',
    'start_date': datetime(2019, 6, 20),
    'retries': 1,
}

with DAG(dag_id='nyc_taxi_to_redshift',
         default_args=args,
         schedule_interval=None) as dag:
    wait_for_s3_file = S3KeySensor(task_id='wait_for_s3_file',  # task_id is required
                                   bucket_name='mktg-redshift-exchange',
                                   bucket_key='nyc-taxi/temp-taxi-data',
                                   wildcard_match=False,
                                   dag=dag)
    upload_to_redshift = S3ToRedshiftTransfer(
        task_id='upload_to_redshift',
        schema='public',
        table='temp-taxi-data',
        s3_bucket='mktg-redshift-exchange',
        s3_key='nyc-taxi',
        copy_options=['CSV', 'IGNOREHEADER 2'])

    wait_for_s3_file >> upload_to_redshift
Example #12
def terminate_instance(**context):
    pass


with DAG(dag_id="cluster_manager", schedule_interval="@daily",
         catchup=False) as dag:

    get_latest_ami_id = PythonOperator(task_id="get_latest_ami",
                                       python_callable=get_latest_ami,  # pass the callable, not its return value
                                       provide_context=True)

    create_instance = PythonOperator(task_id="create_instance",
                                     python_callable=create_instance)

    #create_cluster = EmrCreateJobFlowOperator(
    #    task_id="create_cluster"
    #)

    # NOTE: python_callable (for the PythonOperator) and bucket_key (for the
    # S3KeySensor) still need to be supplied before this DAG will import.
    create_cluster_ctrl_file = PythonOperator(
        task_id="create_cluster_ctrl_file")

    check_recap_ctrl_file = S3KeySensor(task_id="check_recap_ctrl_file")

    terminate_instance = PythonOperator(task_id="terminate_instance",
                                        python_callable=terminate_instance)

    #terminate_cluster = EmrTerminateJobFlowOperator(
    #    task_id="terminate_cluster"
    #)
Example #13
        'depends_on_past': False,
        'start_date': datetime.utcnow(),
        'retries': 1,
        'retry_delay': timedelta(minutes=5)
}

with DAG(dag_id = "s3_snowflake_slack_pipeline", 
         default_args = DAG_DEFAULT_ARGS,
         schedule_interval = "0 8 * * *", 
         catchup = False
         ) as dag:
    
    file_sensor = S3KeySensor(task_id = 's3_key_sensor_task',
                              poke_interval = 60 * 30, 
                              timeout = 60 * 60 * 12, 
                              bucket_key = "s3://%s/%s" % (bucket_name, key_name),  # placeholder variables for your bucket and key
                              bucket_name = None,
                              wildcard_match = False,
                              on_failure_callback = failure_slack_message,
                              dag = dag)
    
    upload_file = PythonOperator(task_id = "upload_to_snowflake_task",
                                 python_callable = upload_to_snowflake,
                                 on_failure_callback = failure_slack_message,
                                 dag = dag)
    
    completion_slack_message = PythonOperator(task_id = "completion_slack_message_task",
                                              python_callable = completion_slack_message,
                                              on_failure_callback = failure_slack_message,
                                              dag = dag)
    
    file_sensor >> upload_file >> completion_slack_message
Example #14
        task_id="stage_adlogs_{}_hourly".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=STAGE_ADLOGS_HOURLY_QUERY,
        params={"env": ENV},
        autocommit=True,
        trigger_rule='all_done',
        dag=DAG)
    if table in JOB_ARGS["int_tables"]:  #conditions for specific tables
        stage_adlogs_hourly_job >> stage_int_tables
    elif table == 'seg':
        stage_adlogs_hourly_job >> stage_onetag_table
    else:
        manifest_bucket_key = os.path.join(
            JOB_ARGS["mii_table_path"].format(table), DATEHOUR_PATH,
            JOB_ARGS["mii_manifest_path"])
        check_for_logs_job = S3KeySensor(
            task_id="{}_logs_all_present".format(table),
            aws_conn_id=AWS_CONN_ID,
            bucket_name=AWS_BUCKET_NAME,
            bucket_key=manifest_bucket_key,
            wildcard_match=True,
            retries=0,
            execution_timeout=timedelta(minutes=5),
            dag=DAG)
        check_for_logs_job >> stage_adlogs_hourly_job >> stage_finish

stage_onetag_table >> stage_onetag_hourly_job >> stage_finish
stage_int_tables >> stage_int_hourly_job >> stage_finish
Example #15
    python_callable=process_log_files,
    provide_context=True,
    templates_dict={
        'filepath': FILEPATH,
        'dy': DY,
        'dm': DM,
        'dd': DD,
        'dh': DH
    },
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key=FILEPATH + "/*.log",
    wildcard_match=True,
    bucket_name='bountiesapilog',
    s3_conn_id='bounties_s3',
    timeout=12 * 60 * 60,  # 12h timeout for poking
    poke_interval=5 * 60,  # 5m between pokes
    dag=dag)

clear_partitions = PostgresOperator(task_id='clear_partitions',
                                    postgres_conn_id='postgres_data_warehouse',
                                    sql=sql_truncate_table_command % {
                                        'y': DY,
                                        'm': DM,
                                        'd': DD,
                                        'h': DH
                                    },
                                    dag=dag)
Example #16
code_map = eval(Variable.get("sys_flag"))

# check whether the storage flag is ufile (CDH environment)
if code_map["id"].lower() == "ufile":
    dim_oride_driver_base_task = UFileSensor(
        task_id='dim_oride_driver_base_task',
        filepath='{hdfs_path_str}/country_code=nal/dt={pt}/_SUCCESS'.format(
            hdfs_path_str='oride/oride_dw/dim_oride_driver_base', pt='{{ds}}'),
        bucket_name='opay-datalake',
        poke_interval=60,
        dag=dag)

    dwd_oride_order_base_include_test_di_task = S3KeySensor(
        task_id='dwd_oride_order_base_include_test_di_task',
        bucket_key='{hdfs_path_str}/country_code=NG/dt={pt}/_SUCCESS'.format(
            hdfs_path_str="oride/oride_dw/dwd_oride_order_base_include_test_di",
            pt='{{ds}}'),
        bucket_name='opay-bi',
        poke_interval=60,
        dag=dag)

    dwd_oride_driver_data_group_df_task = UFileSensor(
        task_id='dwd_oride_driver_data_group_df_task',
        filepath='{hdfs_path_str}/country_code=nal/dt={pt}/_SUCCESS'.format(
            hdfs_path_str="oride/oride_dw/dwd_oride_driver_data_group_df",
            pt='{{ds}}'),
        bucket_name='opay-datalake',
        poke_interval=60,
        dag=dag)
    # path
    hdfs_path = "ufile://opay-datalake/oride/oride_dw/" + table_name
else:
Example #17
args = {
    'owner': 'airflow',
    'start_date': days_ago(1),
    'depends_on_past': False,
}
s3dag = DAG(dag_id='s3_sensor',
            schedule_interval=schedule,
            default_args=args,
            concurrency=1,
            max_active_runs=1)


def new_file_detection(**kwargs):
    print("A new file has arrived in s3 bucket")


file_sensor = S3KeySensor(
    task_id='s3_key_sensor_task',
    poke_interval=10,  # (seconds); checking for the file every 10 seconds
    timeout=60 * 5,  # timeout after 5 minutes
    bucket_key="s3://gpongracz/hello1.csv",
    bucket_name=None,
    wildcard_match=False,
    dag=s3dag)

print_message = PythonOperator(task_id='print_message',
                               provide_context=True,
                               python_callable=new_file_detection,
                               dag=s3dag)

file_sensor >> print_message
Example #18
#         key=key,
#         bucket_name=bucket_name)

# Using the context manager allows you not to duplicate the dag parameter in each operator
with DAG(MAIN_DAG_NAME, default_args=default_args,
         schedule_interval='@once') as main_dag:

    # Parameters required in S3:
    # 1. bucket_name: you need to create a bucket in your S3 first
    # 2. bucket_key (used as a prefix/pattern): for example, if your file is called datafile.py, the key can be a wildcard pattern such as dat*
    # 3. aws_conn_id: you need to create this connection via the Airflow UI first

    inputsensor = S3KeySensor(task_id='check_s3_for_file_in_s3',
                              bucket_key='dat*',
                              wildcard_match=True,
                              bucket_name='your-bucket-name',
                              aws_conn_id='s3conn',
                              timeout=18 * 60 * 60,
                              poke_interval=30,
                              dag=main_dag)

    # Parameters required in S3:
    # 1. bucket_name: You need to create a bucket in your S3 first
    # 2. key : a file called 'datafile.py' should be present in your bucket

    download_file_from_S3_task = PythonOperator(
        task_id='download_file_from_S3',
        depends_on_past=True,
        python_callable=download_file_from_s3,
        op_kwargs={
            'filename':
            '/tmp/airflow/datafile.py',  # this will store it in a temp location created by default
Example #19
        f'athena_redshift_ingestor_dag_{GlobalArgs.DEMO_SUFFIX}',
        'miztiik_automation'
    ],
)

pull_files_to_s3_tsk = PythonOperator(task_id="pull_files_to_s3_tsk",
                                      python_callable=fetch_files,
                                      provide_context=True)

check_s3_for_key_tsk = S3KeySensor(
    task_id='check_s3_for_key',
    depends_on_past=False,
    timeout=20,
    poke_interval=5,
    soft_fail=True,
    # bucket_key=f"{GlobalArgs.S3_RAW_DATA_PREFIX}/movie_ratings_*",
    bucket_key=
    f"{GlobalArgs.S3_RAW_DATA_PREFIX}/dt={datetime.datetime.now().strftime('%Y_%m_%d')}/{GlobalArgs.S3_KEY_NAME}",
    bucket_name=GlobalArgs.S3_BKT_NAME,
    wildcard_match=True,
    s3_conn_id='aws_default',
    dag=redshift_ingestor_dag)

# Task to create Athena Database
create_athena_database_movie_ratings = AWSAthenaOperator(
    task_id="create_athena_database_movie_ratings",
    query=CREATE_ATHENA_DATABASE_MOVIES_QUERY,
    database=GlobalArgs.ATHENA_DB,
    output_location=
    f"s3://{GlobalArgs.S3_BKT_NAME}/{GlobalArgs.ATHENA_RESULTS}/create_athena_database_movie_ratings"
)
Example #20
          catchup=False)

stage_finish = DummyOperator(task_id="adlogs_snowflake_staging_finish")

# staging ad logs hourly
for table in JOB_ARGS["tables"]:

    manifest_key = os.path.join(JOB_ARGS["manifest_path_base"],
                                JOB_ARGS["{}_manifest_name".format(table)],
                                DATEHOUR_PATH, JOB_ARGS["manifest_filename"])

    check_manifest_job = S3KeySensor(
        task_id="{}_logs_all_present".format(table),
        depends_on_past=True,
        aws_conn_id=AWS_CONN_ID,
        bucket_name=AWS_BUCKET_NAME,
        bucket_key=manifest_key,
        wildcard_match=True,
        retries=0,
        dag=DAG)

    stage_sql_path = os.path.join(JOB_ARGS["stage_sql_path"], table)

    query_log = SqlUtils.load_query(stage_sql_path).split("---")

    stage_adlogs_hourly_job = SnowflakeOperator(
        task_id="stage_logs_{}_hourly".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=query_log,
)
Example #21
##----------------------------------------- variables ---------------------------------------##
db_name = "oride_dw"
table_name = "dwm_oride_passenger_act_w"
##----------------------------------------- dependencies ---------------------------------------##
# fetch the sys_flag variable
code_map = eval(Variable.get("sys_flag"))

# check whether the storage flag is ufile (CDH environment)
if code_map["id"].lower() == "ufile":
    dependence_dwd_oride_order_base_include_test_di_prev_day_task = S3KeySensor(
        task_id='dwd_oride_order_base_include_test_di_prev_day_task',
        bucket_key='{hdfs_path_str}/dt={pt}/_SUCCESS'.format(
            hdfs_path_str=
            "oride/oride_dw/dwd_oride_order_base_include_test_di/country_code=NG",
            pt='{{ds}}'),
        bucket_name='opay-bi',
        poke_interval=60,  # if the upstream data is not ready, re-check once a minute
        dag=dag)
    # path
    hdfs_path = "ufile://opay-datalake/oride/oride_dw/" + table_name
else:
    print("success")
    dependence_dwd_oride_order_base_include_test_di_prev_day_task = OssSensor(
        task_id='dwd_oride_order_base_include_test_di_prev_day_task',
        bucket_key='{hdfs_path_str}/dt={pt}/_SUCCESS'.format(
            hdfs_path_str=
            "oride/oride_dw/dwd_oride_order_base_include_test_di/country_code=NG",
            pt='{{ds}}'),
        bucket_name='opay-datalake',
Example #22
S3_BUCKET_NAME = 'ofss-compute-aws'
INPUT_DIR = 'market_data/in'
# CURR_DATE_YYYYMMDD = datetime.now().strftime('%Y%m%d')
CURR_DATE_YYYYMMDD = '20200508'


def trigger_data_formatter(**context):
    files_list = s3_helper.get_file_list(
        S3_BUCKET_NAME, f'{INPUT_DIR}/ticker*data*{CURR_DATE_YYYYMMDD}.csv')
    if files_list:
        for in_file in files_list:
            print(in_file)


with DAG('aws_dp_poc', default_args=default_args,
         schedule_interval=None) as dag:
    in_file_sensor = S3KeySensor(
        task_id='in_file_sensor',
        bucket_key=f'{INPUT_DIR}/ticker*data*{CURR_DATE_YYYYMMDD}.csv',
        wildcard_match=True,
        bucket_name=f'{S3_BUCKET_NAME}',
        aws_conn_id=S3_CONNECTION_ID,
        poke_interval=60 * 2,  # (seconds); checking file every 2 minutes
        timeout=60 * 4,  # timeout in 4 minutes
    )

    trigger_transform = PythonOperator(task_id='trigger_transform',
                                       python_callable=trigger_data_formatter)

    in_file_sensor >> trigger_transform
Example #23
          schedule_interval='*/30 * * * *',
          user_defined_macros={
              "get_half_hour": get_half_hour,
              "get_s3key_location": get_s3key_location
          })

sensor = S3KeySensor(
    task_id='s3keysensor',
    poke_interval=20,
    timeout=10,
    soft_fail=True,
    wildcard_match=True,
    bucket_key=
    's3://useast1-nlsn-cfn-dataload-univloader-dataloader-qa9-nonprod/{{ get_s3key_location(execution_date, "/*/_SUCCESS") }}',
    bucket_name=None,
    aws_conn_id='aws_default',
    executor_config={
        "KubernetesExecutor": {
            "annotations": {
                "iam.amazonaws.com/role":
                "arn:aws:iam::407121322241:role/alexk8-test-cluster-s3"
            }
        }
    },
    dag=dag)
check_file = KubernetesPodOperator(
    in_cluster=True,
    namespace='airflow-blue',
    service_account_name='mike',
    image="hmike96/checksuccess:0.0.1",
    cmds=["/bin/sh", "-c"],
Example #24
    'process_data', default_args=default_args, schedule_interval=None)

ssh_dbt_run = SSHOperator(
        task_id='ssh_dbt_run',
        ssh_conn_id="ssh_ec2",
        command=dbt_run_model,
        do_xcom_push=True,
        dag=dag
    )

for sensor_file in source_data:
    sensor = S3KeySensor(
            aws_conn_id='s3_conection',
            task_id='sensor_s3_file_{0}'.format(sensor_file['data']),
            bucket_key='{0}_temp/{1}{2}'.format(sensor_file['data'], sensor_file['data'], '.csv'),
            wildcard_match=True,
            bucket_name=buket,
            timeout=18 * 60 * 60,
            poke_interval=60,
            dag=dag)

    ssh_download = SSHOperator(
        task_id='ssh_download_{0}'.format(sensor_file['data']),
        ssh_conn_id="ssh_ec2",
        command=aws_download.format(sensor_file['data'], sensor_file['data']),
        do_xcom_push=True,
        dag=dag
    )

    validate_remove_caracteres = SSHOperator(
        task_id='ssh_validate_remove_caracteres_{0}'.format(sensor_file['data']),
Example #25
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(task_id='templated',
                  bash_command=templated_command,
                  params={'my_param': 'Parameter I passed in'},
                  dag=dag)

s3_sensor = S3KeySensor(task_id='s3_key_sensor',
                        bucket_key='some_folder/input/*',
                        wildcard_match=True,
                        bucket_name=AWS_BUCKET_NAME,
                        timeout=10,
                        poke_interval=60,
                        dag=dag)

email_op = EmailOperator(task_id='send_email',
                         subject='test email',
                         to='*****@*****.**',
                         html_content='TEXT CONTENT',
                         dag=dag)

fail_op = BashOperator(task_id='fail_op', bash_command='exit 1', dag=dag)

s3_sensor >> t1 >> t3 >> fail_op

#t3 >> email_op
Example #26
        }
    },
    'query': "SELECT * FROM table_four"
}]
with DAG(dag_id='adb_pipeline',
         default_args=args,
         start_date=datetime(2019, 1, 1),
         schedule_interval='30 4 * * *',
         catchup=False) as dag:

    t1 = DummyOperator(task_id='kick_off_dag')

    t2 = S3KeySensor(task_id='check_for_file',
                     bucket_key='globetelecom/copy_*',
                     poke_interval=45,
                     timeout=600,
                     wildcard_match=True,
                     bucket_name=BUCKET,
                     aws_conn_id=S3_CONN_ID)

    for job in job_info:
        spark = DatabricksRunNowOperator(task_id=job['job_id'],
                                         job_id=job['job_id'],
                                         json=job['config'])

        query = PostgresOperator(task_id='post_{0}_query'.format(
            job['job_id']),
                                 sql=job['query'],
                                 postgres_conn_id='prod_postgres')
        t1 >> t2 >> spark >> query
Example #27
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('pipeline', default_args=default_args)

t0 = FileSensor(
    task_id="local_file_test",
    filepath="/etc/hosts",
    fs_conn_id='fs_default',
    dag=dag)

t1 = S3KeySensor(
    task_id='s3_file_test',
    poke_interval=0,
    timeout=10,
    soft_fail=True,
    bucket_key='s3://dev.canopydata.com/airflow/example_qubole_operator.py',
    bucket_name=None,
    dag=dag)

t2 = BashOperator(
    task_id='task1',
    depends_on_past=False,
    bash_command='echo start',
    dag=dag)

t3 = BashOperator(
    task_id='task2',
    depends_on_past=False,
    bash_command='cat /etc/hosts',
    trigger_rule='all_success',
Example #28
                    'tdy':TDY, 'tdm':TDM, 'tdd':TDD},
    dag=dag
)

load_logs_to_postgres = PythonOperator(
    task_id='parse_logs_and_load_to_staging',
    python_callable=process_log_files,
    provide_context=True,
    templates_dict={'filepath':FILEPATH, 'dy':DY, 'dm':DM, 'dd':DD, 'dh':DH},
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key=FILEPATH +"/*.log",
    wildcard_match=True,
    bucket_name='bountiesapilog',
    s3_conn_id='bounties_s3',
    timeout=18*60*60,
    poke_interval=120,
    dag=dag)

clear_partitions = PostgresOperator(
    task_id='clear_partitions',
    postgres_conn_id='postgres_data_warehouse',
    sql=sql_truncate_table_command % {'y':DY, 'm':DM, 'd':DD, 'h':DH},
    dag=dag
)

create_partitions = PostgresOperator(
    task_id="create_table",
    postgres_conn_id='postgres_data_warehouse',
Example #29
s3_backup_volume_config = {
    "persistentVolumeClaim": {
        "claimName": "s3-backup-volume"
    }
}

s3_backup_volume = Volume(name="s3-backup-volume",
                          configs=s3_backup_volume_config)

with dag:
    START = DummyOperator(task_id="start")

    # Wait for S3 Key
    S3_BACKUP_SENSE = S3KeySensor(
        task_id="s3-backup-sense",
        poke_interval=60 * 30,
        bucket_key=S3_KEY,
        aws_conn_id="aws_nci_db_backup",
    )

    # Download NCI db incremental backup from S3 and restore to RDS Aurora
    RESTORE_NCI_INCREMENTAL_SYNC = KubernetesPodOperator(
        namespace="processing",
        image=S3_TO_RDS_IMAGE,
        image_pull_policy="Always",
        annotations={"iam.amazonaws.com/role": NCI_DBSYNC_ROLE},
        cmds=["./import_from_s3.sh"],
        secrets=SECRET_RESTORE_INCREMENTAL_SYNC,
        labels={"step": "nci-db-restore-incremental-sync"},
        name="nci-db-restore-incremental-sync",
        task_id="nci-db-restore-incremental-sync",
        get_logs=True,
Example #30
dag = DAG(dag_id='gdelt_redshift',
          default_args=default_args,
          schedule_interval='@daily',
          default_view='graph',
          max_active_runs=1)

# filename example - YYYYMMDD
bucket_key_template = 's3://gdelt-open-data/events/{}.export.csv'.format(
    '{{ yesterday_ds_nodash }}')

file_sensor = S3KeySensor(
    task_id='s3_key_sensor_task',
    poke_interval=60 * 30,  # (seconds); checking file every half an hour
    timeout=60 * 60 * 36,  # timeout in 36 hours
    bucket_key=bucket_key_template,
    bucket_name=None,
    wildcard_match=False,
    aws_conn_id='conn_aws_s3',
    dag=dag)

success_bucket = BashOperator(task_id='success_key_sensor',
                              bash_command='echo "{{ yesterday_ds_nodash }}"',
                              dag=dag)

s3_to_stage = S3ToRedshiftTransfer(task_id='s3_to_stage',
                                   schema='staging',
                                   table='event',
                                   bucket_key=bucket_key_template,
                                   redshift_conn_id='conn_aws_redshift',
                                   aws_conn_id='conn_aws_s3',