Example #1
def delete_s3_key_files_subdag(parent_dag_name, child_dag_name, start_date,
                               s3_bucket, s3_key, aws_credentials):
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        description='Delete all S3 files in the provided key.',
        start_date=start_date,
        schedule_interval=None,
        catchup=False,
    )

    list_s3_processed_s3_files = S3ListOperator(
        task_id='list_s3_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        prefix=s3_key,
        aws_conn_id=aws_credentials,
    )

    delete_processed_s3_files = S3DeleteFromContextOperator(
        task_id='delete_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        context_task_id='list_s3_processed_s3_files',
        aws_conn_id=aws_credentials,
    )

    chain(list_s3_processed_s3_files, delete_processed_s3_files)

    return dag
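Note that S3DeleteFromContextOperator above is a project-specific operator, not one of Airflow's stock S3 operators. A minimal sketch of what it might look like, assuming it pulls the key list that the listing task pushed to XCom and deletes those keys with S3Hook (only the class name comes from the example; the body, import paths, and the delete_objects call are assumptions based on Airflow 1.10-era APIs):

# Hedged sketch of a custom operator like the one used above; the body and
# imports are assumptions, not the original project's implementation.
from airflow.hooks.S3_hook import S3Hook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class S3DeleteFromContextOperator(BaseOperator):
    """Delete the S3 keys that an upstream task pushed to XCom (sketch)."""

    @apply_defaults
    def __init__(self, bucket, context_task_id, aws_conn_id='aws_default',
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bucket = bucket
        self.context_task_id = context_task_id
        self.aws_conn_id = aws_conn_id

    def execute(self, context):
        # S3ListOperator returns the key list from execute(), so it lands in XCom.
        keys = context['ti'].xcom_pull(task_ids=self.context_task_id)
        if keys:
            S3Hook(aws_conn_id=self.aws_conn_id).delete_objects(
                bucket=self.bucket, keys=keys)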
Example #2
    def _get_test_dag(self):
        with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
            op1 = SparkSubmitOperator(task_id='op1')
            op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
            op3 = S3ListOperator(task_id='op3', bucket='foo')
            op4 = EmrCreateJobFlowOperator(task_id='op4')
            op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
            op6 = FileToWasbOperator(task_id='op6',
                                     container_name='foo',
                                     blob_name='foo',
                                     file_path='foo')
            op7 = EmailOperator(task_id='op7',
                                subject='foo',
                                to='foo',
                                html_content='foo')
            op8 = S3CopyObjectOperator(task_id='op8',
                                       dest_bucket_key='foo',
                                       source_bucket_key='foo')
            op9 = BranchPythonOperator(task_id='op9', python_callable=print)
            op10 = PythonOperator(task_id='op10', python_callable=range)

            op1 >> [op2, op3, op4]
            op2 >> [op5, op6]
            op6 >> [op7, op8, op9]
            op3 >> [op7, op8]
            op8 >> [op9, op10]

        return dag
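The helper above assumes a DEFAULT_DAG_ARGS dict defined elsewhere in the test module; a minimal, hypothetical placeholder would be:

# Hypothetical placeholder for the DEFAULT_DAG_ARGS assumed by _get_test_dag;
# the real test module defines its own values.
from datetime import datetime

DEFAULT_DAG_ARGS = {
    'owner': 'airflow',
    'start_date': datetime(2019, 1, 1),
}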
Example #3
    def test_execute(self, mock_hook):

        mock_hook.return_value.list_keys.return_value = MOCK_FILES

        operator = S3ListOperator(
            task_id=TASK_ID, bucket=BUCKET, prefix=PREFIX, delimiter=DELIMITER)

        files = operator.execute(None)

        mock_hook.return_value.list_keys.assert_called_once_with(
            bucket_name=BUCKET, prefix=PREFIX, delimiter=DELIMITER)
        self.assertEqual(sorted(files), sorted(MOCK_FILES))
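The test method is shown without its surrounding scaffolding. A hedged sketch of the constants and hook patching it appears to rely on (the patch target and all constant values are guesses based on the contrib import path used elsewhere in these examples):

# Plausible scaffolding for the test above; constants and the S3Hook patch
# target are assumptions, not the original test module.
import unittest
from unittest import mock

from airflow.contrib.operators.s3_list_operator import S3ListOperator

TASK_ID = 'test-s3-list-operator'
BUCKET = 'test-bucket'
PREFIX = 'test/prefix'
DELIMITER = '/'
MOCK_FILES = ['test/prefix/file1.csv', 'test/prefix/file2.csv']


class TestS3ListOperator(unittest.TestCase):

    @mock.patch('airflow.contrib.operators.s3_list_operator.S3Hook')
    def test_execute(self, mock_hook):
        ...  # method body as shown in Example #3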
Example #4
    def transform(self, subdag: nx.DiGraph,
                  parent_fragment: DAGFragment) -> DAGFragment:
        subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
        first_root = subdag_roots[0].task_id

        task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

        TestSubDagTransformer1.op1 = SparkSubmitOperator(
            task_id=f"t{task_id_prefix}p1", dag=self.dag)
        TestSubDagTransformer1.op2 = EmrAddStepsOperator(
            task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
        TestSubDagTransformer1.op3 = S3ListOperator(
            task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
        TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
            task_id=f"t{task_id_prefix}p4", dag=self.dag)
        TestSubDagTransformer1.op5 = DummyOperator(
            task_id=f"t{task_id_prefix}p5", dag=self.dag)

        TestSubDagTransformer1.op1 >> [
            TestSubDagTransformer1.op2, TestSubDagTransformer1.op3
        ] >> TestSubDagTransformer1.op4

        return DAGFragment(
            [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
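DAGFragment and the transformer base class come from the library under test, not stock Airflow, but the root-finding idiom on the first line of transform() is plain networkx: nodes with in-degree zero have no upstream dependencies. A standalone illustration:

# Standalone illustration of the root-finding idiom used in transform():
# nodes with in-degree 0 are the entry points of the graph.
import networkx as nx

g = nx.DiGraph()
g.add_edges_from([('op1', 'op2'), ('op1', 'op3'), ('op2', 'op4')])
roots = [n for n, d in g.in_degree() if d == 0]
print(roots)  # ['op1']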
Example #5
with DAG(
        's3client_dag',
        start_date=datetime(2019, 1, 1),
        max_active_runs=3,
        schedule_interval=timedelta(
            minutes=30
        ),  # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs
        default_args=default_args,
        # catchup=False # enable if you don't want historical dag runs to run
) as dag:

    t0 = DummyOperator(task_id='start')

    t1 = S3ListOperator(task_id='list_s3_files',
                        bucket='datalake-nonprod-raw',
                        prefix='S3Upload/dwh5013-prefijos',
                        delimiter='/',
                        aws_conn_id='my_aws')

    t2 = PythonOperator(task_id='python_files',
                        python_callable=route_on_attribute)

    t3 = DummyOperator(task_id='end')
    # t2 = S3CopyObjectOperator(
    #     source_bucket_key='source_file',
    #     dest_bucket_key='rfmtest',
    #     aws_conn_id='my_aws',
    #     source_bucket_name='source-bucket',
    #     dest_bucket_name='dest-bucket'
    # )
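route_on_attribute is referenced by the python_files task but not defined in this excerpt. A hypothetical sketch of what such a callable might do, assuming it pulls the keys listed by the list_s3_files task from XCom (on Airflow 1.x the PythonOperator would also need provide_context=True for the context kwargs to be passed in):

# Hypothetical sketch; the original route_on_attribute is not shown above.
def route_on_attribute(**context):
    keys = context['ti'].xcom_pull(task_ids='list_s3_files') or []
    for key in keys:
        print(f'would route S3 key: {key}')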
from airflow.models import DAG
from airflow.contrib.operators.s3_list_operator import S3ListOperator
from datetime import datetime

with DAG(dag_id='s3_list_bucket',
         schedule_interval=None,
         start_date=datetime(2019, 6, 7)) as dag:

    s3_file = S3ListOperator(task_id='list_s3_files',
                             bucket='airflow-dag-test-bucket',
                             prefix='test',
                             delimiter='/',
                             aws_conn_id='aws_default')
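S3ListOperator returns the matching keys from execute(), so they are pushed to XCom automatically. A hedged sketch of a downstream consumer for the DAG above (the PythonOperator import and the print_keys task are additions for illustration, not part of the original example):

# Illustrative downstream consumer; not part of the original example.
from airflow.operators.python_operator import PythonOperator


def _print_keys(**context):
    keys = context['ti'].xcom_pull(task_ids='list_s3_files')
    print(keys)


print_keys = PythonOperator(task_id='print_keys',
                            python_callable=_print_keys,
                            provide_context=True,
                            dag=dag)

s3_file >> print_keys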
        dag_path = os.path.join(dag_folder, dag_filename)
        logging.info(f"Write DAG: '{new_dag}'")
        logging.info(f"Write new DAG to {dag_bucket} at {dag_path}")
        hook = GCSHook()
        hook.upload(bucket_name=dag_bucket, object_name=dag_path, data=new_dag)

with models.DAG(
      dag_id=DAG_ID,
      max_active_runs=1,
      default_args=default_dag_args) as dag:
    
    start = PythonOperator(task_id="start_task", python_callable=save_config_variable)
    
    #listing folders from s3 bucket and creating dags based on folder prefix
    folder_list = S3ListOperator(
        task_id='list_s3_folders',
        bucket=s3_bucket
        )
        
    list_folders = folder_list.execute(None)
    file_list = set()
    for i in list_folders:
        file_list.add(i.split('/')[0])

    logging.info(f"List of folders in s3 bucket: '{file_list}'")
    generate = PythonOperator(
          task_id="generate_dag_files",
          task_concurrency=1,
          python_callable=generate_dags,
          op_kwargs={
def load_definition(json_file, **context):
    """
    Loads the definition file from s3 and remove the required s3 files
    Args:
        json_file: Json definition file to load from s3

    """
    s3 = S3Hook(aws_conn_id='s3_etl')
    file_load = json.loads(
        s3.read_key(bucket_name=Variable.get(
            'sanitization_s3_sanit_def_files_folder').split('/')[0],
                    key=json_file))
    logger.info('Definition file is loaded successfully')

    try:
        dt.datetime.strptime(file_load.get('back_date_from'), '%Y-%m-%d')
        remove_s3_task = S3ListOperator(
            task_id='remove_s3',
            bucket=Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0],
            prefix='___SCHEMA___' + '/' + '___TABLE_NAME___' + '/',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            aws_conn_id='s3_etl',
            startafter='___SCHEMA___' + '/' + '___TABLE_NAME___' +
            '/batch_date=' + file_load.get('back_date_from') + '/',
            retries=3)

        s3_keys = remove_s3_task.execute(context=context)

        if s3_keys:
            delete_s3_list = [
                s3_keys[file:file + int(dag_config["delete_key_chunk_size"])]
                for file in range(0, len(s3_keys),
                                  int(dag_config["delete_key_chunk_size"]))
            ]
            for key_chunk in delete_s3_list:
                s3.delete_objects(bucket=Variable.get(
                    'sanitization_s3_sanit_def_files_folder').split('/')[0],
                                  keys=key_chunk)

            delete_rows_task = ABCRedshiftOperator(
                task_id='delete_rows',
                source_name='___SCHEMA___',
                redshift_conn_id='snowplow_redshift',
                sql=
                'delete from {table_name} where batch_date >= {back_date_from};'
                .format(table_name=file_load.get('schema') + '.' +
                        file_load.get('table_name'),
                        back_date_from=file_load.get('back_date_from')),
                retries=3)

            delete_rows_task.execute(context=context)
        else:
            logger.info(
                'S3 and table are already backdated from the desired date!')

    except ValueError:
        logger.warning(
            "Incorrect data format, should be YYYY-MM-DD. No keys will be deleted!!"
        )

    context['task_instance'].xcom_push(key='query',
                                       value=file_load.get('query'))
    context['task_instance'].xcom_push(key='schema',
                                       value=file_load.get('schema'))
    context['task_instance'].xcom_push(key='table_name',
                                       value=file_load.get('table_name'))
    context['task_instance'].xcom_push(key='start_date',
                                       value=file_load.get('start_date'))
    context['task_instance'].xcom_push(key='table_columns',
                                       value=file_load.get('table_columns'))
    context['task_instance'].xcom_push(key='back_date_from',
                                       value=file_load.get('back_date_from'))
    context['task_instance'].xcom_push(key='batch_size',
                                       value=file_load.get('batch_size'))
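load_definition builds a throwaway S3ListOperator inside the callable just to call execute() on it. Listing can also be done directly with S3Hook, which the function already uses for reading and deleting; a hedged sketch under the same connection and Variable layout (the helper name and the lexicographic filter standing in for startafter are assumptions):

# Hedged alternative sketch: list keys directly with S3Hook instead of
# instantiating an operator inside the callable. The helper name and the
# lexicographic filter that mimics `startafter` are assumptions.
from airflow.hooks.S3_hook import S3Hook
from airflow.models import Variable


def list_backdated_keys(prefix, back_date_from, aws_conn_id='s3_etl'):
    bucket = Variable.get('sanitization_s3_sanit_def_files_folder').split('/')[0]
    keys = S3Hook(aws_conn_id=aws_conn_id).list_keys(
        bucket_name=bucket, prefix=prefix) or []
    cutoff = prefix + 'batch_date=' + back_date_from + '/'
    return [key for key in keys if key > cutoff]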
         schedule_interval='___SCHEDULE_INTERVAL___',
         max_active_runs=1) as main_dag:
    doc_md = __doc__

    load_definition_task = PythonOperator(task_id='load_definition',
                                          python_callable=load_definition,
                                          op_args=['___TEMPLATE_JSON___'],
                                          trigger_rule=TriggerRule.ALL_SUCCESS,
                                          provide_context=True,
                                          retries=3)

    list_s3_task = S3ListOperator(
        task_id='list_s3',
        bucket=Variable.get('sanitization_s3_sanit_def_files_folder').split(
            '/')[0],
        prefix='___SCHEMA___' + '/' + '___TABLE_NAME___' + '/',
        trigger_rule=TriggerRule.ALL_SUCCESS,
        aws_conn_id='s3_etl',
        startafter='___SCHEMA___' + '/' + '___TABLE_NAME___' + '/batch_date=' +
        LOOK_BACK_DAYS.strftime("%Y-%m-%d") + '/',
        retries=3)

    compute_next_gather_task = BranchPythonOperator(
        task_id='compute_next_gather',
        python_callable=compute_next_gather,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        retries=3)

    create_staging_table_task = ABCRedshiftOperator(
        task_id='create_staging_table',
        source_name='___SCHEMA___',
    Fetch the file with a pattern
    :param kwargs:
    :return:
    """
    print(kwargs)
    xcom_data = kwargs["ti"]
    s3_files_paths_list = xcom_data.xcom_pull(key=None, task_ids="list_s3_files")
    print(s3_files_paths_list)
    if s3_files_paths_list:
        return [path for path in s3_files_paths_list if re.search(s3_file_pattern, path)]




list_s3_files = S3ListOperator(task_id="list_s3_files",
                               dag=dag,
                               aws_conn_id="aws_conn",
                               bucket=src_bucket, prefix=src_prefix)

load_s3_data_mysql = PythonOperator(task_id='load_s3_data_mysql',
                                    dag=dag,
                                    provide_context=True,
                                    python_callable=readS3FilesAndLoadtoMySql,
                                    op_kwargs={'aws_conn_id': aws_conn_id,
                                               'src_bucket': src_bucket,
                                               'mysql_conn': mysql_conn_id,
                                               'schema': schema,
                                               'table': table})
copy_src_files_to_archive = PythonOperator(task_id="copy_src_files_to_archive",
                                           dag=dag,
                                           provide_context=True,
                                           python_callable=archiveS3Files,
                                           op_kwargs={'src_bucket': src_bucket,
                                                      'trg_bucket': archive_bucket,
                                                      'trg_path': archive_path,
                                                      'aws_conn_id': aws_conn_id})
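The regex filter shown above the operator definitions reads the list_s3_files XCom but is never wired into the DAG in this excerpt; a hedged sketch of doing so (fetch_files_matching_pattern is a guessed name for the truncated function, since its def line is cut off):

# Hedged sketch; fetch_files_matching_pattern is a guessed name for the
# truncated filter function shown earlier in this example.
filter_s3_files = PythonOperator(task_id='filter_s3_files',
                                 dag=dag,
                                 provide_context=True,
                                 python_callable=fetch_files_matching_pattern)

list_s3_files >> filter_s3_files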

#delete_s3_files = S3DeleteObjectsOperator(task_id="delete_s3_files",
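The commented-out delete task above is cut off. One way to finish the thought is the stock S3DeleteObjectsOperator (bucket plus an explicit list of keys); since the keys here come from the list_s3_files XCom, a PythonOperator calling S3Hook.delete_objects is a simple alternative. A hedged sketch of the latter (names are illustrative, not from the original project):

# Hedged sketch; names are illustrative and not from the original project.
from airflow.hooks.S3_hook import S3Hook


def deleteS3Files(**kwargs):
    keys = kwargs['ti'].xcom_pull(task_ids='list_s3_files')
    if keys:
        S3Hook(aws_conn_id=aws_conn_id).delete_objects(bucket=src_bucket,
                                                       keys=keys)


delete_s3_files = PythonOperator(task_id='delete_s3_files',
                                 dag=dag,
                                 provide_context=True,
                                 python_callable=deleteS3Files)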
        "bucket_name": AIRFLOW_DATA_BUCKET,
        "harvest_from_date": None,
        "harvest_until_date": None,
        "metadata_prefix": CATALOG_OAI_BW_MD_PREFIX,
        "oai_endpoint": CATALOG_OAI_BW_ENDPOINT,
        "records_per_file": 10000,
        "included_sets": CATALOG_OAI_BW_INCLUDED_SETS,
        "timestamp": "{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/bw"
    },
    dag=DAG
)

LIST_CATALOG_BW_S3_DATA = S3ListOperator(
    task_id="list_catalog_bw_s3_data",
    bucket=AIRFLOW_DATA_BUCKET,
    prefix=DAG.dag_id + "/{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/bw/",
    delimiter="",
    aws_conn_id=AIRFLOW_S3.conn_id,
    dag=DAG
)
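bucket, prefix and delimiter are templated fields on S3ListOperator, which is why the Jinja ti.xcom_pull expression in the prefix above is rendered at run time. An empty delimiter lists every key under the prefix recursively, while '/' stops at the first folder level. A toy illustration with made-up bucket and keys:

# Toy illustration (bucket and keys are made up, not from this DAG):
# delimiter='' returns every key under the prefix, while delimiter='/'
# returns only keys directly under it; deeper "folders" are grouped away.
list_all = S3ListOperator(task_id='list_all', bucket='my-bucket',
                          prefix='raw/', delimiter='', dag=DAG)
# -> e.g. ['raw/summary.txt', 'raw/2019/01/a.xml', 'raw/2019/02/b.xml']
list_top = S3ListOperator(task_id='list_top', bucket='my-bucket',
                          prefix='raw/', delimiter='/', dag=DAG)
# -> e.g. ['raw/summary.txt']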

PREPARE_BOUNDWITHS = PythonOperator(
    task_id='prepare_boundwiths',
    provide_context=True,
    python_callable=prepare_oai_boundwiths,
    op_kwargs={
        "AWS_ACCESS_KEY_ID": AIRFLOW_S3.login,
        "AWS_SECRET_ACCESS_KEY": AIRFLOW_S3.password,
        "BUCKET": AIRFLOW_DATA_BUCKET,
        "DEST_FOLDER": DAG.dag_id + "/{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/lookup.tsv",
        "S3_KEYS": "{{ ti.xcom_pull(task_ids='list_catalog_bw_s3_data') }}",
        "SOURCE_FOLDER": DAG.dag_id + "/{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/bw"
Tasks with custom logic are relegated to individual Python files.
"""

SAFETY_CHECK = PythonOperator(task_id="safety_check",
                              python_callable=helpers.catalog_safety_check,
                              dag=DAG)

SET_S3_NAMESPACE = PythonOperator(task_id="set_s3_namespace",
                                  python_callable=datetime.now().strftime,
                                  op_args=["%Y-%m-%d_%H-%M-%S"],
                                  dag=DAG)

LIST_ALMA_S3_DATA = S3ListOperator(task_id="list_alma_s3_data",
                                   bucket=AIRFLOW_DATA_BUCKET,
                                   prefix=ALMASFTP_S3_PREFIX + "/" +
                                   ALMASFTP_S3_ORIGINAL_DATA_NAMESPACE +
                                   "/alma_bibs__",
                                   delimiter="/",
                                   aws_conn_id=AIRFLOW_S3.conn_id,
                                   dag=DAG)

LIST_BOUNDWITH_S3_DATA = S3ListOperator(task_id="list_boundwith_s3_data",
                                        bucket=AIRFLOW_DATA_BUCKET,
                                        prefix=ALMASFTP_S3_PREFIX +
                                        "/alma_bibs__boundwith",
                                        delimiter="/",
                                        aws_conn_id=AIRFLOW_S3.conn_id,
                                        dag=DAG)

PREPARE_BOUNDWITHS = PythonOperator(
    task_id="prepare_boundwiths",
    provide_context=True,
Example #13
                    "--context_param DATABRICKS_ENDPOINT=XXX",
                    "--context_param DATABRICKS_TOKEN=XXX",
                    "--context_param DATABRICKS_CLUSTER_ID={{ task_instance.xcom_pull(task_ids='create_databricks_cluster') }}"
                ]
            },
        ]
    },
    region_name='us-east-1',
    launch_type='EC2',
    dag=dag)

# define list of lobs we want to run for
# loop through the lob's we want to use to build up our dag

s3_list_files = S3ListOperator(task_id="s3_list_files",
                               bucket="tgourdel-storage",
                               aws_conn_id="aws_default",
                               dag=dag)

s3_list_files.set_upstream(create_cluster_notify)

files = s3_list_files.execute(None)
for x in files:
    run_job = ECSOperator(
        task_id="Copy_%s_to_DBFS" % (x),
        task_definition='uploadtodbfs',
        cluster='TalendECS',
        aws_conn_id='aws_default',
        overrides={
            'containerOverrides': [
                {
                    'name':