def delete_s3_key_files_subdag(parent_dag_name, child_dag_name, start_date,
                               s3_bucket, s3_key, aws_credentials):
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        description='Delete all S3 files in the provided key.',
        start_date=start_date,
        schedule_interval=None,
        catchup=False,
    )

    list_s3_processed_s3_files = S3ListOperator(
        task_id='list_s3_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        prefix=s3_key,
        aws_conn_id=aws_credentials,
    )

    delete_processed_s3_files = S3DeleteFromContextOperator(
        task_id='delete_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        context_task_id='list_s3_processed_s3_files',
        aws_conn_id=aws_credentials,
    )

    chain(list_s3_processed_s3_files, delete_processed_s3_files)

    return dag
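S3DeleteFromContextOperator is not a stock Airflow operator; it appears to be a custom operator that reads the key list the upstream S3ListOperator pushed to XCom and deletes those keys. The following is a minimal sketch of what such an operator might look like, assuming Airflow 1.x import paths and S3Hook.delete_objects; it is an illustration, not the original implementation.

from airflow.hooks.S3_hook import S3Hook  # airflow.providers.amazon.aws.hooks.s3 on Airflow 2.x
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class S3DeleteFromContextOperator(BaseOperator):
    """Hypothetical operator: delete the S3 keys listed by an upstream task."""

    @apply_defaults
    def __init__(self, bucket, context_task_id, aws_conn_id='aws_default',
                 *args, **kwargs):
        super(S3DeleteFromContextOperator, self).__init__(*args, **kwargs)
        self.bucket = bucket
        self.context_task_id = context_task_id
        self.aws_conn_id = aws_conn_id

    def execute(self, context):
        # S3ListOperator returns the matching keys, so Airflow stores them as
        # the upstream task's XCom return value.
        keys = context['ti'].xcom_pull(task_ids=self.context_task_id)
        if keys:
            S3Hook(aws_conn_id=self.aws_conn_id).delete_objects(
                bucket=self.bucket, keys=keys)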
def _get_test_dag(self):
    with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
        op1 = SparkSubmitOperator(task_id='op1')
        op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
        op3 = S3ListOperator(task_id='op3', bucket='foo')
        op4 = EmrCreateJobFlowOperator(task_id='op4')
        op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
        op6 = FileToWasbOperator(task_id='op6', container_name='foo',
                                 blob_name='foo', file_path='foo')
        op7 = EmailOperator(task_id='op7', subject='foo', to='foo',
                            html_content='foo')
        op8 = S3CopyObjectOperator(task_id='op8', dest_bucket_key='foo',
                                   source_bucket_key='foo')
        op9 = BranchPythonOperator(task_id='op9', python_callable=print)
        op10 = PythonOperator(task_id='op10', python_callable=range)

        op1 >> [op2, op3, op4]
        op2 >> [op5, op6]
        op6 >> [op7, op8, op9]
        op3 >> [op7, op8]
        op8 >> [op9, op10]

    return dag
def test_execute(self, mock_hook):
    mock_hook.return_value.list_keys.return_value = MOCK_FILES

    operator = S3ListOperator(
        task_id=TASK_ID, bucket=BUCKET, prefix=PREFIX, delimiter=DELIMITER)

    files = operator.execute(None)

    mock_hook.return_value.list_keys.assert_called_once_with(
        bucket_name=BUCKET, prefix=PREFIX, delimiter=DELIMITER)
    self.assertEqual(sorted(files), sorted(MOCK_FILES))
def transform(self, subdag: nx.DiGraph, parent_fragment: DAGFragment) -> DAGFragment:
    subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
    first_root = subdag_roots[0].task_id
    task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

    TestSubDagTransformer1.op1 = SparkSubmitOperator(
        task_id=f"t{task_id_prefix}p1", dag=self.dag)
    TestSubDagTransformer1.op2 = EmrAddStepsOperator(
        task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
    TestSubDagTransformer1.op3 = S3ListOperator(
        task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
    TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
        task_id=f"t{task_id_prefix}p4", dag=self.dag)
    TestSubDagTransformer1.op5 = DummyOperator(
        task_id=f"t{task_id_prefix}p5", dag=self.dag)

    TestSubDagTransformer1.op1 >> [
        TestSubDagTransformer1.op2,
        TestSubDagTransformer1.op3
    ] >> TestSubDagTransformer1.op4

    return DAGFragment(
        [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
with DAG(
    's3client_dag',
    start_date=datetime(2019, 1, 1),
    max_active_runs=3,
    # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs
    schedule_interval=timedelta(minutes=30),
    default_args=default_args,
    # catchup=False  # enable if you don't want historical DAG runs to run
) as dag:

    t0 = DummyOperator(task_id='start')

    t1 = S3ListOperator(task_id='list_3s_files',
                        bucket='datalake-nonprod-raw',
                        prefix='S3Upload/dwh5013-prefijos',
                        delimiter='/',
                        aws_conn_id='my_aws')

    t2 = PythonOperator(task_id='python_files',
                        python_callable=route_on_attribute)

    t3 = DummyOperator(task_id='end')

    # t2 = S3CopyObjectOperator(
    #     source_bucket_key='source_file',
    #     dest_bucket_key='rfmtest',
    #     aws_conn_id='my_aws',
    #     source_bucket_name='source-bucket',
    #     dest_bucket_name='dest-bucket'
    # )
from airflow.models import DAG
from airflow.contrib.operators.s3_list_operator import S3ListOperator
from datetime import datetime

with DAG(dag_id='s3_list_bucket',
         schedule_interval=None,
         start_date=datetime(2019, 6, 7)) as dag:

    s3_file = S3ListOperator(task_id='list_3s_files',
                             bucket='airflow-dag-test-bucket',
                             prefix='test',
                             delimiter='/',
                             aws_conn_id='aws_default')
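On its own, the listed keys only end up in XCom. One way to consume them downstream is a PythonOperator that pulls the S3ListOperator's return value; the task below (print_keys and its callable are illustrative additions, not part of the original DAG) is a minimal sketch that builds on the dag and s3_file objects defined above.

from airflow.operators.python_operator import PythonOperator  # airflow.operators.python on Airflow 2.x


def _print_keys(**context):
    # S3ListOperator pushes its return value (the list of matching keys) to XCom.
    keys = context['ti'].xcom_pull(task_ids='list_3s_files')
    print(keys)


print_keys = PythonOperator(task_id='print_keys',
                            python_callable=_print_keys,
                            provide_context=True,
                            dag=dag)

s3_file >> print_keys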
    dag_path = os.path.join(dag_folder, dag_filename)
    logging.info(f"Write DAG: '{new_dag}'")
    logging.info(f"Write new DAG to {dag_bucket} at {dag_path}")
    hook = GCSHook()
    hook.upload(bucket_name=dag_bucket, object_name=dag_path, data=new_dag)


with models.DAG(
        dag_id=DAG_ID,
        max_active_runs=1,
        default_args=default_dag_args) as dag:

    start = PythonOperator(task_id="start_task",
                           python_callable=save_config_variable)

    # List folders from the S3 bucket and create DAGs based on folder prefix.
    folder_list = S3ListOperator(
        task_id='list_s3_folders',
        bucket=s3_bucket
    )
    list_folders = folder_list.execute(None)

    file_list = set()
    for i in list_folders:
        file_list.add(i.split('/')[0])
    logging.info(f"List of folders in s3 bucket: '{file_list}'")

    generate = PythonOperator(
        task_id="generate_dag_files",
        task_concurrency=1,
        python_callable=generate_dags,
        op_kwargs={
def load_definition(json_file, **context):
    """
    Loads the definition file from S3 and removes the required S3 files.

    Args:
        json_file: JSON definition file to load from S3
    """
    s3 = S3Hook(aws_conn_id='s3_etl')
    file_load = json.loads(
        s3.read_key(bucket_name=Variable.get(
            'sanitization_s3_sanit_def_files_folder').split('/')[0],
            key=json_file))
    logger.info('Definition file is loaded successfully')

    try:
        dt.datetime.strptime(file_load.get('back_date_from'), '%Y-%m-%d')

        remove_s3_task = S3ListOperator(
            task_id='remove_s3',
            bucket=Variable.get(
                'sanitization_s3_sanit_def_files_folder').split('/')[0],
            prefix='___SCHEMA___' + '/' + '___TABLE_NAME___' + '/',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            aws_conn_id='s3_etl',
            startafter='___SCHEMA___' + '/' + '___TABLE_NAME___' +
            '/batch_date=' + file_load.get('back_date_from') + '/',
            retries=3)
        s3_keys = remove_s3_task.execute(context=context)

        if s3_keys:
            # Delete the listed keys in chunks of delete_key_chunk_size.
            delete_s3_list = [
                s3_keys[file:file + int(dag_config["delete_key_chunk_size"])]
                for file in range(0, len(s3_keys),
                                  int(dag_config["delete_key_chunk_size"]))
            ]
            for s3_keys in delete_s3_list:
                s3.delete_objects(bucket=Variable.get(
                    'sanitization_s3_sanit_def_files_folder').split('/')[0],
                    keys=s3_keys)

            delete_rows_task = ABCRedshiftOperator(
                task_id='delete_rows',
                source_name='___SCHEMA___',
                redshift_conn_id='snowplow_redshift',
                sql='delete from {table_name} where batch_date >= {back_date_from};'
                .format(table_name=file_load.get('schema') + '.' +
                        file_load.get('table_name'),
                        back_date_from=file_load.get('back_date_from')),
                retries=3)
            delete_rows_task.execute(context=context)
        else:
            logger.info(
                'S3 and table are already backdated from the desired date!')
    except ValueError:
        logger.warning(
            "Incorrect date format, should be YYYY-MM-DD. No keys will be deleted!")

    context['task_instance'].xcom_push(key='query',
                                       value=file_load.get('query'))
    context['task_instance'].xcom_push(key='schema',
                                       value=file_load.get('schema'))
    context['task_instance'].xcom_push(key='table_name',
                                       value=file_load.get('table_name'))
    context['task_instance'].xcom_push(key='start_date',
                                       value=file_load.get('start_date'))
    context['task_instance'].xcom_push(key='table_columns',
                                       value=file_load.get('table_columns'))
    context['task_instance'].xcom_push(key='back_date_from',
                                       value=file_load.get('back_date_from'))
    context['task_instance'].xcom_push(key='batch_size',
                                       value=file_load.get('batch_size'))
        schedule_interval='___SCHEDULE_INTERVAL___',
        max_active_runs=1) as main_dag:

    doc_md = __doc__

    load_definition_task = PythonOperator(task_id='load_definition',
                                          python_callable=load_definition,
                                          op_args=['___TEMPLATE_JSON___'],
                                          trigger_rule=TriggerRule.ALL_SUCCESS,
                                          provide_context=True,
                                          retries=3)

    list_s3_task = S3ListOperator(
        task_id='list_s3',
        bucket=Variable.get('sanitization_s3_sanit_def_files_folder').split(
            '/')[0],
        prefix='___SCHEMA___' + '/' + '___TABLE_NAME___' + '/',
        trigger_rule=TriggerRule.ALL_SUCCESS,
        aws_conn_id='s3_etl',
        startafter='___SCHEMA___' + '/' + '___TABLE_NAME___' + '/batch_date=' +
        LOOK_BACK_DAYS.strftime("%Y-%m-%d") + '/',
        retries=3)

    compute_next_gather_task = BranchPythonOperator(
        task_id='compute_next_gather',
        python_callable=compute_next_gather,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        retries=3)

    create_staging_table_task = ABCRedshiftOperator(
        task_id='create_staging_table',
        source_name='___SCHEMA___',
    Fetch the file with a pattern

    :param kwargs:
    :return:
    """
    print(kwargs)
    xcom_data = kwargs["ti"]
    s3_files_paths_list = xcom_data.xcom_pull(key=None, task_ids="list_s3_files")
    print(s3_files_paths_list)
    if s3_files_paths_list:
        return [path for path in s3_files_paths_list
                if re.search(s3_file_pattern, path)]


list_s3_files = S3ListOperator(task_id="list_s3_files",
                               dag=dag,
                               aws_conn_id="aws_conn",
                               bucket=src_bucket,
                               prefix=src_prefix)

load_s3_data_mysql = PythonOperator(task_id='load_s3_data_mysql',
                                    dag=dag,
                                    provide_context=True,
                                    python_callable=readS3FilesAndLoadtoMySql,
                                    op_kwargs={"aws_conn_id": aws_conn_id,
                                               "src_bucket": src_bucket,
                                               "mysql_conn": mysql_conn_id,
                                               "schema": schema,
                                               "table": table})

copy_src_files_to_archive = PythonOperator(task_id="copy_src_files_to_archive",
                                           dag=dag,
                                           provide_context=True,
                                           python_callable=archiveS3Files,
                                           op_kwargs={"src_bucket": src_bucket,
                                                      "trg_bucket": archive_bucket,
                                                      "trg_path": archive_path,
                                                      "aws_conn_id": aws_conn_id})

# delete_s3_files = S3DeleteObjectsOperator(task_id="delete_s3_files",
"bucket_name": AIRFLOW_DATA_BUCKET, "harvest_from_date": None, "harvest_until_date": None, "metadata_prefix": CATALOG_OAI_BW_MD_PREFIX, "oai_endpoint": CATALOG_OAI_BW_ENDPOINT, "records_per_file": 10000, "included_sets": CATALOG_OAI_BW_INCLUDED_SETS, "timestamp": "{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/bw" }, dag=DAG ) LIST_CATALOG_BW_S3_DATA = S3ListOperator( task_id="list_catalog_bw_s3_data", bucket=AIRFLOW_DATA_BUCKET, prefix=DAG.dag_id + "/{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/bw/", delimiter="", aws_conn_id=AIRFLOW_S3.conn_id, dag=DAG ) PREPARE_BOUNDWITHS = PythonOperator( task_id='prepare_boundwiths', provide_context=True, python_callable=prepare_oai_boundwiths, op_kwargs={ "AWS_ACCESS_KEY_ID": AIRFLOW_S3.login, "AWS_SECRET_ACCESS_KEY": AIRFLOW_S3.password, "BUCKET": AIRFLOW_DATA_BUCKET, "DEST_FOLDER": DAG.dag_id + "/{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/lookup.tsv", "S3_KEYS": "{{ ti.xcom_pull(task_ids='list_catalog_bw_s3_data') }}", "SOURCE_FOLDER": DAG.dag_id + "/{{ ti.xcom_pull(task_ids='set_s3_namespace') }}/bw"
Tasks with custom logic are relegated to individual Python files.
"""

SAFETY_CHECK = PythonOperator(task_id="safety_check",
                              python_callable=helpers.catalog_safety_check,
                              dag=DAG)

SET_S3_NAMESPACE = PythonOperator(task_id="set_s3_namespace",
                                  python_callable=datetime.now().strftime,
                                  op_args=["%Y-%m-%d_%H-%M-%S"],
                                  dag=DAG)

LIST_ALMA_S3_DATA = S3ListOperator(task_id="list_alma_s3_data",
                                   bucket=AIRFLOW_DATA_BUCKET,
                                   prefix=ALMASFTP_S3_PREFIX + "/" + ALMASFTP_S3_ORIGINAL_DATA_NAMESPACE + "/alma_bibs__",
                                   delimiter="/",
                                   aws_conn_id=AIRFLOW_S3.conn_id,
                                   dag=DAG)

LIST_BOUNDWITH_S3_DATA = S3ListOperator(task_id="list_boundwith_s3_data",
                                        bucket=AIRFLOW_DATA_BUCKET,
                                        prefix=ALMASFTP_S3_PREFIX + "/alma_bibs__boundwith",
                                        delimiter="/",
                                        aws_conn_id=AIRFLOW_S3.conn_id,
                                        dag=DAG)

PREPARE_BOUNDWITHS = PythonOperator(
    task_id="prepare_boundwiths",
    provide_context=True,
"--context_param DATABRICKS_ENDPOINT=XXX", "--context_param DATABRICKS_TOKEN=XXX", "--context_param DATABRICKS_CLUSTER_ID={{ task_instance.xcom_pull(task_ids='create_databricks_cluster') }}" ] }, ] }, region_name='us-east-1', launch_type='EC2', dag=dag) # define list of lobs we want to run for # loop through the lob's we want to use to build up our dag s3_list_files = S3ListOperator(task_id="s3_list_files", bucket="tgourdel-storage", aws_conn_id="aws_default", dag=dag) s3_list_files.set_upstream(create_cluster_notify) files = s3_list_files.execute(None) for x in files: run_job = ECSOperator( task_id="Copy_%s_to_DBFS" % (x), task_definition='uploadtodbfs', cluster='TalendECS', aws_conn_id='aws_default', overrides={ 'containerOverrides': [ { 'name':