"bucketname": "https://s3.amazonaws.com/data-sprints-eng-test", "bashcommand": "curl -k -X GET", "remote_file": "data-vendor_lookup-csv.csv", "local_file": f"{AIRFLOW_HOME}/dags/data/csv/nyc_vendor.csv", "csv_file": "nyc_vendor.csv", "folder_s3": "batch/vendor" } } dag = DAG(dag_id=dag_name, default_args=args, catchup=False, schedule_interval='30 3 * * *') # 00:30 GMT-3 with open(f'{AIRFLOW_HOME}/dags/copy/copy_S3.md', 'r') as f: dag.doc_md = f.read() start_log = DummyOperator(task_id='start_log', dag=dag) def loop_files(): loop_get_files = [] for arquivo, val in job_info.items(): bucketname = val['bucketname'] bashcommand = val['bashcommand'] remote_file = val['remote_file'] local_file = val['local_file']
# NOTE(review): TRUNCATED FRAGMENT, collapsed onto one physical line; appears to
# be from an Airflow log-cleanup DAG (distinct file from line 1). Visible and
# complete here: `default_args` (owner, alert emails, 1 retry with 1-minute
# delay, email on failure but not retry) and the `DAG(...)` construction. The
# trailing `log_cleanup` bash template — which reads `maxLogAgeInDays` from
# `dag_run.conf`, falling back to DEFAULT_MAX_LOG_AGE_IN_DAYS, and interpolates
# ENABLE_DELETE — is cut off inside its triple-quoted string (no closing """
# visible), so it cannot be safely rewritten from this view.
# NOTE(review): `start_date` is passed both inside `default_args` and directly
# to `DAG(...)` — redundant; confirm which one the project intends to keep.
# NOTE(review): `dag.doc_md = __doc__` relies on the module having a docstring —
# not visible in this chunk; verify it exists.
default_args = { 'owner': DAG_OWNER_NAME, 'email': ALERT_EMAIL_ADDRESSES, 'email_on_failure': True, 'email_on_retry': False, 'start_date': START_DATE, 'retries': 1, 'retry_delay': timedelta(minutes=1) } dag = DAG(DAG_ID, default_args=default_args, schedule_interval=SCHEDULE_INTERVAL, start_date=START_DATE) dag.doc_md = __doc__ log_cleanup = """ echo "Getting Configurations..." BASE_LOG_FOLDER="{{params.directory}}" TYPE="{{params.type}}" MAX_LOG_AGE_IN_DAYS="{{dag_run.conf.maxLogAgeInDays}}" if [ "${MAX_LOG_AGE_IN_DAYS}" == "" ]; then echo "maxLogAgeInDays conf variable isn't included. Using Default '""" + str( DEFAULT_MAX_LOG_AGE_IN_DAYS) + """'." MAX_LOG_AGE_IN_DAYS='""" + str(DEFAULT_MAX_LOG_AGE_IN_DAYS) + """' fi ENABLE_DELETE=""" + str("true" if ENABLE_DELETE else "false") + """ echo "Finished Getting Configurations" echo ""
# NOTE(review): TRUNCATED FRAGMENT, collapsed onto one physical line; a variant
# of the log-cleanup DAG on line 2 (adds 'depends_on_past': False and
# is_paused_upon_creation=False). It begins mid-dict — the `default_args = {`
# opener (presumably; verify against the original file) is not visible — so the
# fragment is not valid Python as shown and cannot be safely rewritten. The
# tail is complete: DAG construction plus an explicit markdown `doc_md`
# describing the log-retention behavior (default two weeks, overridable via the
# airflow_log_cleanup__max_log_age_in_days Variable or a maxLogAgeInDays
# dag-run conf key).
# NOTE(review): as on line 2, `start_date` appears both in the (cut-off)
# default_args dict and as a direct `DAG(...)` kwarg — redundant; confirm
# intent.
'owner': DAG_OWNER_NAME, 'depends_on_past': False, 'email': ALERT_EMAIL_ADDRESSES, 'email_on_failure': True, 'email_on_retry': False, 'start_date': START_DATE, 'retries': 1, 'retry_delay': timedelta(minutes=1) } dag = DAG(DAG_ID, default_args=default_args, schedule_interval=SCHEDULE_INTERVAL, start_date=START_DATE, is_paused_upon_creation=False) dag.doc_md = """ Airflow produces quite a lot of log files, and the log pvc gets full fairly easily, which in turn prevents the whole application from working. This is why this DAG that removes old log files is added and enabled by default. **It is strongly encouraged to keep this DAG enabled!** By default log files get removed after two weeks, but you can define when log files get removed by either modifying the DAG directly or creating a variable in the web UI (Admin -> Variables): * Key: airflow\_log\_cleanup\_\_max\_log\_age\_in\_days * Value: number of days after a log file is deleted, for example 30 You can manually trigger individual DAG runs with different number of days as configuration by setting maxLogAgeInDays (for example {"maxLogAgeInDays":30}) as the DAG run configuration JSON. """