Code Example #1
        "bucketname": "https://s3.amazonaws.com/data-sprints-eng-test",
        "bashcommand": "curl -k -X GET",
        "remote_file": "data-vendor_lookup-csv.csv",
        "local_file": f"{AIRFLOW_HOME}/dags/data/csv/nyc_vendor.csv",
        "csv_file": "nyc_vendor.csv",
        "folder_s3": "batch/vendor"
    }
}

dag = DAG(dag_id=dag_name,
          default_args=args,
          catchup=False,
          schedule_interval='30 3 * * *')  # 00:30 GMT-3

with open(f'{AIRFLOW_HOME}/dags/copy/copy_S3.md', 'r') as f:
    dag.doc_md = f.read()

start_log = DummyOperator(task_id='start_log', dag=dag)


def loop_files():

    loop_get_files = []

    # Read the download settings for each file configured in job_info
    for arquivo, val in job_info.items():

        bucketname = val['bucketname']
        bashcommand = val['bashcommand']
        remote_file = val['remote_file']
        local_file = val['local_file']
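
The function is cut off at this point. Below is a minimal sketch of how the loop might be finished, building one curl download task per job_info entry; the task id, the argument order passed to curl, and the chaining to start_log are illustrative assumptions, not part of the original, and it assumes BashOperator has been imported (from airflow.operators.bash_operator import BashOperator).

        # Assumed continuation: one download task per configured file
        get_file = BashOperator(
            task_id=f'get_{arquivo}',
            bash_command=f'{bashcommand} {bucketname}/{remote_file} -o {local_file}',
            dag=dag)

        # Assumed wiring: run every download after the start_log marker task
        start_log >> get_file
        loop_get_files.append(get_file)

    return loop_get_files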
Code Example #2
default_args = {
    'owner': DAG_OWNER_NAME,
    'email': ALERT_EMAIL_ADDRESSES,
    'email_on_failure': True,
    'email_on_retry': False,
    'start_date': START_DATE,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(DAG_ID,
          default_args=default_args,
          schedule_interval=SCHEDULE_INTERVAL,
          start_date=START_DATE)
dag.doc_md = __doc__

# Templated bash script: reads the log directory and type from params and takes
# the maximum log age from dag_run.conf, falling back to the default when unset
log_cleanup = """
echo "Getting Configurations..."
BASE_LOG_FOLDER="{{params.directory}}"
TYPE="{{params.type}}"
MAX_LOG_AGE_IN_DAYS="{{dag_run.conf.maxLogAgeInDays}}"
if [ "${MAX_LOG_AGE_IN_DAYS}" == "" ]; then
    echo "maxLogAgeInDays conf variable isn't included. Using Default '""" + str(
    DEFAULT_MAX_LOG_AGE_IN_DAYS) + """'."
    MAX_LOG_AGE_IN_DAYS='""" + str(DEFAULT_MAX_LOG_AGE_IN_DAYS) + """'
fi
ENABLE_DELETE=""" + str("true" if ENABLE_DELETE else "false") + """
echo "Finished Getting Configurations"
echo ""
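
The script is truncated here. As a minimal sketch of how such a templated script is typically attached to a task, the block below wires it to a BashOperator with the directory and type supplied through params; the task id and the directory value are illustrative assumptions, not taken from the original, and BashOperator is assumed to be imported.

# Assumed wiring: params are rendered into {{params.directory}} and
# {{params.type}} inside the templated script above
log_cleanup_op = BashOperator(
    task_id='log_cleanup_worker',
    bash_command=log_cleanup,
    params={'directory': '/usr/local/airflow/logs', 'type': 'file'},
    dag=dag)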
Code Example #3
default_args = {
    'owner': DAG_OWNER_NAME,
    'depends_on_past': False,
    'email': ALERT_EMAIL_ADDRESSES,
    'email_on_failure': True,
    'email_on_retry': False,
    'start_date': START_DATE,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(DAG_ID,
          default_args=default_args,
          schedule_interval=SCHEDULE_INTERVAL,
          start_date=START_DATE,
          is_paused_upon_creation=False)
dag.doc_md = """
Airflow produces quite a lot of log files, and the log PVC fills up fairly easily,
which in turn prevents the whole application from working. This is why this DAG, which removes
old log files, is included and enabled by default.
**It is strongly encouraged to keep this DAG enabled!**

By default, log files are removed after two weeks, but you can control when they are removed
either by modifying the DAG directly or by creating a variable in the web UI (Admin -> Variables):

* Key: airflow\_log\_cleanup\_\_max\_log\_age\_in\_days
* Value: number of days after which a log file is deleted, for example 30

You can also manually trigger individual DAG runs with a different number of days by passing
maxLogAgeInDays (for example {"maxLogAgeInDays": 30}) as the DAG run configuration JSON.
"""