def delete_r_config():
    try:
        Variable.set('r_config', '{}')
    except KeyError:
        raise ConfigVariableNotFoundException(
            "Variable 'r_config' not found !")
def create_r_config(self, ids, session):
    rows = session.query(FailedDagRun).filter(
        FailedDagRun.id.in_(ids)).all()
    r_obj = {}
    for d in rows:
        if d.dag_id in r_obj:
            # Store the truncated execution date consistently in both branches.
            if str(d.execution_date)[:19] not in r_obj[d.dag_id]:
                r_obj[d.dag_id].append(str(d.execution_date)[:19])
        else:
            r_obj[d.dag_id] = [str(d.execution_date)[:19]]
    Variable.set(key='r_config', value=json.dumps(r_obj))
    for id in ids:
        execution_date = session.query(FailedDagRun).filter(
            FailedDagRun.id == id).one().execution_date
        dag_id = session.query(FailedDagRun).filter(
            FailedDagRun.id == id).one().dag_id
        session.query(FailedDagRun).filter(FailedDagRun.id == id).update(
            {'state': 'recovery_executed'}, synchronize_session='fetch')
        Variable.delete(
            key="{}${}".format(str(execution_date)[:19], dag_id))
def test_variable_metastore_secrets_backend(self):
    Variable.set(key="hello", value="World")
    metastore_backend = MetastoreBackend()
    variable_value = metastore_backend.get_variable(key="hello")
    self.assertEqual("World", variable_value)
    self.assertIsNone(
        metastore_backend.get_variable(key="non_existent_key"))
def get(self, key, default: Any = NOTSET) -> Any:
    from airflow.models.variable import Variable

    if default is NOTSET:
        return Variable.get(key, deserialize_json=self._deserialize_json)
    return Variable.get(key, default, deserialize_json=self._deserialize_json)
def test_variable_metastore_secrets_backend(self):
    Variable.set(key="hello", value="World")
    Variable.set(key="empty_str", value="")
    metastore_backend = MetastoreBackend()
    variable_value = metastore_backend.get_variable(key="hello")
    assert "World" == variable_value
    assert metastore_backend.get_variable(key="non_existent_key") is None
    assert '' == metastore_backend.get_variable(key="empty_str")
def test_variables_as_arguments_dag():
    override_command = 'value_from_variable'
    if version.parse(AIRFLOW_VERSION) >= version.parse("1.10.10"):
        os.environ['AIRFLOW_VAR_VAR1'] = override_command
    else:
        Variable.set("var1", override_command)
    td = dagfactory.DagFactory(DAG_FACTORY_VARIABLES_AS_ARGUMENTS)
    td.generate_dags(globals())
    tasks = globals()['example_dag'].tasks
    for task in tasks:
        if task.task_id == "task_3":
            assert task.bash_command == override_command
def execute(self, context):
    (
        fetch_record_count,
        send_data_to_submission,
    ) = context['ti'].xcom_pull(key='exception',
                                task_ids=('fetch_record_count',
                                          'send_data_to_submission'))
    if fetch_record_count is None:
        message = '<img src="https://airflow.apache.org/images/feature-image.png" width="400" height="100"/>' \
                  '<h2>AIRFLOW TASK FAILURE:</h2><hr/>' \
                  '<strong>DAG : </strong> {} <br/><hr/>' \
                  '<strong>TASKS:</strong> {}<br/><hr/>' \
                  '<strong>Reason:</strong> {}<br/><hr/>' \
            .format(self.dag_id, 'send_data_to_submission',
                    send_data_to_submission)
    elif send_data_to_submission is None:
        message = '<img src="https://airflow.apache.org/images/feature-image.png" width="400" height="100"/>' \
                  '<h2>AIRFLOW TASK FAILURE:</h2><hr/>' \
                  '<strong>DAG : </strong> {} <br/><hr/>' \
                  '<strong>TASKS:</strong> {}<br/><hr/>' \
            '<strong>Reason:</strong> {}<br/><hr/>' \
            .format(self.dag_id, 'fetch_record_count', fetch_record_count)
    try:
        config = json.loads(Variable.get("config"))
        email = config['email']
    except KeyError:
        # Variable.get and the dict lookup both raise KeyError when missing,
        # so catch that rather than NameError.
        raise ConfigVariableNotFoundException()
    send_email(to=email, subject='Airflow Notification', html_content=message)
def get_sheet(self, sheet_url_name: str) -> None:
    google_sheet_url = Variable.get(sheet_url_name)
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    creds = sac.from_json_keyfile_dict(self.secret_key, scope)
    client = gspread.authorize(creds)
    self.sheet = client.open_by_url(google_sheet_url).sheet1
def is_recovery_variable_set():
    global r_config
    try:
        r_config = json.loads(Variable.get("r_config"))
        return True
    except KeyError:
        raise ConfigVariableNotFoundException("Variable 'r_config' not found")
def is_config_variable_set():
    global config
    try:
        config = json.loads(Variable.get("config"))
        return True
    except KeyError:
        raise ConfigVariableNotFoundException("Variable 'config' not found")
def execute(self, context):
    message = "<h3>DAG Successful</h3>"
    try:
        config = json.loads(Variable.get("config"))
        email = config['email']
    except KeyError:
        # A missing 'config' Variable or 'email' key raises KeyError.
        raise ConfigVariableNotFoundException()
    send_email(to=email, subject='Airflow Notification', html_content=message)
def create_configuration_variables():
    # 'config' variable
    Variable.set(
        key='config',
        value=json.dumps({
            "tables": [],
            "start_date": "1da",
            "frequency": "hourly",
            "threshold": 10000,
            "export_format": "xml",
            "storage_type": "sftp",
            "email": ""
        }))
    # 'r_config' variable
    Variable.set(
        key='r_config',
        value='{}'
    )
    # 'dag_creation_dates' variable
    Variable.set(
        key='dag_creation_dates',
        value=json.dumps({})
    )
def delete_system_generated_tmp_files():
    config = json.loads(Variable.get(key='config'))
    tables = config['tables']
    tmp_path = "{}/{}".format(configuration.get_airflow_home(),
                              'backup/ServiceNow')
    for file in os.listdir(path=tmp_path):
        if file not in tables:
            shutil.rmtree('{}/{}'.format(tmp_path, file))
        else:
            data_dir = "{}/{}".format(tmp_path, file)
            for xml in os.listdir(path=data_dir):
                expression = str(datetime.date(datetime.now()))
                if expression not in xml:
                    os.remove("{}/{}".format(data_dir, xml))
def test_parse_bucket_key_from_jinja(self, mock_hook):
    mock_hook.return_value.check_for_key.return_value = False
    Variable.set("test_bucket_key", "s3://bucket/key")
    execution_date = datetime(2020, 1, 1)
    dag = DAG("test_s3_key", start_date=execution_date)
    op = S3KeySensor(
        task_id='s3_key_sensor',
        bucket_key='{{ var.value.test_bucket_key }}',
        bucket_name=None,
        dag=dag,
    )
    ti = TaskInstance(task=op, execution_date=execution_date)
    context = ti.get_template_context()
    ti.render_templates(context)
    op.poke(None)
    self.assertEqual(op.bucket_key, "key")
    self.assertEqual(op.bucket_name, "bucket")
def trigger_dag(self, ids, session=None):
    rows = session.query(FailedDagRun).filter(
        FailedDagRun.id.in_(ids)).all()
    try:
        r_config = Variable.get(key='r_config')
        r_obj = json.loads(r_config)
        for d in rows:
            if d.dag_id in r_obj:
                if str(d.execution_date)[:19] not in r_obj[d.dag_id]:
                    r_obj[d.dag_id].append(str(d.execution_date)[:19])
            else:
                r_obj[d.dag_id] = [str(d.execution_date)[:19]]
        Variable.set(key='r_config', value=json.dumps(r_obj))
        for id in ids:
            execution_date = session.query(FailedDagRun).filter(
                FailedDagRun.id == id).one().execution_date
            dag_id = session.query(FailedDagRun).filter(
                FailedDagRun.id == id).one().dag_id
            session.query(FailedDagRun).filter(
                FailedDagRun.id == id).update(
                {'state': 'recovery_executed'},
                synchronize_session='fetch')
            Variable.delete(
                key="{}${}".format(str(execution_date)[:19], dag_id))
    except KeyError as e:
        LoggingMixin().log.warning(str(e))
        Variable.set(key='r_config', value='{}')
        self.create_r_config(ids, session)
#
# create a new virtualenv in a convenient location
# `virtualenv .sodavenv && .sodavenv/bin/pip install soda-sql`
#
# Now you can modify the BashOperator in the above DAG as follows:
#
from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from datetime import timedelta

# Use the same variable name that you used in airflow variable creation
soda_sql_project_path = Variable.get('soda_sql_project_path')

default_args = {
    'owner': 'soda_sql',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'soda_sql_scan',
    default_args=default_args,
    description='A simple Soda SQL scan DAG',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
)

# A dummy operator to simulate data ingestion
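# The lines below sketch the continuation that the comment block above
# promises. The task ids and the exact `.sodavenv/bin/soda scan ...` command
# line are assumptions for illustration, not taken from the original DAG.
ingest_data_op = DummyOperator(
    task_id='ingest_data',
    dag=dag,
)

# Run the Soda SQL scan from the virtualenv created above, inside the project
# directory stored in the 'soda_sql_project_path' Airflow Variable.
soda_sql_scan_op = BashOperator(
    task_id='soda_sql_scan_demodata',
    bash_command=(
        f'cd {soda_sql_project_path} && '
        '.sodavenv/bin/soda scan warehouse.yml tables/demodata.yml'
    ),
    dag=dag,
)

ingest_data_op >> soda_sql_scan_op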
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.subdag import SubDagOperator
from airflow.models.variable import Variable
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.sensors.external_task import ExternalTaskSensor
from slack import WebClient
from slack.errors import SlackApiError
from smart_sensor_file import SmartFileSensor
import logging

# paths to files
default_path = '/Users/aleksandarmilosevic/PycharmProjects/AIRFLOW/run.txt'
path = Variable.get('path_to_file', default_var=default_path)
# slack_token = Variable.get('slack_token')
slack_token = 'xoxb-1727046194672-1709378985940-A6HDRTXvZKqhpy8OFC7aBBOf'
external_dag = Variable.get('external_dag')
external_task = Variable.get('external_task')


# function for pulling value from query_table task
def print_res(task_id, dag_id, **context):
    ti = context['ti']
    logging.info(ti.xcom_pull(task_ids=task_id, dag_id=dag_id))


# function that sends a message to a slack channel
def slack_message(**context):
    client = WebClient(token=slack_token)
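    # The body above is truncated; the rest of slack_message below is only a
    # sketch, assuming a '#general' channel and a plain-text notification.
    try:
        # Post a simple notification; channel name and message text are
        # placeholders, not values from the original file.
        client.chat_postMessage(
            channel='#general',
            text=f"DAG {context['dag'].dag_id} finished at {datetime.now()}",
        )
    except SlackApiError as e:
        # The Slack API error payload carries the failure reason.
        logging.error(e.response["error"])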
def create_dags():
    global dag_creation_dates
    global new_dags
    global email_notify_required
    new_dags = []
    dag_creation_dates = json.loads(Variable.get(key='dag_creation_dates'))
    email_notify_required = is_email_notification_required()
    try:
        for table in config.get('tables'):
            with open(configuration.get_airflow_home()
                      + '/dags/templates/main.py.jinja2') as file_:
                template = Template(file_.read())
            if dag_creation_dates.get(table) is not None:
                start_date = dag_creation_dates.get(table)
            else:
                start_date = get_start_date(config.get('start_date'))
                dag_creation_dates[table] = str(start_date)
            output = template.render(
                data={
                    'dag_id': table,
                    'frequency': config.get('frequency'),
                    'storage_type': storage_type,
                    'start_date': start_date,
                    'email_required': email_notify_required
                }
            )
            with open(configuration.get_airflow_home() + '/dags/generated/dag_'
                      + '{}'.format(table).replace(' ', '_') + '.py', 'w') as f:
                f.write(output)
            new_dags.append('dag_' + '{}'.format(table).replace(' ', '_') + '.py')

        if len(r_config) != 0:
            for table in r_config:
                for exec_date in r_config.get(table):
                    execution_date = str(exec_date).replace(' ', 'T')[0:19]
                    with open(configuration.get_airflow_home()
                              + '/dags/templates/recovery_template.py.jinja2') as file_:
                        template = Template(file_.read())
                    output = template.render(
                        data={'dag_id': table,
                              'frequency': config.get('frequency'),
                              'storage_type': storage_type,
                              'execution_date': execution_date})
                    with open(configuration.get_airflow_home() + '/dags/generated/r_dag_'
                              + '{}_{}'.format(table, execution_date).replace(' ', '_')
                              + '.py', 'w') as f:
                        f.write(output)
                    e = '{}'.format(execution_date).replace(' ', 'T')
                    new_dags.append('r_dag_' + '{}_{}'.format(table, e).replace(' ', '_') + '.py')

        md_dag_ids = settings.Session.query(Dags.dag_id, Dags.fileloc).all()
        for record in md_dag_ids:
            (d_id, loc) = record
            filename = loc[str(loc).rfind('/') + 1:]
            if filename == 'dag_generator.py' or filename == 'dag_cleanup.py':
                continue
            if filename not in new_dags:
                try:
                    if os.path.exists(str(loc)):
                        os.remove(str(loc))
                    else:
                        LoggingMixin().log.warning(
                            "{} file doesn't exist!".format(filename))
                    requests.delete(
                        url="http://{}:8080/api/experimental/dags/{}".format(
                            socket.gethostbyname(socket.gethostname()),
                            str(d_id)
                        ),
                        auth=(rest.login, rest.password)
                    )
                    dag_creation_dates.pop(d_id)
                except Exception as e:
                    LoggingMixin().log.error(str(e))
        Variable.set(key='dag_creation_dates', value=json.dumps(dag_creation_dates))
    except AirflowException:
        raise ConfigVariableNotFoundException()
import os
import sys

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import pandas as pd
import sqlite3
from airflow.utils.dates import days_ago
from airflow.decorators import dag, task
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.models.variable import Variable

DATASET_ID = Variable.get("DATASET_ID")
BASE_PATH = Variable.get("BASE_PATH")
BUCKET_NAME = Variable.get("BUCKET_NAME")
GOOGLE_CLOUD_CONN_ID = Variable.get("GOOGLE_CLOUD_CONN_ID")
BIGQUERY_TABLE_NAME = "bs_database_sqlite"
GCS_OBJECT_NAME = "extract_transform_database_sqlite.csv"
DATA_PATH = f"{BASE_PATH}/data"
OUT_PATH = f"{DATA_PATH}/{GCS_OBJECT_NAME}"


@dag(
    default_args={
        'owner': 'okza',
        'email': '*****@*****.**',
        'email_on_failure': True
    },
    schedule_interval='0 4 * * *',  # every day at 4 AM
# If you installed Soda SQL in your python environment you can use
# PythonOperator to invoke Soda Scan. The following shows a sample Airflow DAG
# using PythonOperator that you can use as a starting point.
#
from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from datetime import timedelta
from sodasql.scan.scan_builder import ScanBuilder
from airflow.exceptions import AirflowFailException

# Make sure that these variables are set in your Airflow
warehouse_yml = Variable.get('soda_sql_warehouse_yml_path')
scan_yml = Variable.get('soda_sql_scan_yml_path')

default_args = {
    'owner': 'soda_sql',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


def run_soda_scan(warehouse_yml_file, scan_yml_file):
    scan_builder = ScanBuilder()
    scan_builder.warehouse_yml_file = warehouse_yml_file
    scan_builder.scan_yml_file = scan_yml_file
    scan = scan_builder.build()
    scan_result = scan.execute()
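    # A possible continuation of the truncated snippet above, shown only as a
    # sketch: it assumes the soda-sql ScanResult exposes has_test_failures(),
    # and the DAG/task names below are placeholders, not from the original.
    if scan_result.has_test_failures():
        raise AirflowFailException("Soda SQL scan reported test failures")


dag = DAG(
    'soda_sql_scan',
    default_args=default_args,
    description='A simple Soda SQL scan DAG using PythonOperator',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
)

# Placeholder upstream task standing in for data ingestion.
ingest_data_op = DummyOperator(
    task_id='ingest_data',
    dag=dag,
)

# Run the scan with the warehouse/scan YML paths read from Airflow Variables.
soda_sql_scan_op = PythonOperator(
    task_id='soda_sql_scan_demodata',
    python_callable=run_soda_scan,
    op_kwargs={
        'warehouse_yml_file': warehouse_yml,
        'scan_yml_file': scan_yml,
    },
    dag=dag,
)

ingest_data_op >> soda_sql_scan_op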
def __init__(self, key_name: str) -> None:
    self.secret_key = Variable.get(key_name, deserialize_json=True)
def __getattr__(self, key: str) -> Any:
    from airflow.models.variable import Variable

    self.var = Variable.get(key, deserialize_json=self._deserialize_json)
    return self.var
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryInsertJobOperator)
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocSubmitJobOperator)
from airflow.models.variable import Variable
from airflow import DAG
from datetime import datetime, timedelta
from package.api.google.cloud.dataproc import DataprocCreateClusterConfig
import pendulum

local_tz = pendulum.timezone("Asia/Taipei")

gcp_config = Variable.get("gcp_project_1", deserialize_json=True)
dataproc_config = gcp_config["dataproc"]
bucket_config = dataproc_config["bucket"]
cluster_config = DataprocCreateClusterConfig.make(gcp_config)

bigquery_load_storage_config = {
    "load": {
        "source_uris": [
            # Double the braces so the f-string emits a literal Jinja
            # '{{ ds }}' macro instead of '{ ds }'.
            f"gs://{bucket_config['data_lake']}/'your-path'/upload_date={{{{ ds }}}}/*.parquet"
        ],
        "source_format": "PARQUET",
        "destination_table": {
            "project_id": gcp_config["project_id"],
            "dataset_id": "YOUR-DATASET",
            "table_id": "YOUR-TABLE"
        },
        "create_disposition": "CREATE_IF_NEEDED",
        "write_disposition": "WRITE_APPEND",