Example #1
    def delete_r_config():

        try:
            Variable.set('r_config', '{}')
        except KeyError:
            raise ConfigVariableNotFoundException(
                "Variable 'r_config' not found !")
Example #2
    def create_r_config(self, ids, session):

        rows = session.query(FailedDagRun).filter(
            FailedDagRun.id.in_(ids)).all()

        r_obj = {}

        for d in rows:
            if d.dag_id in r_obj:
                if str(d.execution_date)[:19] not in r_obj[d.dag_id]:
                    r_obj[d.dag_id].append(str(d.execution_date)[:19])
            else:
                r_obj[d.dag_id] = [str(d.execution_date)[:19]]

        Variable.set(key='r_config', value=json.dumps(r_obj))

        for id in ids:
            execution_date = session.query(FailedDagRun).filter(
                FailedDagRun.id == id).one().execution_date
            dag_id = session.query(FailedDagRun).filter(
                FailedDagRun.id == id).one().dag_id

            session.query(FailedDagRun).filter(FailedDagRun.id == id).update(
                {'state': 'recovery_executed'}, synchronize_session='fetch')
            Variable.delete(
                key="{}${}".format(str(execution_date)[:19], dag_id))
Example #3
    def test_variable_metastore_secrets_backend(self):
        Variable.set(key="hello", value="World")
        metastore_backend = MetastoreBackend()
        variable_value = metastore_backend.get_variable(key="hello")
        self.assertEqual("World", variable_value)
        self.assertIsNone(
            metastore_backend.get_variable(key="non_existent_key"))
Example #4
    def get(self, key, default: Any = NOTSET) -> Any:
        from airflow.models.variable import Variable

        if default is NOTSET:
            return Variable.get(key, deserialize_json=self._deserialize_json)
        return Variable.get(key,
                            default,
                            deserialize_json=self._deserialize_json)
Example #5
    def test_variable_metastore_secrets_backend(self):
        Variable.set(key="hello", value="World")
        Variable.set(key="empty_str", value="")
        metastore_backend = MetastoreBackend()
        variable_value = metastore_backend.get_variable(key="hello")
        assert "World" == variable_value
        assert metastore_backend.get_variable(key="non_existent_key") is None
        assert '' == metastore_backend.get_variable(key="empty_str")
Example #6
def test_variables_as_arguments_dag():
    override_command = 'value_from_variable'
    if version.parse(AIRFLOW_VERSION) >= version.parse("1.10.10"):
        os.environ['AIRFLOW_VAR_VAR1'] = override_command
    else:
        Variable.set("var1",override_command)
    td = dagfactory.DagFactory(DAG_FACTORY_VARIABLES_AS_ARGUMENTS)
    td.generate_dags(globals())
    tasks = globals()['example_dag'].tasks
    for task in tasks:
        if task.task_id == "task_3":
            assert task.bash_command == override_command
Example #7
    def execute(self, context):

        (
            fetch_record_count,
            send_data_to_submission,
        ) = context['ti'].xcom_pull(key='exception',
                                    task_ids=('fetch_record_count',
                                              'send_data_to_submission'))

        if fetch_record_count is None:
            message = '<img src="https://airflow.apache.org/images/feature-image.png" width="400" height="100"/>' \
                      '<h2>AIRFLOW TASK FAILURE:</h2><hr/>' \
                      '<strong>DAG : </strong>    {} <br/><hr/>' \
                      '<strong>TASKS:</strong>  {}<br/><hr/>' \
                      '<strong>Reason:</strong> {}<br/><hr/>' \
                .format(self.dag_id, 'send_data_to_submission', send_data_to_submission)
        elif send_data_to_submission is None:
            message = '<img src="https://airflow.apache.org/images/feature-image.png" width="400" height="100"/>' \
                      '<h2>AIRFLOW TASK FAILURE:</h2><hr/>' \
                      '<strong>DAG : </strong>    {} <br/><hr/>' \
                      '<strong>TASKS:</strong>  {}<br/><hr/>' \
                      '<strong>Reason:</strong> {}<br/><hr/>' \
                .format(self.dag_id, 'fetch_record_count', fetch_record_count)

        try:
            config = json.loads(Variable.get("config"))
            email = config['email']
        except KeyError:
            raise ConfigVariableNotFoundException()

        send_email(to=email,
                   subject='Airflow Notification',
                   html_content=message)
Example #8
    def get_sheet(self, sheet_url_name: str) -> None:
        google_sheet_url = Variable.get(sheet_url_name)

        scope = ['https://spreadsheets.google.com/feeds',
                 'https://www.googleapis.com/auth/drive']
        creds = sac.from_json_keyfile_dict(self.secret_key, scope)
        client = gspread.authorize(creds)

        self.sheet = client.open_by_url(google_sheet_url).sheet1
Example #9
def is_recovery_variable_set():

    global r_config

    try:
        r_config = json.loads(Variable.get("r_config"))
        return True
    except KeyError:
        raise ConfigVariableNotFoundException("Variable 'r_config' not found")
Example #10
def is_config_variable_set():

    global config

    try:
        config = json.loads(Variable.get("config"))
        return True
    except KeyError:
        raise ConfigVariableNotFoundException("Variable 'config' not found")
Example #11
    def execute(self, context):

        message = "<h3> Dag Successfull </h3>"
        try:
            config = json.loads(Variable.get("config"))
            email = config['email']
        except NameError as e:
            raise ConfigVariableNotFoundException()

        send_email(to=email,
                   subject='Airflow Notification',
                   html_content=message)
Example #12
def create_configuration_variables():

    # 'config' variable

    Variable.set(
        key='config',
        value=json.dumps({
            "tables": [],
            "start_date": "1da",
            "frequency": "hourly",
            "threshold": 10000,
            "export_format": "xml",
            "storage_type": "sftp",
            "email": ""
        }))

    # 'r_config' variable

    Variable.set(
        key='r_config',
        value='{}'
    )

    # 'dag_creation_dates' variable

    Variable.set(
        key='dag_creation_dates',
        value=json.dumps({})
    )
Example #13
    def delete_system_generated_tmp_files():
        config = json.loads(Variable.get(key='config'))
        tables = config['tables']
        tmp_path = "{}/{}".format(configuration.get_airflow_home(),
                                  'backup/ServiceNow')
        for file in os.listdir(path=tmp_path):
            if file not in tables:
                shutil.rmtree('{}/{}'.format(tmp_path, file))
            else:
                data_dir = "{}/{}".format(tmp_path, file)
                for xml in os.listdir(path=data_dir):
                    expression = str(datetime.date(datetime.now()))
                    if expression not in xml:
                        os.remove("{}/{}".format(data_dir, xml))
Example #14
    def test_parse_bucket_key_from_jinja(self, mock_hook):
        mock_hook.return_value.check_for_key.return_value = False

        Variable.set("test_bucket_key", "s3://bucket/key")

        execution_date = datetime(2020, 1, 1)

        dag = DAG("test_s3_key", start_date=execution_date)
        op = S3KeySensor(
            task_id='s3_key_sensor',
            bucket_key='{{ var.value.test_bucket_key }}',
            bucket_name=None,
            dag=dag,
        )

        ti = TaskInstance(task=op, execution_date=execution_date)
        context = ti.get_template_context()
        ti.render_templates(context)

        op.poke(None)

        self.assertEqual(op.bucket_key, "key")
        self.assertEqual(op.bucket_name, "bucket")
Example #15
    def trigger_dag(self, ids, session=None):

        rows = session.query(FailedDagRun).filter(
            FailedDagRun.id.in_(ids)).all()

        try:
            r_config = Variable.get(key='r_config')
            r_obj = json.loads(r_config)

            for d in rows:
                if d.dag_id in r_obj:
                    if str(d.execution_date)[:19] not in r_obj[d.dag_id]:
                        r_obj[d.dag_id].append(str(d.execution_date)[:19])
                else:
                    r_obj[d.dag_id] = [str(d.execution_date)[:19]]

            Variable.set(key='r_config', value=json.dumps(r_obj))

            for id in ids:
                execution_date = session.query(FailedDagRun).filter(
                    FailedDagRun.id == id).one().execution_date
                dag_id = session.query(FailedDagRun).filter(
                    FailedDagRun.id == id).one().dag_id

                session.query(FailedDagRun).filter(
                    FailedDagRun.id == id).update(
                        {'state': 'recovery_executed'},
                        synchronize_session='fetch')
                Variable.delete(
                    key="{}${}".format(str(execution_date)[:19], dag_id))

        except KeyError as e:
            LoggingMixin().log.warning(str(e))
            Variable.set(key='r_config', value='{}')
            self.create_r_config(ids, session)
Example #16
#
# create a new virtualenv in a convenient location
# `virtualenv .sodavenv && .sodavenv/bin/pip install soda-sql`
#
# Now you can modify the BashOperator in the above DAG as follows:
#

from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from datetime import timedelta

# Use the same variable name that you used when creating the Airflow Variable
soda_sql_project_path = Variable.get('soda_sql_project_path')

default_args = {
    'owner': 'soda_sql',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'soda_sql_scan',
    default_args=default_args,
    description='A simple Soda SQL scan DAG',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
)
# A dummy operator to simulate data ingestion
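# The snippet is cut off here on the source page; the rest is a minimal sketch.
# It assumes the `.sodavenv` virtualenv created above, and `warehouse.yml` and
# `tables/demodata.yml` are placeholder file names inside the project path.
ingest_data_op = DummyOperator(
    task_id='ingest_data',
    dag=dag,
)

# BashOperator that runs the scan with the soda CLI from the dedicated virtualenv
soda_sql_scan_op = BashOperator(
    task_id='soda_sql_scan_demodata',
    bash_command=f'.sodavenv/bin/soda scan {soda_sql_project_path}/warehouse.yml '
                 f'{soda_sql_project_path}/tables/demodata.yml',
    dag=dag,
)

ingest_data_op >> soda_sql_scan_op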
Example #17
from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.subdag import SubDagOperator
from airflow.models.variable import Variable
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.sensors.external_task import ExternalTaskSensor
from slack import WebClient
from slack.errors import SlackApiError
from smart_sensor_file import SmartFileSensor
import logging

# paths to files
default_path = '/Users/aleksandarmilosevic/PycharmProjects/AIRFLOW/run.txt'
path = Variable.get('path_to_file', default_var=default_path)
# slack_token = Variable.get('slack_token')
slack_token = 'xoxb-1727046194672-1709378985940-A6HDRTXvZKqhpy8OFC7aBBOf'
external_dag = Variable.get('external_dag')
external_task = Variable.get('external_task')


# function for pulling value from query_table task
def print_res(task_id, dag_id, **context):
    ti = context['ti']
    logging.info(ti.xcom_pull(task_ids=task_id, dag_id=dag_id))


# function that sends a message to a slack channel
def slack_message(**context):
    client = WebClient(token=slack_token)
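    # The snippet is cut off here on the source page; the rest of the function is
    # a minimal sketch. The channel name and message text are placeholders.
    try:
        client.chat_postMessage(
            channel='#airflow-notifications',
            text='Hello from your Airflow DAG!',
        )
    except SlackApiError as e:
        logging.error(e.response['error'])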
Example #18
def create_dags():

    global dag_creation_dates
    global new_dags
    global email_notify_required

    new_dags = []

    dag_creation_dates = json.loads(Variable.get(key='dag_creation_dates'))
    email_notify_required = is_email_notification_required()

    try:
        for table in config.get('tables'):
            with open(configuration.get_airflow_home() + '/dags/templates/main.py.jinja2') as file_:
                template = Template(file_.read())

            if dag_creation_dates.get(table) is not None:
                start_date = dag_creation_dates.get(table)
            else:
                start_date = get_start_date(config.get('start_date'))
                dag_creation_dates[table] = str(start_date)

            output = template.render(
                data={
                    'dag_id': table,
                    'frequency': config.get('frequency'),
                    'storage_type': storage_type,
                    'start_date': start_date,
                    'email_required': email_notify_required
                }
            )

            with open(configuration.get_airflow_home() + '/dags/generated/dag_'
                      + '{}'.format(table).replace(' ', '_') + '.py', 'w') as f:
                f.write(output)
                new_dags.append('dag_' + '{}'.format(table).replace(' ', '_') + '.py')

        if len(r_config) != 0:

            for table in r_config:
                for exec_date in r_config.get(table):
                    execution_date = str(exec_date).replace(' ', 'T')[0:19]
                    with open(configuration.get_airflow_home()
                              + '/dags/templates/recovery_template.py.jinja2') as file_:
                        template = Template(file_.read())
                        output = template.render(
                            data={'dag_id': table, 'frequency': config.get('frequency'), 'storage_type': storage_type,
                                  'execution_date': execution_date})
                    with open(configuration.get_airflow_home() + '/dags/generated/r_dag_' + '{}_{}'.format(
                            table, execution_date).replace(' ', '_') + '.py', 'w') as f:
                        f.write(output)
                        e = '{}'.format(execution_date).replace(' ', 'T')
                        new_dags.append('r_dag_' + '{}_{}'.format(table, e).replace(' ', '_') + '.py')

        md_dag_ids = settings.Session.query(Dags.dag_id, Dags.fileloc).all()

        for record in md_dag_ids:
            (d_id, loc) = record
            filename = loc[str(loc).rfind('/') + 1:]
            if filename == 'dag_generator.py' or filename == 'dag_cleanup.py':
                continue
            if filename not in new_dags:
                try:
                    if os.path.exists(str(loc)):
                        os.remove(str(loc))
                    else:
                        LoggingMixin().log.warning("{} file doesn't exists !".format(filename))

                    requests.delete(
                        url="http://{}:8080/api/experimental/dags/{}".format(
                            socket.gethostbyname(socket.gethostname()),
                            str(d_id)
                        ),
                        auth=(rest.login, rest.password)
                    )

                    dag_creation_dates.pop(d_id)

                except Exception as e:
                    LoggingMixin().log.error(str(e))

        Variable.set(key='dag_creation_dates', value=json.dumps(dag_creation_dates))

    except AirflowException:

        raise ConfigVariableNotFoundException()
Example #19
import os
import sys
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import pandas as pd
import sqlite3

from airflow.utils.dates import days_ago
from airflow.decorators import dag, task
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.models.variable import Variable

DATASET_ID = Variable.get("DATASET_ID")
BASE_PATH = Variable.get("BASE_PATH")
BUCKET_NAME = Variable.get("BUCKET_NAME")
GOOGLE_CLOUD_CONN_ID = Variable.get("GOOGLE_CLOUD_CONN_ID")
BIGQUERY_TABLE_NAME = "bs_database_sqlite"
GCS_OBJECT_NAME = "extract_transform_database_sqlite.csv"
DATA_PATH = f"{BASE_PATH}/data"
OUT_PATH = f"{DATA_PATH}/{GCS_OBJECT_NAME}"


@dag(
    default_args={
        'owner': 'okza',
        'email': '*****@*****.**',
        'email_on_failure': True
    },
    schedule_interval='0 4 * * *',  # every day at 4 AM
Example #20
# If you installed Soda SQL in your Python environment, you can use
# PythonOperator to invoke Soda Scan. The following shows a sample Airflow DAG
# using PythonOperator that you can use as a starting point.
#

from airflow import DAG
from airflow.models.variable import Variable
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from datetime import timedelta
from sodasql.scan.scan_builder import ScanBuilder
from airflow.exceptions import AirflowFailException

# Make sure that these variables are set in your Airflow
warehouse_yml = Variable.get('soda_sql_warehouse_yml_path')
scan_yml = Variable.get('soda_sql_scan_yml_path')
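# For example, they can be created up front with the Airflow CLI
# (the paths below are placeholders):
#   airflow variables set soda_sql_warehouse_yml_path /path/to/warehouse.yml
#   airflow variables set soda_sql_scan_yml_path /path/to/tables/demodata.yml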

default_args = {
    'owner': 'soda_sql',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


def run_soda_scan(warehouse_yml_file, scan_yml_file):
    scan_builder = ScanBuilder()
    scan_builder.warehouse_yml_file = warehouse_yml_file
    scan_builder.scan_yml_file = scan_yml_file
    scan = scan_builder.build()
    scan_result = scan.execute()
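    # The snippet is cut off here on the source page; the rest is a minimal
    # sketch that assumes ScanResult exposes has_test_failures() and
    # get_test_failures_count(), as in the Soda SQL docs example. The DAG id
    # and task ids below are placeholders.
    if scan_result.has_test_failures():
        failures = scan_result.get_test_failures_count()
        raise AirflowFailException(
            f"Soda Scan found {failures} errors in your data!")


dag = DAG(
    'soda_sql_scan_python',
    default_args=default_args,
    description='A simple Soda SQL scan DAG',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
)

# A dummy operator to simulate data ingestion
ingest_data_op = DummyOperator(task_id='ingest_data', dag=dag)

# PythonOperator that runs the scan; the YAML paths come from the Airflow Variables above
soda_sql_scan_op = PythonOperator(
    task_id='soda_sql_scan_demodata',
    python_callable=run_soda_scan,
    op_kwargs={'warehouse_yml_file': warehouse_yml, 'scan_yml_file': scan_yml},
    dag=dag,
)

ingest_data_op >> soda_sql_scan_op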
Example #21
    def __init__(self, key_name: str) -> None:
        self.secret_key = Variable.get(key_name, deserialize_json=True)
Example #22
    def __getattr__(self, key: str) -> Any:
        from airflow.models.variable import Variable

        self.var = Variable.get(key, deserialize_json=self._deserialize_json)
        return self.var
Example #23
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryInsertJobOperator)
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocSubmitJobOperator)
from airflow.models.variable import Variable
from airflow import DAG
from datetime import datetime, timedelta
from package.api.google.cloud.dataproc import DataprocCreateClusterConfig
import pendulum

local_tz = pendulum.timezone("Asia/Taipei")
gcp_config = Variable.get("gcp_project_1", deserialize_json=True)
dataproc_config = gcp_config["dataproc"]
bucket_config = dataproc_config["bucket"]
cluster_config = DataprocCreateClusterConfig.make(gcp_config)
bigquery_load_storage_config = {
    "load": {
        "source_uris": [
            # four braces in the f-string render as the literal Jinja macro
            # {{ ds }}, which Airflow templates at runtime
            f"gs://{bucket_config['data_lake']}/'your-path'/upload_date={{{{ ds }}}}/*.parquet"
        ],
        "source_format": "PARQUET",
        "destination_table": {
            "project_id": gcp_config["project_id"],
            "dataset_id": "YOUR-DATASET",
            "table_id": "YOUR-TABLE"
        },
        "create_disposition": "CREATE_IF_NEEDED",
        "write_disposition": "WRITE_APPEND",