Example #1
def list_py_file_paths(directory, safe_mode=True,
                       include_examples=conf.getboolean('core', 'LOAD_EXAMPLES')):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
    contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [p for p in f.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
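A minimal usage sketch, assuming an Airflow 1.10-style layout where this function lives in airflow.utils.dag_processing; the dags folder path is illustrative:

from airflow.utils.dag_processing import list_py_file_paths

# Collect candidate DAG files under a hypothetical dags folder,
# skipping the example DAGs bundled with Airflow.
for path in list_py_file_paths('/opt/airflow/dags', safe_mode=True,
                               include_examples=False):
    print(path)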
Example #2
from typing import Dict

import attr
import jinja2
from cattr import structure, unstructure

from airflow.models.base import Operator
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.module_loading import import_string

ENV = jinja2.Environment()

PIPELINE_OUTLETS = "pipeline_outlets"
PIPELINE_INLETS = "pipeline_inlets"
AUTO = "auto"

log = LoggingMixin().log


@attr.s(auto_attribs=True)
class Metadata:
    """
    Class for serialized entities.
    """
    type_name: str = attr.ib()
    data: Dict = attr.ib()


def _get_instance(meta: Metadata):
    """
    Instantiate an object from Metadata.
    """
    # One plausible completion: resolve the class from its dotted-path name
    # and rebuild the instance from the serialized attribute dict with cattr.
    cls = import_string(meta.type_name)
    return structure(meta.data, cls)
    def get_connection(cls, conn_id):
        conn = random.choice(cls.get_connections(conn_id))
        if conn.host:
            log = LoggingMixin().log
            log.info("Using connection to: %s", conn.host)
        return conn
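The examples above all obtain a logger with LoggingMixin().log. A minimal sketch of the same mixin used directly in a custom class (class name and message are illustrative):

from airflow.utils.log.logging_mixin import LoggingMixin


class ConnectionAuditor(LoggingMixin):
    # Inheriting from LoggingMixin provides a ready-made self.log logger.
    def audit(self, conn_id):
        self.log.info("Auditing connection %s", conn_id)


ConnectionAuditor().audit("servicenow_default")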
Example #4
import datetime
import logging
from typing import TYPE_CHECKING, Dict, Optional, Union

import pendulum
from dateutil import relativedelta

import airflow
from airflow.exceptions import AirflowException
from airflow.models.baseoperator import BaseOperator
from airflow.models.connection import Connection
from airflow.models.dag import DAG
from airflow.serialization.enums import DagAttributeTypes as DAT, Encoding
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.www.utils import get_python_source

LOG = LoggingMixin().log

# Serialization failure returns 'failed'.
FAILED = 'serialization_failed'


class Serialization:
    """Serialization provides utils for serialization."""

    # JSON primitive types.
    _primitive_types = (int, bool, float, str)

    # Time types.
    # datetime.date and datetime.time are converted to strings.
    _datetime_types = (datetime.datetime, )
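Example #4 only shows the class-level type tuples; a rough sketch of how such tuples typically drive serialization decisions (the helper below is illustrative, not Airflow's actual implementation):

import datetime

_primitive_types = (int, bool, float, str)
_datetime_types = (datetime.datetime,)


def _serialize_value(value):
    # JSON primitives pass through unchanged; datetimes are converted
    # to POSIX timestamps so they stay JSON-serializable.
    if isinstance(value, _primitive_types):
        return value
    if isinstance(value, _datetime_types):
        return value.timestamp()
    raise TypeError('unsupported type: {}'.format(type(value)))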
Example #5
def fetch_servicenow_record_count(table_name, execution_date, **kwargs):
    """
    This method calls the service now api for a particular table and time period
    and gets count of records for a particular table
    :param: table_name : for which count of records is fetched
    :param: execution_date : airflow execution date of the dag
    :return: task_id
    """

    # check for empty
    if is_empty(table_name) or is_empty(execution_date):
        raise InvalidArguments("table_name, execution_date can't be empty")

    # check for none
    if table_name is None or execution_date is None:
        raise InvalidArguments("table_name, execution_date can't be None")

    try:

        try:
            # Load Configuration Data
            config = json.loads(Variable.get("config"))
            frequency = config['frequency']
            execution_datetime = datetime.strptime(execution_date[:19],
                                                   "%Y-%m-%dT%H:%M:%S")

            if frequency == 'hourly':
                freq_param = timedelta(hours=-1)

            elif frequency == 'daily':
                freq_param = timedelta(days=-1)
            elif frequency == 'monthly':
                freq_param = timedelta(days=-1 * one_month_ago(execution_date))

            elif frequency == 'half-hourly':
                freq_param = timedelta(minutes=-30)
            else:
                freq_param = timedelta(hours=-1)

            to_time = datetime(year=execution_datetime.year,
                               month=execution_datetime.month,
                               day=execution_datetime.day,
                               hour=execution_datetime.hour,
                               minute=execution_datetime.minute,
                               second=execution_datetime.second,
                               tzinfo=pendulum.timezone("UTC"))
            from_time = to_time + freq_param

        except KeyError as e:
            raise ConfigVariableNotFoundException()
        try:
            credentials_snow = BaseHook.get_connection("servicenow_default")
            login = credentials_snow.login
            password = credentials_snow.password
            host = credentials_snow.host
        except AirflowException as e:
            raise ServiceNowConnectionNotFoundException()

        service_now_hook = ServiceNowHook(host=host,
                                          login=login,
                                          password=password)
        response = service_now_hook.api_call(
            route='/api/now/stats/{}'.format(table_name),
            accept='application/json',
            query_params={
                'sysparm_count':
                'true',
                'sysparm_query':
                "sys_updated_onBETWEENjavascript:gs.dateGenerate('{}','{}')"
                "@javascript:gs.dateGenerate('{}','{}')".format(
                    str(from_time.date()), str(from_time.time()),
                    str(to_time.date()), str(to_time.time()))
            })
        print('response :' + response)
        count_of_records = int(
            json.loads(response)['result']['stats']['count'])

        log = LoggingMixin().log
        log.info("Getting count from: %s to: %s", from_time, to_time)
        log.info("Total number of records: %s", count_of_records)

        if int(count_of_records) == 0:
            return 'count_is_zero'
        elif int(count_of_records) > config['threshold']:
            return 'count_exceeds_threshold'
        else:
            return 'count_within_threshold'

    except Exception as e:

        kwargs['ti'].xcom_push(key='exception', value=str(e))

        instance = kwargs['task_instance']
        dag_id = str(instance.dag_id)
        task_id = str(instance.task_id)
        msg = str(e)
        execution_date = str(instance.execution_date)
        run_id = str(kwargs['run_id'])

        execution_date = execution_date.replace('T', ' ')[0:19]
        key = '{}${}'.format(execution_date, dag_id)

        value = {
            'dag_id': dag_id,
            'execution_date': execution_date,
            'task_id': task_id,
            'run_id': run_id,
            'error_msg': msg
        }

        Variable.set(key=key, value=json.dumps(value))

        raise
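Because the callable returns a downstream task_id, it is presumably used as the branching callable of a BranchPythonOperator; a hedged wiring sketch, assuming an Airflow 1.x import path, an existing dag object, and a version where op_kwargs is template-rendered:

from airflow.operators.python_operator import BranchPythonOperator

check_count = BranchPythonOperator(
    task_id='fetch_servicenow_record_count',
    python_callable=fetch_servicenow_record_count,
    # 'incident' is an illustrative table name; '{{ ts }}' passes the
    # execution timestamp string the callable expects.
    op_kwargs={'table_name': 'incident', 'execution_date': '{{ ts }}'},
    provide_context=True,
    dag=dag,
)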
Example #6
    def get_connection(cls, conn_id):  # type: (str) -> Connection
        conn = random.choice(list(cls.get_connections(conn_id)))
        if conn.host:
            log = LoggingMixin().log
            log.info("Using connection to: %s", conn.debug_info())
        return conn
Example #7
    def _upload(self, context):

        # dropbox Connection details
        try:
            credentials_dropbox = BaseHook.get_connection(self.storage_conn_id)
            self.dropbox_access_token = credentials_dropbox.password
        except AirflowException as e:
            raise DropboxConnectionNotFoundException

        if self.is_storage_available(self.dropbox_access_token):
            try:
                LoggingMixin().log.info("Dropbox Storage avalaible")
                l_file_path = self.file_name.replace('.csv', '.json')
                file_name = l_file_path[l_file_path.rfind('/') + 1:]

                dt_current = datetime.strptime(self.execution_date[:19],
                                               "%Y-%m-%dT%H:%M:%S")

                exec_hour = str(dt_current.hour)
                exec_minute = str(dt_current.minute)
                exec_second = str(dt_current.second)

                # A midnight execution is attributed to the previous day's folder.
                if exec_hour == '0' and exec_minute == '0' and exec_second == '0':
                    dt_current = dt_current - timedelta(days=1)

                # Remote path: /mbrs/Servicenow/<table>/<YYYY-M-D>/<file_name>
                r_file_path = '{}/{}/{}/{}/{}'.format(
                    '/mbrs', 'Servicenow', self.table,
                    '{}-{}-{}'.format(dt_current.year, dt_current.month,
                                      dt_current.day), file_name)

                LoggingMixin().log.info("Running dropbox upload process...")
                try:
                    file_size = os.path.getsize(l_file_path)
                    CHUNK_SIZE = 4 * 1024 * 1024
                    dbx = dropbox.Dropbox(self.dropbox_access_token,
                                          timeout=600)
                    if file_size <= CHUNK_SIZE:
                        with open(l_file_path, 'rb') as f:
                            dbx.files_upload(
                                f.read(),
                                r_file_path,
                                mode=dropbox.files.WriteMode.overwrite)
                            f.close()
                            return True
                    else:
                        with open(l_file_path, 'rb') as f:
                            upload_session_start_result = dbx.files_upload_session_start(
                                f.read(CHUNK_SIZE))
                            cursor = dropbox.files.UploadSessionCursor(
                                session_id=upload_session_start_result.
                                session_id,
                                offset=f.tell())
                            commit = dropbox.files.CommitInfo(path=r_file_path)
                            while f.tell() < file_size:
                                if (file_size - f.tell()) <= CHUNK_SIZE:
                                    print(
                                        dbx.files_upload_session_finish(
                                            f.read(CHUNK_SIZE), cursor,
                                            commit))
                                else:
                                    dbx.files_upload_session_append_v2(
                                        f.read(CHUNK_SIZE), cursor)
                                    cursor.offset = f.tell()

                            f.close()
                            return True
                except Exception as e:
                    LoggingMixin().log.error(
                        "ServiceNow2DropBoxTransOperator : exception in dropbox upload for token : {} {}"
                        .format(self.dropbox_access_token, e))
                    return False
            except Exception as e:
                print(e)
        else:
            LoggingMixin().log.info("Dropbox Storage not avalaible")
            return False
Example #8
def create_airflow_rest_connection():

    from airflow.contrib.auth.backends.password_auth import PasswordUser
    import base64
    import os

    session = settings.Session()
    exists = session.query(models.User).filter(models.User.username == 'application').scalar()

    if exists is None:

        LoggingMixin().log.info("creating 'application' user for mini-BRS...")

        # create 'application' user

        random_key = str(base64.urlsafe_b64encode(os.urandom(32)))
        user = PasswordUser(models.User())
        user.username = '******'
        user.email = '*****@*****.**'
        user.password = random_key
        session.add(user)
        session.commit()
        session.close()

        # create 'application' airflow connection
        rest = Connection(
            conn_id='rest',
            login='******',
            password=random_key
        )

        session = settings.Session()
        session.add(rest)
        session.commit()
        session.close()

        # create 'admin' user
        # admin_password = str(base64.urlsafe_b64encode(os.urandom(32)))

        config_parser = configuration.AirflowConfigParser()

        config_parser.read(
            configuration.get_airflow_config(
                        configuration.get_airflow_home()
                    )
        )

        u = config_parser.get(
            section='core',
            key='username'
        )

        p = config_parser.get(
            section='core',
            key='password'
        )

        user = PasswordUser(models.User())
        user.username = u
        user.email = '*****@*****.**'
        user.password = p
        user.superuser = True
        session = settings.Session()
        session.add(user)
        session.commit()
        session.close()

        config_parser.remove_option(
            section='core',
            option='username'
        )

        config_parser.remove_option(
            section='core',
            option='password'
        )

        with open(configuration.get_airflow_config(configuration.get_airflow_home()), 'w') as file:
            config_parser.write(file)
Example #9
def create_dags():

    global dag_creation_dates
    global new_dags
    global email_notify_required

    new_dags = []

    dag_creation_dates = json.loads(Variable.get(key='dag_creation_dates'))
    email_notify_required = is_email_notification_required()

    try:
        for table in config.get('tables'):
            with open(configuration.get_airflow_home() + '/dags/templates/main.py.jinja2') as file_:
                template = Template(file_.read())

            if dag_creation_dates.get(table) is not None:
                start_date = dag_creation_dates.get(table)
            else:
                start_date = get_start_date(config.get('start_date'))
                dag_creation_dates[table] = str(start_date)

            output = template.render(
                data={
                    'dag_id': table,
                    'frequency': config.get('frequency'),
                    'storage_type': storage_type,
                    'start_date': start_date,
                    'email_required': email_notify_required
                }
            )

            with open(configuration.get_airflow_home() + '/dags/generated/dag_'
                      + '{}'.format(table).replace(' ', '_') + '.py', 'w') as f:
                f.write(output)
                new_dags.append('dag_' + '{}'.format(table).replace(' ', '_') + '.py')

        if len(r_config) != 0:

            for table in r_config:
                for exec_date in r_config.get(table):
                    execution_date = str(exec_date).replace(' ', 'T')[0:19]
                    with open(configuration.get_airflow_home()
                              + '/dags/templates/recovery_template.py.jinja2') as file_:
                        template = Template(file_.read())
                        output = template.render(
                            data={'dag_id': table, 'frequency': config.get('frequency'), 'storage_type': storage_type,
                                  'execution_date': execution_date})
                    with open(configuration.get_airflow_home() + '/dags/generated/r_dag_' + '{}_{}'.format(
                            table, execution_date).replace(' ', '_') + '.py', 'w') as f:
                        f.write(output)
                        e = '{}'.format(execution_date).replace(' ', 'T')
                        new_dags.append('r_dag_' + '{}_{}'.format(table, e).replace(' ', '_') + '.py')

        md_dag_ids = settings.Session.query(Dags.dag_id, Dags.fileloc).all()

        for record in md_dag_ids:
            (d_id, loc) = record
            filename = loc[str(loc).rfind('/') + 1:]
            if filename == 'dag_generator.py' or filename == 'dag_cleanup.py':
                continue
            if filename not in new_dags:
                try:
                    if os.path.exists(str(loc)):
                        os.remove(str(loc))
                    else:
                        LoggingMixin().log.warning("{} file doesn't exist!".format(filename))

                    requests.delete(
                        url="http://{}:8080/api/experimental/dags/{}".format(
                            socket.gethostbyname(socket.gethostname()),
                            str(d_id)
                        ),
                        auth=(rest.login, rest.password)
                    )

                    dag_creation_dates.pop(d_id)

                except Exception as e:
                    LoggingMixin().log.error(str(e))

        Variable.set(key='dag_creation_dates', value=json.dumps(dag_creation_dates))

    except AirflowException:

        raise ConfigVariableNotFoundException()
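The generation step above amounts to rendering a Jinja2 template with per-table parameters and writing the result out as a new DAG file; a stripped-down sketch with illustrative paths and values:

from jinja2 import Template

# Illustrative parameters; the real ones come from the 'config' Variable.
params = {
    'dag_id': 'incident',
    'frequency': 'hourly',
    'storage_type': 'dropbox',
    'start_date': '2020-01-01',
    'email_required': False,
}

with open('/path/to/dags/templates/main.py.jinja2') as src:
    rendered = Template(src.read()).render(data=params)

with open('/path/to/dags/generated/dag_incident.py', 'w') as dst:
    dst.write(rendered)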