Example #1
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.conf.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.conf.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.conf.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.conf.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.conf.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.conf.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
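        # Use an implicit-SSL connection when SMTP_SSL is set, otherwise a plain SMTP connection (optionally upgraded via STARTTLS below).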
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
Example #2
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
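    # If authentication is enabled, read the configured backend; a missing auth_backend option falls back to the deprecated airflow_login module.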
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except conf.AirflowConfigException:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated*  behavior of importing airflow_login")
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
Example #3
def _split_tablename(table_input, default_project_id, var_name=None):
    assert default_project_id is not None, "INTERNAL: No default project is specified"

    def var_print(var_name):
        if var_name is None:
            return ""
        else:
            return "Format exception for {var}: ".format(var=var_name)

    if table_input.count('.') + table_input.count(':') > 3:
        raise Exception((
            '{var}Use either : or . to specify project '
            'got {input}'
        ).format(var=var_print(var_name), input=table_input))

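    # Split off an optional '<project>:' prefix; everything after the last ':' is '<dataset>.<table>'.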
    cmpt = table_input.rsplit(':', 1)
    project_id = None
    rest = table_input
    if len(cmpt) == 1:
        project_id = None
        rest = cmpt[0]
    elif len(cmpt) == 2 and cmpt[0].count(':') <= 1:
        if cmpt[-1].count('.') != 2:
            project_id = cmpt[0]
            rest = cmpt[1]
    else:
        raise Exception((
            '{var}Expect format of (<project:)<dataset>.<table>, '
            'got {input}'
        ).format(var=var_print(var_name), input=table_input))

    cmpt = rest.split('.')
    if len(cmpt) == 3:
        assert project_id is None, (
            "{var}Use either : or . to specify project"
        ).format(var=var_print(var_name))
        project_id = cmpt[0]
        dataset_id = cmpt[1]
        table_id = cmpt[2]

    elif len(cmpt) == 2:
        dataset_id = cmpt[0]
        table_id = cmpt[1]
    else:
        raise Exception((
            '{var}Expect format of (<project.|<project:)<dataset>.<table>, '
            'got {input}'
        ).format(var=var_print(var_name), input=table_input))

    if project_id is None:
        if var_name is not None:
            log = LoggingMixin().log
            log.info(
                'Project not included in {var}: {input}; using project "{project}"'.format(
                    var=var_name, input=table_input, project=default_project_id
                )
            )
        project_id = default_project_id

    return project_id, dataset_id, table_id
Example #4
def execute_command(command):
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command)
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        log.error(e)
        raise AirflowException('Celery command failed')
Example #5
def _post_sendgrid_mail(mail_data):
    log = LoggingMixin().log
    sg = sendgrid.SendGridAPIClient(apikey=os.environ.get('SENDGRID_API_KEY'))
    response = sg.client.mail.send.post(request_body=mail_data)
    # 2xx status code.
    if 200 <= response.status_code < 300:
        log.info('Email with subject %s is successfully sent to recipients: %s',
                 mail_data['subject'], mail_data['personalizations'])
    else:
        log.warning('Failed to send out email with subject %s, status code: %s',
                    mail_data['subject'], response.status_code)
Example #6
        def _str(s):
            # cloudant-python doesn't support unicode.
            if isinstance(s, unicode):
                log = LoggingMixin().log
                log.debug(
                    'cloudant-python does not support unicode. Encoding %s as ascii using "ignore".',
                    s
                )
                return s.encode('ascii', 'ignore')

            return s
Example #7
def _parse_s3_config(config_file_name, config_format='boto', profile=None):
    """
    Parses a config file for s3 credentials. Can currently
    parse boto, s3cmd.conf and AWS SDK config formats

    :param config_file_name: path to the config file
    :type config_file_name: str
    :param config_format: config type. One of "boto", "s3cmd" or "aws".
        Defaults to "boto"
    :type config_format: str
    :param profile: profile name in AWS type config file
    :type profile: str
    """
    Config = configparser.ConfigParser()
    if Config.read(config_file_name):  # pragma: no cover
        sections = Config.sections()
    else:
        raise AirflowException("Couldn't read {0}".format(config_file_name))
    # Setting option names depending on file format
    if config_format is None:
        config_format = 'boto'
    conf_format = config_format.lower()
    if conf_format == 'boto':  # pragma: no cover
        if profile is not None and 'profile ' + profile in sections:
            cred_section = 'profile ' + profile
        else:
            cred_section = 'Credentials'
    elif conf_format == 'aws' and profile is not None:
        cred_section = profile
    else:
        cred_section = 'default'
    # Option names
    if conf_format in ('boto', 'aws'):  # pragma: no cover
        key_id_option = 'aws_access_key_id'
        secret_key_option = 'aws_secret_access_key'
        # security_token_option = 'aws_security_token'
    else:
        key_id_option = 'access_key'
        secret_key_option = 'secret_key'
    # Actual Parsing
    if cred_section not in sections:
        raise AirflowException("This config file format is not recognized")
    else:
        try:
            access_key = Config.get(cred_section, key_id_option)
            secret_key = Config.get(cred_section, secret_key_option)
            calling_format = None
            if Config.has_option(cred_section, 'calling_format'):
                calling_format = Config.get(cred_section, 'calling_format')
        except:
            log = LoggingMixin().log
            log.warning("Option Error in parsing s3 config file")
            raise
        return (access_key, secret_key, calling_format)
Example #8
    def handle_failure_retry(context):
        ti = context['ti']
        cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

        if cmd_id is not None:
            cmd = Command.find(cmd_id)
            if cmd is not None:
                if cmd.status == 'running':
                    log = LoggingMixin().log
                    log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                    cmd.cancel()
Example #9
 def get_query_results(self):
     log = LoggingMixin().log
     if self.cmd is not None:
         cmd_id = self.cmd.id
         log.info("command id: " + str(cmd_id))
         query_result_buffer = StringIO()
         self.cmd.get_results(fp=query_result_buffer, inline=True, delim=COL_DELIM)
         query_result = query_result_buffer.getvalue()
         query_result_buffer.close()
         return query_result
     else:
         log.info("Qubole command not found")
Example #10
def list_py_file_paths(directory, safe_mode=True):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns = []
        for root, dirs, files in os.walk(directory, followlinks=True):
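            # Collect ignore patterns from a .airflowignore file in this directory, if present.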
            ignore_file = [f for f in files if f == '.airflowignore']
            if ignore_file:
                f = open(os.path.join(root, ignore_file[0]), 'r')
                patterns += [p for p in f.read().split('\n') if p]
                f.close()
            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
Example #11
    def set(
            cls,
            key,
            value,
            execution_date,
            task_id,
            dag_id,
            session=None):
        """
        Store an XCom value.
        TODO: "pickling" has been deprecated and JSON is preferred.
        "pickling" will be removed in Airflow 2.0.

        :return: None
        """
        session.expunge_all()

        enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
        if enable_pickling:
            value = pickle.dumps(value)
        else:
            try:
                value = json.dumps(value).encode('UTF-8')
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not serialize the XCOM value into JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise

        # remove any duplicate XComs
        session.query(cls).filter(
            cls.key == key,
            cls.execution_date == execution_date,
            cls.task_id == task_id,
            cls.dag_id == dag_id).delete()

        session.commit()

        # insert new XCom
        session.add(XCom(
            key=key,
            value=value,
            execution_date=execution_date,
            task_id=task_id,
            dag_id=dag_id))

        session.commit()
Example #12
    def filter_for_filesize(result, size=None):
        """
        Will test the filepath results and keep only those whose size is at least ``size``

        :param result: a list of dicts returned by Snakebite ls
        :param size: the minimum file size in MB a file must have to be kept
        :return: (list) of dicts whose size meets the criteria
        """
        if size:
            log = LoggingMixin().log
            log.debug('Filtering for file size >= %s in files: %s', size, map(lambda x: x['path'], result))
            size *= settings.MEGABYTE
            result = [x for x in result if x['length'] >= size]
            log.debug('HdfsSensor.poke: after size filter result is %s', result)
        return result
Example #13
def GetDefaultExecutor():
    """Creates a new instance of the configured executor if none exists and returns it"""
    global DEFAULT_EXECUTOR

    if DEFAULT_EXECUTOR is not None:
        return DEFAULT_EXECUTOR

    executor_name = configuration.get('core', 'EXECUTOR')

    DEFAULT_EXECUTOR = _get_executor(executor_name)

    log = LoggingMixin().log
    log.info("Using executor %s", executor_name)

    return DEFAULT_EXECUTOR
Example #14
    def handle_failure_retry(context):
        ti = context['ti']
        cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

        if cmd_id is not None:
            cmd = Command.find(cmd_id)
            if cmd is not None:
                log = LoggingMixin().log
                if cmd.status == 'done':
                    log.info('Command ID: %s has succeeded, hence marking this '
                                'TI as Success.', cmd_id)
                    ti.state = State.SUCCESS
                elif cmd.status == 'running':
                    log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                    cmd.cancel()
Example #15
 def get_val(self):
     log = LoggingMixin().log
     if self._val and self.is_encrypted:
         try:
             fernet = get_fernet()
             return fernet.decrypt(bytes(self._val, 'utf-8')).decode()
         except InvalidFernetToken:
             log.error("Can't decrypt _val for key={}, invalid token "
                       "or value".format(self.key))
             return None
         except Exception:
             log.error("Can't decrypt _val for key={}, FERNET_KEY "
                       "configuration missing".format(self.key))
             return None
     else:
         return self._val
Example #16
    def get_one(cls,
                execution_date,
                key=None,
                task_id=None,
                dag_id=None,
                include_prior_dates=False,
                session=None):
        """
        Retrieve an XCom value, optionally meeting certain criteria.
        TODO: "pickling" has been deprecated and JSON is preferred.
        "pickling" will be removed in Airflow 2.0.

        :return: XCom value
        """
        filters = []
        if key:
            filters.append(cls.key == key)
        if task_id:
            filters.append(cls.task_id == task_id)
        if dag_id:
            filters.append(cls.dag_id == dag_id)
        if include_prior_dates:
            filters.append(cls.execution_date <= execution_date)
        else:
            filters.append(cls.execution_date == execution_date)

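        # Take the most recent matching XCom (latest execution_date, then latest timestamp).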
        query = (
            session.query(cls.value).filter(and_(*filters))
                   .order_by(cls.execution_date.desc(), cls.timestamp.desc()))

        result = query.first()
        if result:
            enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
            if enable_pickling:
                return pickle.loads(result.value)
            else:
                try:
                    return json.loads(result.value.decode('UTF-8'))
                except ValueError:
                    log = LoggingMixin().log
                    log.error("Could not deserialize the XCOM value from JSON. "
                              "If you are using pickles instead of JSON "
                              "for XCOM, then you need to enable pickle "
                              "support for XCOM in your airflow config.")
                    raise
Example #17
    def filter_for_ignored_ext(result, ignored_ext, ignore_copying):
        """
        Will filter the results, if instructed to do so, removing entries that match the ignore criteria

        :param result: (list) of dicts returned by Snakebite ls
        :param ignored_ext: (list) of ignored extensions
        :param ignore_copying: (bool) shall we ignore files matching the ignored extensions?
        :return: (list) of dicts which were not removed
        """
        if ignore_copying:
            log = LoggingMixin().log
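            # Build one regex that matches any of the ignored extensions at the end of a path.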
            regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext)
            ignored_extentions_regex = re.compile(regex_builder)
            log.debug(
                'Filtering result for ignored extensions: %s in files %s',
                ignored_extentions_regex.pattern, map(lambda x: x['path'], result)
            )
            result = [x for x in result if not ignored_extentions_regex.match(x['path'])]
            log.debug('HdfsSensor.poke: after ext filter result is %s', result)
        return result
Example #18
def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func):
    log = LoggingMixin().log

    for i in range(0, max_n):
        try:
            response = request.execute()
            if is_error_func(response):
                raise ValueError(
                    'The response contained an error: {}'.format(response)
                )
            elif is_done_func(response):
                log.info('Operation is done: %s', response)
                return response
            else:
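                # Exponential backoff: wait 2**i seconds plus up to one second of random jitter before polling again.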
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
        except HttpError as e:
            if e.resp.status != 429:
                log.info('Something went wrong. Not retrying: %s', format(e))
                raise
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
Example #19
    def _to_timestamp(cls, col):
        """
        Convert a column of a dataframe to UNIX timestamps if applicable

        :param col:     A Series object representing a column of a dataframe.
        """
        # try and convert the column to datetimes
        # the column MUST have a four digit year somewhere in the string
        # there should be a better way to do this,
        # but just letting pandas try and convert every column without a format
        # caused it to convert floats as well
        # For example, a column of integers
        # between 0 and 10 are turned into timestamps
        # if the column cannot be converted,
        # just return the original column untouched
        try:
            col = pd.to_datetime(col)
        except ValueError:
            log = LoggingMixin().log
            log.warning(
                "Could not convert field to timestamps: %s", col.name
            )
            return col

        # now convert the newly created datetimes into timestamps
        # we have to be careful here
        # because NaT cannot be converted to a timestamp
        # so we have to return NaN
        converted = []
        for i in col:
            try:
                converted.append(i.timestamp())
            except ValueError:
                converted.append(pd.np.NaN)
            except AttributeError:
                converted.append(pd.np.NaN)

        # return a new series that maintains the same index as the original
        return pd.Series(converted, index=col.index)
Example #20
def execute_command(command_to_exec):
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command_to_exec)
    env = os.environ.copy()
    try:
        subprocess.check_call(command_to_exec, stderr=subprocess.STDOUT,
                              close_fds=True, env=env)
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)

        raise AirflowException('Celery command failed')
Example #21
"""
This is an example dag for using the KubernetesPodOperator.
"""
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install 'apache-airflow[kubernetes]'
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'Airflow', 'start_date': days_ago(2)}

    dag = DAG(dag_id='airflow-k8s-test-dag',
              catchup=False,
              default_args=args,
              schedule_interval='* * * * *')

    k = KubernetesPodOperator(
        namespace='default',
Example #22
from airflow import models
from airflow.configuration import conf
from airflow.configuration import AirflowConfigException
from airflow.utils.db import provide_session

import traceback
import re

from airflow.utils.log.logging_mixin import LoggingMixin

LOGIN_MANAGER = flask_login.LoginManager()
LOGIN_MANAGER.login_view = 'airflow.login'  # Calls login() below
LOGIN_MANAGER.login_message = None

log = LoggingMixin().log


class AuthenticationError(Exception):
    pass


class LdapException(Exception):
    pass


def get_ldap_connection(dn=None, password=None):
    
    ldap_uri = conf.get("ldap", "uri")
    isSslEnable = False
Example #23
def create_airflow_rest_connection():

    from airflow.contrib.auth.backends.password_auth import PasswordUser
    import base64
    import os

    session = settings.Session()
    exists = session.query(models.User).filter(models.User.username == 'application').scalar()

    if exists is None:

        LoggingMixin().log.info("creating 'application' user for mini-BRS...")

        # create 'application' user

        random_key = str(base64.urlsafe_b64encode(os.urandom(32)))
        user = PasswordUser(models.User())
        user.username = '******'
        user.email = '*****@*****.**'
        user.password = random_key
        session.add(user)
        session.commit()
        session.close()

        # create 'application' airflow connection
        rest = Connection(
            conn_id='rest',
            login='******',
            password=random_key
        )

        session = settings.Session()
        session.add(rest)
        session.commit()
        session.close()

        # create 'admin' user
        # admin_password = str(base64.urlsafe_b64encode(os.urandom(32)))

        config_parser = configuration.AirflowConfigParser()

        config_parser.read(
            configuration.get_airflow_config(
                        configuration.get_airflow_home()
                    )
        )

        u = config_parser.get(
            section='core',
            key='username'
        )

        p = config_parser.get(
            section='core',
            key='password'
        )

        user = PasswordUser(models.User())
        user.username = u
        user.email = '*****@*****.**'
        user.password = p
        user.superuser = True
        session = settings.Session()
        session.add(user)
        session.commit()
        session.close()

        config_parser.remove_option(
            section='core',
            option='username'
        )

        config_parser.remove_option(
            section='core',
            option='password'
        )

        file = open(configuration.get_airflow_config(configuration.get_airflow_home()), 'w')

        config_parser.write(file)

        file.close()
Example #24
    def _upload(self, context):

        # dropbox Connection details
        try:
            credentials_dropbox = BaseHook.get_connection(self.storage_conn_id)
            self.dropbox_access_token = credentials_dropbox.password
        except AirflowException as e:
            raise DropboxConnectionNotFoundException

        if self.is_storage_available(self.dropbox_access_token):
            try:
                LoggingMixin().log.info("Dropbox Storage avalaible")
                l_file_path = self.file_name.replace('.csv', '.json')
                file_name = l_file_path[l_file_path.rfind('/') + 1:]

                dt_current = datetime.strptime(self.execution_date[:19],
                                               "%Y-%m-%dT%H:%M:%S")

                exec_hour = str(dt_current.hour)
                exec_minute = str(dt_current.minute)
                exec_second = str(dt_current.second)

                if exec_hour == '0' and exec_minute == '0' and exec_second == '0':
                    dt_current = dt_current - timedelta(days=1)
                    r_file_path = '{}/{}/{}/{}/{}'.format(
                        '/mbrs', 'Servicenow', self.table,
                        '{}-{}-{}'.format(dt_current.year, dt_current.month,
                                          dt_current.day), file_name)
                else:
                    r_file_path = '{}/{}/{}/{}/{}'.format(
                        '/mbrs', 'Servicenow', self.table,
                        '{}-{}-{}'.format(dt_current.year, dt_current.month,
                                          dt_current.day), file_name)

                LoggingMixin().log.info("Running dropbox upload process...")
                try:
                    file_size = os.path.getsize(l_file_path)
                    CHUNK_SIZE = 4 * 1024 * 1024
                    dbx = dropbox.Dropbox(self.dropbox_access_token,
                                          timeout=600)
                    if file_size <= CHUNK_SIZE:
                        with open(l_file_path, 'rb') as f:
                            dbx.files_upload(
                                f.read(),
                                r_file_path,
                                mode=dropbox.files.WriteMode.overwrite)
                            f.close()
                            return True
                    else:
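                        # Larger files are streamed in 4 MB chunks through a Dropbox upload session.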
                        with open(l_file_path, 'rb') as f:
                            upload_session_start_result = dbx.files_upload_session_start(
                                f.read(CHUNK_SIZE))
                            cursor = dropbox.files.UploadSessionCursor(
                                session_id=upload_session_start_result.
                                session_id,
                                offset=f.tell())
                            commit = dropbox.files.CommitInfo(path=r_file_path)
                            while f.tell() < file_size:
                                if (file_size - f.tell()) <= CHUNK_SIZE:
                                    print(
                                        dbx.files_upload_session_finish(
                                            f.read(CHUNK_SIZE), cursor,
                                            commit))
                                else:
                                    dbx.files_upload_session_append_v2(
                                        f.read(CHUNK_SIZE), cursor)
                                    cursor.offset = f.tell()

                            f.close()
                            return True
                except Exception as e:
                    LoggingMixin().log.error(
                        "ServiceNow2DropBoxTransOperator : exception in dropbox upload for token : {} {}"
                        .format(self.dropbox_access_token, e))
                    return False
            except Exception as e:
                print(e)
        else:
            LoggingMixin().log.info("Dropbox Storage not avalaible")
            return False
Example #25
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from builtins import object
import imp
import inspect
import os
import re
import sys
import pkg_resources

from airflow import configuration
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

import_errors = {}


class AirflowPluginException(Exception):
    pass


class AirflowPlugin(object):
    name = None
    operators = []
    sensors = []
    hooks = []
    executors = []
    macros = []
Example #26
def list_py_file_paths(directory, safe_mode=True,
                       include_examples=None):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    if include_examples is None:
        include_examples = conf.getboolean('core', 'LOAD_EXAMPLES')
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns += [re.compile(p) for p in f.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(p.search(os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as fp:
                            content = fp.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
Example #27
"""Sentry Integration"""

from functools import wraps

from airflow.configuration import conf
from airflow.utils.db import provide_session
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.state import State

log = LoggingMixin().log


class DummySentry:
    """
    Blank class for Sentry.
    """
    @classmethod
    def add_tagging(cls, task_instance):
        """
        Blank function for tagging.
        """

    @classmethod
    def add_breadcrumbs(cls, task_instance, session=None):
        """
Example #28

import ssl

from airflow import configuration
from airflow.exceptions import AirflowConfigException, AirflowException
from airflow.utils.log.logging_mixin import LoggingMixin


def _broker_supports_visibility_timeout(url):
    return url.startswith("redis://") or url.startswith("sqs://")


log = LoggingMixin().log

broker_url = configuration.conf.get('celery', 'BROKER_URL')

broker_transport_options = configuration.conf.getsection(
    'celery_broker_transport_options')
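# Default the visibility timeout to six hours (21600 s) when the broker (Redis or SQS) supports it.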
if 'visibility_timeout' not in broker_transport_options:
    if _broker_supports_visibility_timeout(broker_url):
        broker_transport_options['visibility_timeout'] = 21600

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],
    'event_serializer':
    'json',
    'worker_prefetch_multiplier':
    1,
Example #29
from __future__ import print_function
from __future__ import unicode_literals

import logging
import logging.config
import os
import sys

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.pool import NullPool

from airflow import configuration as conf
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

class DummyStatsLogger(object):

    @classmethod
    def incr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def decr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass
Example #30
 def get_connection(cls, conn_id):
     conn = random.choice(cls.get_connections(conn_id))
     if conn.host:
         log = LoggingMixin().log
         log.info("Using connection to: %s", conn.host)
     return conn
Example #31
def list_py_file_paths(directory,
                       safe_mode=True,
                       include_examples=conf.getboolean(
                           'core', 'LOAD_EXAMPLES')):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [
                        p for p in f.read().split('\n') if p
                    ]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d for d in dirs if not any(
                    re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(
            list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
Example #32

from hdfs import InsecureClient, HdfsError

from airflow import configuration
from airflow.exceptions import AirflowException
from airflow.hooks.base_hook import BaseHook
from airflow.utils.log.logging_mixin import LoggingMixin

_kerberos_security_mode = configuration.conf.get("core",
                                                 "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().log
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
        self.proxy_user = proxy_user
Example #33
"""Manages all plugins."""
# noinspection PyDeprecation
import imp  # pylint: disable=deprecated-module
import inspect
import os
import re
import sys
from typing import Any, Callable, Dict, List, Optional, Set, Type

import pkg_resources

from airflow import settings
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

import_errors = {}


class AirflowPluginException(Exception):
    """Exception when loading plugin."""


class AirflowPlugin:
    """Class used to define AirflowPlugin."""
    name: Optional[str] = None
    operators: List[Any] = []
    sensors: List[Any] = []
    hooks: List[Any] = []
    executors: List[Any] = []
Example #34
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG
from datetime import datetime, timedelta

log = LoggingMixin().log

try:
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {
        "owner": "Robin",
        "start_date": datetime(2019, 1, 30),
        "retries": 2,
        "retry_delay": timedelta(minutes=50),
        "email": ["*****@*****.**"],
        "pool": "occupeye_pool"
    }

    dag = DAG(
        dag_id="occupeye_aggregator",
        default_args=args,
        schedule_interval='0 5 * * *',
    )

    surveys_to_s3 = KubernetesPodOperator(
        namespace="airflow",
        image=
        "quay.io/mojanalytics/airflow-occupeye-dashboard-aggregation:latest",
        cmds=["bash", "-c"],
        arguments=["Rscript main.R"],
Example #35

from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {
        'owner': 'airflow',
        'start_date': days_ago(2)
    }

    dag = DAG(
        dag_id='example_kubernetes_operator',
        default_args=args,
        schedule_interval=None)
Example #36

import ssl

from airflow import configuration
from airflow.exceptions import AirflowConfigException, AirflowException
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

broker_transport_options = configuration.getsection('celery_broker_transport_options')
if broker_transport_options is None:
    broker_transport_options = {'visibility_timeout': 21600}

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],
    'event_serializer': 'json',
    'worker_prefetch_multiplier': 1,
    'task_acks_late': True,
    'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'),
    'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'),
    'broker_url': configuration.get('celery', 'BROKER_URL'),
    'broker_transport_options': broker_transport_options,
    'result_backend': configuration.get('celery', 'RESULT_BACKEND'),
Example #37
 def get_connection(cls, conn_id):
     conn = random.choice(cls.get_connections(conn_id))
     if conn.host:
         log = LoggingMixin().log
         log.info("Using connection to: %s", conn.host)
     return conn
Example #38
import shlex
import subprocess
import sys
import warnings
from base64 import b64encode
from collections import OrderedDict
# Ignored Mypy on configparser because it thinks the configparser module has no _UNSET attribute
from configparser import _UNSET, ConfigParser, NoOptionError, NoSectionError  # type: ignore

from cryptography.fernet import Fernet
from zope.deprecation import deprecated

from airflow.exceptions import AirflowConfigException
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

# show Airflow's deprecation warnings
warnings.filterwarnings(
    action='default', category=DeprecationWarning, module='airflow')
warnings.filterwarnings(
    action='default', category=PendingDeprecationWarning, module='airflow')


def expand_env_var(env_var):
    """
    Expands (potentially nested) env vars by repeatedly applying
    `expandvars` and `expanduser` until interpolation stops having
    any effect.
    """
    if not env_var:
Example #39
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Default celery configuration."""
import ssl
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

#broker_url = conf.get('celery', 'BROKER_URL')
broker_url = 'pyamqp://*****:*****@rabbit01:5672/airflow'
log.info('Using broker_url ' + broker_url)

#result_backend = conf.get('celery', 'RESULT_BACKEND')
result_backend = "db+mysql://root:3point142@maria01:3306/airflow"
log.info('Using result_backend ' + result_backend)

default_queue = "celery.inbound"
log.info('Using default_queue ' + default_queue)

worker_concurrency = "16"
log.info('Using worker_concurrency ' + worker_concurrency)
Example #40

from airflow.hooks.base_hook import BaseHook
from airflow import configuration

from hdfs import InsecureClient, HdfsError

from airflow.utils.log.logging_mixin import LoggingMixin

_kerberos_security_mode = configuration.get("core", "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().log
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise
from airflow.exceptions import AirflowException


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
Example #41
from future import standard_library

from six import iteritems

from airflow.utils.log.logging_mixin import LoggingMixin

standard_library.install_aliases()

from builtins import str
from collections import OrderedDict
from six.moves import configparser

from airflow.exceptions import AirflowConfigException

log = LoggingMixin().log

# show Airflow's deprecation warnings
warnings.filterwarnings(
    action='default', category=DeprecationWarning, module='airflow')
warnings.filterwarnings(
    action='default', category=PendingDeprecationWarning, module='airflow')

if six.PY3:
    ConfigParser = configparser.ConfigParser
else:
    ConfigParser = configparser.SafeConfigParser


def generate_fernet_key():
    try:
Example #42
from datetime import datetime
from functools import wraps
from urllib.parse import urlparse

from airflow.settings import Stats
from airflow.utils.log.logging_mixin import LoggingMixin
from requests import PreparedRequest
from requests import Session

from airflow_metrics.utils.fn_utils import get_calling_operator
from airflow_metrics.utils.fn_utils import once
from airflow_metrics.utils.hook_utils import HookManager

LOG = LoggingMixin().log

BLACKLIST = {
    'api.datadoghq.com',
}


def attach_request_meta(ctx, *args, **kwargs):
    if len(args) >= 2 and isinstance(args[1], PreparedRequest):
        request = args[1]
        url = request.url
    else:
        LOG.info('No url found for request')
        return
    ctx['url'] = url

    domain = urlparse(url).netloc
    if domain in BLACKLIST:
Example #43
 def get_connection(cls, conn_id):  # type: (str) -> Connection
     conn = random.choice(list(cls.get_connections(conn_id)))
     if conn.host:
         log = LoggingMixin().log
         log.info("Using connection to: %s", conn.debug_info())
     return conn
Example #44
 def get_connection(cls, conn_id):  # type: (str) -> Connection
     conn = random.choice(list(cls.get_connections(conn_id)))
     if conn.host:
         log = LoggingMixin().log
         log.info("Using connection to: %s", conn.debug_info())
     return conn
Example #45
def create_dags():

    global dag_creation_dates
    global new_dags
    global email_notify_required

    new_dags = []

    dag_creation_dates = json.loads(Variable.get(key='dag_creation_dates'))
    email_notify_required = is_email_notification_required()

    try:
        for table in config.get('tables'):
            with open(configuration.get_airflow_home() + '/dags/templates/main.py.jinja2') as file_:
                template = Template(file_.read())

            if dag_creation_dates.get(table) is not None:
                start_date = dag_creation_dates.get(table)
            else:
                start_date = get_start_date(config.get('start_date'))
                dag_creation_dates[table] = str(start_date)

            output = template.render(
                data={
                    'dag_id': table,
                    'frequency': config.get('frequency'),
                    'storage_type': storage_type,
                    'start_date': start_date,
                    'email_required': email_notify_required
                }
            )

            with open(configuration.get_airflow_home() + '/dags/generated/dag_'
                      + '{}'.format(table).replace(' ', '_') + '.py', 'w') as f:
                f.write(output)
                new_dags.append('dag_' + '{}'.format(table).replace(' ', '_') + '.py')

        if len(r_config) != 0:

            for table in r_config:
                for exec_date in r_config.get(table):
                    execution_date = str(exec_date).replace(' ', 'T')[0:19]
                    with open(configuration.get_airflow_home()
                              + '/dags/templates/recovery_template.py.jinja2') as file_:
                        template = Template(file_.read())
                        output = template.render(
                            data={'dag_id': table, 'frequency': config.get('frequency'), 'storage_type': storage_type,
                                  'execution_date': execution_date})
                    with open(configuration.get_airflow_home() + '/dags/generated/r_dag_' + '{}_{}'.format(
                            table, execution_date).replace(' ', '_') + '.py', 'w') as f:
                        f.write(output)
                        e = '{}'.format(execution_date).replace(' ', 'T')
                        new_dags.append('r_dag_' + '{}_{}'.format(table, e).replace(' ', '_') + '.py')

        md_dag_ids = settings.Session.query(Dags.dag_id, Dags.fileloc).all()

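        # Delete generated DAG files that are no longer part of the new configuration and deregister them via the experimental REST API.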
        for record in md_dag_ids:
            (d_id, loc) = record
            filename = loc[str(loc).rfind('/') + 1:]
            if filename == 'dag_generator.py' or filename == 'dag_cleanup.py':
                continue
            if filename not in new_dags:
                try:
                    if os.path.exists(str(loc)):
                        os.remove(str(loc))
                    else:
                        LoggingMixin().log.warning("{} file doesn't exists !".format(filename))

                    requests.delete(
                        url="http://{}:8080/api/experimental/dags/{}".format(
                            socket.gethostbyname(socket.gethostname()),
                            str(d_id)
                        ),
                        auth=(rest.login, rest.password)
                    )

                    dag_creation_dates.pop(d_id)

                except Exception as e:
                    LoggingMixin().log.error(str(e))

        Variable.set(key='dag_creation_dates', value=json.dumps(dag_creation_dates))

    except AirflowException:

        raise ConfigVariableNotFoundException()
Example #46
from datetime import datetime, timedelta
from os import path

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.log.logging_mixin import LoggingMixin

from helpers import SqlQueries
from operators import (DataQualityOperator, DdlRedshiftOperator,
                       DataQualityValidator, LoadFactOperator,
                       LoadDimensionOperator)
from stage_s3_to_redshift_and_validate_subdag import stage_s3_to_redshift_dag

log = LoggingMixin().log
# AWS_KEY = os.environ.get('AWS_KEY')
# AWS_SECRET = os.environ.get('AWS_SECRET')
AIRFLOW_AWS_CREDENTIALS_ID = "aws_credentials"
AIRFLOW_REDSHIFT_CONN_ID = "redshift"

# S3_BUCKET="udacity-dend"
S3_BUCKET = "victor-nano-sparkify-raw-data-us-west-2"
S3_LOGS_KEY = "log_data"
S3_SONGS_KEY = "song_data"
LOG_JSONPATH = "log_json_path.json"

default_args = {
    'owner': 'Victor Costa',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 11),
    'retries': 3,
Example #47
    'result_serializer': 'pickle',
    'worker_prefetch_multiplier': 1,
    'task_acks_late': True,
    'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'),
    'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'),
    'broker_url': configuration.get('celery', 'BROKER_URL'),
    'broker_transport_options': {'visibility_timeout': 21600},
    'result_backend': configuration.get('celery', 'CELERY_RESULT_BACKEND'),
    'worker_concurrency': configuration.getint('celery', 'CELERYD_CONCURRENCY'),
}

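# Broker SSL is optional: if CELERY_SSL_ACTIVE is missing from the config, run without SSL.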
celery_ssl_active = False
try:
    celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE')
except AirflowConfigException as e:
    log = LoggingMixin().log
    log.warning("Celery Executor will run without SSL")

try:
    if celery_ssl_active:
        broker_use_ssl = {'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                          'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                          'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                          'cert_reqs': ssl.CERT_REQUIRED}
        DEFAULT_CELERY_CONFIG['broker_use_ssl'] = broker_use_ssl
except AirflowConfigException as e:
    raise AirflowException('AirflowConfigException: CELERY_SSL_ACTIVE is True, '
                           'please ensure CELERY_SSL_KEY, '
                           'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
except Exception as e:
    raise AirflowException('Exception: There was an unknown Celery SSL Error. '
Example #48
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from functools import wraps

import os
import contextlib

from airflow import settings
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log


@contextlib.contextmanager
def create_session():
    """
    Contextmanager that will create and teardown a session.
    """
    session = settings.Session()
    try:
        yield session
        session.expunge_all()
        session.commit()
    except:
        session.rollback()
        raise