def _split_tablename(table_input, default_project_id, var_name=None):
    assert default_project_id is not None, "INTERNAL: No default project is specified"

    def var_print(var_name):
        if var_name is None:
            return ""
        else:
            return "Format exception for {var}: ".format(var=var_name)

    if table_input.count('.') + table_input.count(':') > 3:
        raise Exception(('{var}Use either : or . to specify project '
                         'got {input}').format(var=var_print(var_name), input=table_input))

    cmpt = table_input.rsplit(':', 1)
    project_id = None
    rest = table_input
    if len(cmpt) == 1:
        project_id = None
        rest = cmpt[0]
    elif len(cmpt) == 2 and cmpt[0].count(':') <= 1:
        if cmpt[-1].count('.') != 2:
            project_id = cmpt[0]
            rest = cmpt[1]
    else:
        raise Exception(('{var}Expect format of (<project:)<dataset>.<table>, '
                         'got {input}').format(var=var_print(var_name), input=table_input))

    cmpt = rest.split('.')
    if len(cmpt) == 3:
        assert project_id is None, (
            "{var}Use either : or . to specify project").format(var=var_print(var_name))
        project_id = cmpt[0]
        dataset_id = cmpt[1]
        table_id = cmpt[2]
    elif len(cmpt) == 2:
        dataset_id = cmpt[0]
        table_id = cmpt[1]
    else:
        raise Exception(
            ('{var}Expect format of (<project.|<project:)<dataset>.<table>, '
             'got {input}').format(var=var_print(var_name), input=table_input))

    if project_id is None:
        if var_name is not None:
            log = LoggingMixin().logger
            log.info(
                'Project not included in {var}: {input}; using project "{project}"'.format(
                    var=var_name, input=table_input, project=default_project_id))
        project_id = default_project_id

    return project_id, dataset_id, table_id
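# A quick usage sketch of the parsing rules above (table names are made up):
#
#   _split_tablename('alpha-project:alpha_dataset.alpha_table', 'fallback-project')
#   # -> ('alpha-project', 'alpha_dataset', 'alpha_table')
#
#   _split_tablename('alpha_dataset.alpha_table', 'fallback-project')
#   # -> ('fallback-project', 'alpha_dataset', 'alpha_table')
#
# Both ':' and '.' may separate the project from the dataset; inputs with too
# many separators raise an exception.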
def _parse_s3_config(config_file_name, config_format='boto', profile=None):
    """
    Parses a config file for s3 credentials. Can currently parse boto,
    s3cmd.conf and AWS SDK config formats.

    :param config_file_name: path to the config file
    :type config_file_name: str
    :param config_format: config type. One of "boto", "s3cmd" or "aws".
        Defaults to "boto"
    :type config_format: str
    :param profile: profile name in AWS type config file
    :type profile: str
    """
    Config = configparser.ConfigParser()
    if Config.read(config_file_name):  # pragma: no cover
        sections = Config.sections()
    else:
        raise AirflowException("Couldn't read {0}".format(config_file_name))
    # Setting option names depending on file format
    if config_format is None:
        config_format = 'boto'
    conf_format = config_format.lower()
    if conf_format == 'boto':  # pragma: no cover
        if profile is not None and 'profile ' + profile in sections:
            cred_section = 'profile ' + profile
        else:
            cred_section = 'Credentials'
    elif conf_format == 'aws' and profile is not None:
        cred_section = profile
    else:
        cred_section = 'default'
    # Option names
    if conf_format in ('boto', 'aws'):  # pragma: no cover
        key_id_option = 'aws_access_key_id'
        secret_key_option = 'aws_secret_access_key'
        # security_token_option = 'aws_security_token'
    else:
        key_id_option = 'access_key'
        secret_key_option = 'secret_key'
    # Actual Parsing
    if cred_section not in sections:
        raise AirflowException("This config file format is not recognized")
    else:
        try:
            access_key = Config.get(cred_section, key_id_option)
            secret_key = Config.get(cred_section, secret_key_option)
            calling_format = None
            if Config.has_option(cred_section, 'calling_format'):
                calling_format = Config.get(cred_section, 'calling_format')
        except Exception:
            log = LoggingMixin().logger
            log.warning("Option Error in parsing s3 config file")
            raise
    return (access_key, secret_key, calling_format)
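# For reference, a "boto"-format file that _parse_s3_config can read might
# look like this (placeholder values):
#
#   [Credentials]
#   aws_access_key_id = <your access key id>
#   aws_secret_access_key = <your secret key>
#   calling_format = boto.s3.connection.OrdinaryCallingFormat
#
# _parse_s3_config('/path/to/s3.cfg') would then return the
# (access_key, secret_key, calling_format) triple read from that section.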
def _str(s):
    # cloudant-python doesn't support unicode.
    if isinstance(s, unicode):
        log = LoggingMixin().logger
        log.debug(
            'cloudant-python does not support unicode. Encoding %s as ascii using "ignore".',
            s
        )
        return s.encode('ascii', 'ignore')

    return s
def list_py_file_paths(directory, safe_mode=True):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns = []
        for root, dirs, files in os.walk(directory, followlinks=True):
            ignore_file = [f for f in files if f == '.airflowignore']
            if ignore_file:
                f = open(os.path.join(root, ignore_file[0]), 'r')
                patterns += [p for p in f.read().split('\n') if p]
                f.close()
            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().logger
                    log.exception("Error while examining %s", f)
    return file_paths
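# Sketch of the .airflowignore handling above: each non-empty line of the
# file is treated as a regex and matched against the full file path with
# re.findall. For example (paths and patterns are hypothetical):
#
#   patterns = ['project_a', r'tenant_\d+']
#   any([re.findall(p, '/dags/project_a/dag_1.py') for p in patterns])  # True -> skipped
#   any([re.findall(p, '/dags/project_b/dag_1.py') for p in patterns])  # False -> kept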
def GetDefaultExecutor():
    """Creates a new instance of the configured executor if none exists and returns it."""
    global DEFAULT_EXECUTOR

    if DEFAULT_EXECUTOR is not None:
        return DEFAULT_EXECUTOR

    executor_name = configuration.get('core', 'EXECUTOR')

    DEFAULT_EXECUTOR = _get_executor(executor_name)

    log = LoggingMixin().logger
    log.info("Using executor %s", executor_name)

    return DEFAULT_EXECUTOR
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().logger

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(
            SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
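# The [smtp] configuration block read above might look like this in
# airflow.cfg (placeholder values):
#
#   [smtp]
#   smtp_host = localhost
#   smtp_port = 587
#   smtp_starttls = True
#   smtp_ssl = False
#   smtp_user = airflow
#   smtp_password = airflow
#
# When smtp_user/smtp_password are absent, the function falls back to an
# unauthenticated connection.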
def _to_timestamp(cls, col):
    """
    Convert a column of a dataframe to UNIX timestamps if applicable

    :param col: A Series object representing a column of a dataframe.
    """
    # Try and convert the column to datetimes. The column MUST have a four
    # digit year somewhere in the string. There should be a better way to do
    # this, but just letting pandas try and convert every column without a
    # format caused it to convert floats as well; for example, a column of
    # integers between 0 and 10 was turned into timestamps. If the column
    # cannot be converted, just return the original column untouched.
    try:
        col = pd.to_datetime(col)
    except ValueError:
        log = LoggingMixin().logger
        log.warning("Could not convert field to timestamps: %s", col.name)
        return col

    # Now convert the newly created datetimes into timestamps. We have to be
    # careful here because NaT cannot be converted to a timestamp, so we have
    # to return NaN for those entries.
    converted = []
    for i in col:
        try:
            converted.append(i.timestamp())
        except ValueError:
            converted.append(pd.np.NaN)
        except AttributeError:
            converted.append(pd.np.NaN)

    # Return a new series that maintains the same index as the original.
    return pd.Series(converted, index=col.index)
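# Behavior sketch for _to_timestamp (values illustrative): a column of strings
# such as ['2017-01-01', '2017-01-02'] is parsed by pd.to_datetime and comes
# back as epoch floats (e.g. 1483228800.0 for 2017-01-01 UTC), NaT entries
# come back as NaN, and a column that fails to parse is returned unchanged.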
def execute_command(command):
    log = LoggingMixin().logger
    log.info("Executing command in Celery: %s", command)
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        log.error(e)
        raise AirflowException('Celery command failed')
def filter_for_filesize(result, size=None):
    """
    Filter the result for files whose size is at least ``size`` MB.

    :param result: a list of dicts returned by Snakebite ls
    :param size: the file size in MB a file should be at least to pass the filter
    :return: (list) of dicts which passed the size criterion
    """
    if size:
        log = LoggingMixin().logger
        log.debug('Filtering for file size >= %s in files: %s',
                  size, map(lambda x: x['path'], result))
        size *= settings.MEGABYTE
        result = [x for x in result if x['length'] >= size]
        log.debug('HdfsSensor.poke: after size filter result is %s', result)
    return result
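# Usage sketch (entries mimic Snakebite ls output; numbers are made up):
#
#   result = [{'path': '/data/a.txt', 'length': 5 * 1024 * 1024},
#             {'path': '/data/b.txt', 'length': 1024}]
#   filter_for_filesize(result, size=1)
#   # -> keeps only '/data/a.txt', since size is scaled to bytes via
#   #    settings.MEGABYTE before the comparison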
def handle_failure_retry(context):
    ti = context['ti']
    cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

    if cmd_id is not None:
        cmd = Command.find(cmd_id)
        if cmd is not None:
            log = LoggingMixin().logger
            if cmd.status == 'done':
                log.info('Command ID: %s has succeeded, hence marking this '
                         'TI as Success.', cmd_id)
                ti.state = State.SUCCESS
            elif cmd.status == 'running':
                log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                cmd.cancel()
class CeleryConfig(object):
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery', 'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE

    celery_ssl_active = False
    try:
        celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE')
    except AirflowConfigException:
        log = LoggingMixin().logger
        log.warning("Celery Executor will run without SSL")

    try:
        if celery_ssl_active:
            BROKER_USE_SSL = {
                'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                'cert_reqs': ssl.CERT_REQUIRED,
            }
    except AirflowConfigException:
        raise AirflowException(
            'AirflowConfigException: CELERY_SSL_ACTIVE is True, please ensure CELERY_SSL_KEY, '
            'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
    except Exception:
        raise AirflowException(
            'Exception: There was an unknown Celery SSL Error. Please ensure you want to use '
            'SSL and/or have all necessary certs and key.')
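# The [celery] options referenced above might look like this in airflow.cfg
# when SSL is enabled (paths are placeholders):
#
#   [celery]
#   celery_ssl_active = True
#   celery_ssl_key = /path/to/worker.key
#   celery_ssl_cert = /path/to/worker.pem
#   celery_ssl_cacert = /path/to/ca.pem
#
# If celery_ssl_active is unset, the executor simply runs without SSL.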
def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func):
    log = LoggingMixin().logger

    for i in range(0, max_n):
        try:
            response = request.execute()
            if is_error_func(response):
                raise ValueError(
                    'The response contained an error: {}'.format(response))
            elif is_done_func(response):
                log.info('Operation is done: %s', response)
                return response
            else:
                time.sleep((2 ** i) + (random.randint(0, 1000) / 1000))
        except errors.HttpError as e:
            if e.resp.status != 429:
                log.info('Something went wrong. Not retrying: %s', e)
                raise
            else:
                time.sleep((2 ** i) + (random.randint(0, 1000) / 1000))
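# Worked example of the backoff above: attempt i sleeps 2**i seconds plus up
# to one second of random jitter, so successive waits are roughly 1s, 2s, 4s,
# 8s, ... Only HTTP 429 ("too many requests") errors are retried; any other
# HttpError is re-raised immediately.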
def filter_for_ignored_ext(result, ignored_ext, ignore_copying):
    """
    Will filter, if instructed to do so, the result to remove matching criteria

    :param result: (list) of dicts returned by Snakebite ls
    :param ignored_ext: (list) of ignored extensions
    :param ignore_copying: (bool) whether to filter out files still being copied
    :return: (list) of dicts which were not removed
    """
    if ignore_copying:
        log = LoggingMixin().logger
        regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext)
        ignored_extensions_regex = re.compile(regex_builder)
        log.debug(
            'Filtering result for ignored extensions: %s in files %s',
            ignored_extensions_regex.pattern,
            map(lambda x: x['path'], result)
        )
        result = [x for x in result if not ignored_extensions_regex.match(x['path'])]
        log.debug('HdfsSensor.poke: after ext filter result is %s', result)
    return result
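# Regex sketch: with ignored_ext=['_COPYING_'] the builder above produces
# r"^.*\.(_COPYING_$)$", which matches an in-flight HDFS copy such as
# '/data/file.txt._COPYING_' but not the finished '/data/file.txt'.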
def load_login():
    log = LoggingMixin().logger

    auth_backend = 'airflow.default_login'
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except conf.AirflowConfigException:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated* behavior of importing airflow_login")
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from functools import wraps

import os

from sqlalchemy import event, exc
from sqlalchemy.pool import Pool

from airflow import settings
from airflow.utils.log.LoggingMixin import LoggingMixin

log = LoggingMixin().logger


def provide_session(func):
    """
    Function decorator that provides a session if it isn't provided.
    If you want to reuse a session or run the function as part of a
    database transaction, you pass it to the function, if not this wrapper
    will create one and close it for you.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        needs_session = False
        arg_session = 'session'

        func_params = func.__code__.co_varnames
        session_in_args = arg_session in func_params and \
            func_params.index(arg_session) < len(args)
        session_in_kwargs = arg_session in kwargs
        # If the caller did not pass a session positionally or by keyword,
        # create one, hand it to the callee, and commit/close it afterwards.
        if not (session_in_args or session_in_kwargs):
            needs_session = True
            session = settings.Session()
            kwargs[arg_session] = session
        result = func(*args, **kwargs)
        if needs_session:
            session.commit()
            session.close()
        return result
    return wrapper
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from builtins import object
import imp
import inspect
import os
import re
import sys

from airflow import configuration
from airflow.utils.log.LoggingMixin import LoggingMixin

log = LoggingMixin().logger


class AirflowPluginException(Exception):
    pass


class AirflowPlugin(object):
    name = None
    operators = []
    hooks = []
    executors = []
    macros = []
    admin_views = []
    flask_blueprints = []
    menu_links = []
from __future__ import print_function
from __future__ import unicode_literals

import logging
import logging.config
import os
import sys

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.pool import NullPool

from airflow import configuration as conf
from airflow.utils.log.LoggingMixin import LoggingMixin

log = LoggingMixin().logger


class DummyStatsLogger(object):
    @classmethod
    def incr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def decr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass
@classmethod
def get_connection(cls, conn_id):
    conn = random.choice(cls.get_connections(conn_id))
    if conn.host:
        log = LoggingMixin().logger
        log.info("Using connection to: %s", conn.host)
    return conn
# See the License for the specific language governing permissions and
# limitations under the License.

from airflow.hooks.base_hook import BaseHook
from airflow import configuration
from hdfs import InsecureClient, HdfsError

from airflow.utils.log.LoggingMixin import LoggingMixin

_kerberos_security_mode = configuration.get("core", "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().logger
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise
from airflow.exceptions import AirflowException


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
        self.proxy_user = proxy_user
import shlex
import sys
import warnings

import six
from future import standard_library

from airflow.utils.log.LoggingMixin import LoggingMixin

standard_library.install_aliases()

from builtins import str
from collections import OrderedDict
from six.moves import configparser

from airflow.exceptions import AirflowConfigException

log = LoggingMixin().logger

# show Airflow's deprecation warnings
warnings.filterwarnings(
    action='default', category=DeprecationWarning, module='airflow')
warnings.filterwarnings(
    action='default', category=PendingDeprecationWarning, module='airflow')

if six.PY3:
    ConfigParser = configparser.ConfigParser
else:
    ConfigParser = configparser.SafeConfigParser