def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.conf.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.conf.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.conf.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.conf.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.conf.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.conf.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
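# A minimal usage sketch with hypothetical addresses: build a MIME message and
# pass it through with dryrun=True, which exercises the config lookups without
# opening an SMTP connection. Assumes the snippet's module-level imports
# (configuration, smtplib, AirflowConfigException) are in place.
from email.mime.text import MIMEText

msg = MIMEText('Task foo failed', 'plain')
msg['Subject'] = 'Airflow alert'
msg['From'] = 'airflow@example.com'
msg['To'] = 'oncall@example.com'

send_MIME_email('airflow@example.com', ['oncall@example.com'], msg, dryrun=True)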
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except conf.AirflowConfigException:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated* behavior of importing airflow_login")
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
def _split_tablename(table_input, default_project_id, var_name=None):
    assert default_project_id is not None, "INTERNAL: No default project is specified"

    def var_print(var_name):
        if var_name is None:
            return ""
        else:
            return "Format exception for {var}: ".format(var=var_name)

    if table_input.count('.') + table_input.count(':') > 3:
        raise Exception((
            '{var}Use either : or . to specify project '
            'got {input}'
        ).format(var=var_print(var_name), input=table_input))

    cmpt = table_input.rsplit(':', 1)
    project_id = None
    rest = table_input
    if len(cmpt) == 1:
        project_id = None
        rest = cmpt[0]
    elif len(cmpt) == 2 and cmpt[0].count(':') <= 1:
        if cmpt[-1].count('.') != 2:
            project_id = cmpt[0]
            rest = cmpt[1]
    else:
        raise Exception((
            '{var}Expect format of (<project:)<dataset>.<table>, '
            'got {input}'
        ).format(var=var_print(var_name), input=table_input))

    cmpt = rest.split('.')
    if len(cmpt) == 3:
        assert project_id is None, (
            "{var}Use either : or . to specify project"
        ).format(var=var_print(var_name))
        project_id = cmpt[0]
        dataset_id = cmpt[1]
        table_id = cmpt[2]
    elif len(cmpt) == 2:
        dataset_id = cmpt[0]
        table_id = cmpt[1]
    else:
        raise Exception((
            '{var}Expect format of (<project.|<project:)<dataset>.<table>, '
            'got {input}'
        ).format(var=var_print(var_name), input=table_input))

    if project_id is None:
        if var_name is not None:
            log = LoggingMixin().log
            log.info(
                'Project not included in {var}: {input}; using project "{project}"'.format(
                    var=var_name, input=table_input, project=default_project_id
                )
            )
        project_id = default_project_id

    return project_id, dataset_id, table_id
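# Hedged examples of the accepted formats (hypothetical project, dataset and
# table names):
#
#   _split_tablename('dataset.table', 'my-project')
#       -> ('my-project', 'dataset', 'table')
#   _split_tablename('other-project:dataset.table', 'my-project')
#       -> ('other-project', 'dataset', 'table')
#   _split_tablename('other-project.dataset.table', 'my-project')
#       -> ('other-project', 'dataset', 'table')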
def execute_command(command):
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command)
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        log.error(e)
        raise AirflowException('Celery command failed')
def _post_sendgrid_mail(mail_data):
    log = LoggingMixin().log
    sg = sendgrid.SendGridAPIClient(apikey=os.environ.get('SENDGRID_API_KEY'))
    response = sg.client.mail.send.post(request_body=mail_data)
    # 2xx status code.
    if 200 <= response.status_code < 300:
        log.info('Email with subject %s is successfully sent to recipients: %s',
                 mail_data['subject'], mail_data['personalizations'])
    else:
        log.warning('Failed to send out email with subject %s, status code: %s',
                    mail_data['subject'], response.status_code)
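# A hedged sketch of the request body this helper expects (the SendGrid v3
# Mail Send payload shape), with hypothetical addresses:
mail_data = {
    'personalizations': [
        {'to': [{'email': 'oncall@example.com'}]},
    ],
    'from': {'email': 'airflow@example.com'},
    'subject': 'Airflow alert',
    'content': [{'type': 'text/plain', 'value': 'Task foo failed'}],
}
_post_sendgrid_mail(mail_data)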
def _str(s):
    # cloudant-python doesn't support unicode.
    if isinstance(s, unicode):
        log = LoggingMixin().log
        log.debug(
            'cloudant-python does not support unicode. Encoding %s as ascii using "ignore".',
            s
        )
        return s.encode('ascii', 'ignore')

    return s
def _parse_s3_config(config_file_name, config_format='boto', profile=None):
    """
    Parses a config file for s3 credentials. Can currently parse boto,
    s3cmd.conf and AWS SDK config formats.

    :param config_file_name: path to the config file
    :type config_file_name: str
    :param config_format: config type. One of "boto", "s3cmd" or "aws".
        Defaults to "boto"
    :type config_format: str
    :param profile: profile name in AWS type config file
    :type profile: str
    """
    Config = configparser.ConfigParser()
    if Config.read(config_file_name):  # pragma: no cover
        sections = Config.sections()
    else:
        raise AirflowException("Couldn't read {0}".format(config_file_name))
    # Setting option names depending on file format
    if config_format is None:
        config_format = 'boto'
    conf_format = config_format.lower()
    if conf_format == 'boto':  # pragma: no cover
        if profile is not None and 'profile ' + profile in sections:
            cred_section = 'profile ' + profile
        else:
            cred_section = 'Credentials'
    elif conf_format == 'aws' and profile is not None:
        cred_section = profile
    else:
        cred_section = 'default'
    # Option names
    if conf_format in ('boto', 'aws'):  # pragma: no cover
        key_id_option = 'aws_access_key_id'
        secret_key_option = 'aws_secret_access_key'
        # security_token_option = 'aws_security_token'
    else:
        key_id_option = 'access_key'
        secret_key_option = 'secret_key'
    # Actual Parsing
    if cred_section not in sections:
        raise AirflowException("This config file format is not recognized")
    else:
        try:
            access_key = Config.get(cred_section, key_id_option)
            secret_key = Config.get(cred_section, secret_key_option)
            calling_format = None
            if Config.has_option(cred_section, 'calling_format'):
                calling_format = Config.get(cred_section, 'calling_format')
        except Exception:
            log = LoggingMixin().log
            log.warning("Option Error in parsing s3 config file")
            raise
    return (access_key, secret_key, calling_format)
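# A minimal boto-style credentials file this parser accepts (hypothetical path
# and keys); the default 'Credentials' section is read unless a matching
# 'profile <name>' section exists:
#
#   [Credentials]
#   aws_access_key_id = AKIAEXAMPLE
#   aws_secret_access_key = secretEXAMPLE
#
# access_key, secret_key, calling_format = _parse_s3_config('/path/to/boto.cfg')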
def handle_failure_retry(context):
    ti = context['ti']
    cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

    if cmd_id is not None:
        cmd = Command.find(cmd_id)
        if cmd is not None:
            if cmd.status == 'running':
                log = LoggingMixin().log
                log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                cmd.cancel()
def get_query_results(self):
    log = LoggingMixin().log
    if self.cmd is not None:
        cmd_id = self.cmd.id
        log.info("command id: %s", cmd_id)
        query_result_buffer = StringIO()
        self.cmd.get_results(fp=query_result_buffer, inline=True, delim=COL_DELIM)
        query_result = query_result_buffer.getvalue()
        query_result_buffer.close()
        return query_result
    else:
        log.info("Qubole command not found")
def list_py_file_paths(directory, safe_mode=True):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns = []
        for root, dirs, files in os.walk(directory, followlinks=True):
            ignore_file = [f for f in files if f == '.airflowignore']
            if ignore_file:
                f = open(os.path.join(root, ignore_file[0]), 'r')
                patterns += [p for p in f.read().split('\n') if p]
                f.close()
            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        # use a separate name so the loop variable ``f`` is not
                        # shadowed (it is logged in the except clause below)
                        with open(file_path, 'rb') as fp:
                            content = fp.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
def set(cls,
        key,
        value,
        execution_date,
        task_id,
        dag_id,
        session=None):
    """
    Store an XCom value.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: None
    """
    session.expunge_all()

    enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
    if enable_pickling:
        value = pickle.dumps(value)
    else:
        try:
            value = json.dumps(value).encode('UTF-8')
        except ValueError:
            log = LoggingMixin().log
            log.error("Could not serialize the XCOM value into JSON. "
                      "If you are using pickles instead of JSON "
                      "for XCOM, then you need to enable pickle "
                      "support for XCOM in your airflow config.")
            raise

    # remove any duplicate XComs
    session.query(cls).filter(
        cls.key == key,
        cls.execution_date == execution_date,
        cls.task_id == task_id,
        cls.dag_id == dag_id).delete()

    session.commit()

    # insert new XCom
    session.add(XCom(
        key=key,
        value=value,
        execution_date=execution_date,
        task_id=task_id,
        dag_id=dag_id))

    session.commit()
def filter_for_filesize(result, size=None):
    """
    Will test the filepath result and test if its size is at least self.filesize

    :param result: a list of dicts returned by Snakebite ls
    :param size: the file size in MB a file should be at least to trigger True
    :return: (list) of dicts whose file size meets the criteria
    """
    if size:
        log = LoggingMixin().log
        log.debug(
            'Filtering for file size >= %s in files: %s',
            size, map(lambda x: x['path'], result)
        )
        size *= settings.MEGABYTE
        result = [x for x in result if x['length'] >= size]
        log.debug('HdfsSensor.poke: after size filter result is %s', result)
    return result
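# A hedged usage sketch with a fabricated Snakebite-style listing ('length' is
# in bytes; size is given in MB, and Airflow's settings.MEGABYTE is
# 1024 * 1024), so a 1 MB filter keeps only the second entry:
listing = [
    {'path': '/data/small.csv', 'length': 512},
    {'path': '/data/big.csv', 'length': 5 * 1024 * 1024},
]
filter_for_filesize(listing, size=1)  # -> [{'path': '/data/big.csv', ...}]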
def GetDefaultExecutor():
    """Creates a new instance of the configured executor if none exists and returns it."""
    global DEFAULT_EXECUTOR

    if DEFAULT_EXECUTOR is not None:
        return DEFAULT_EXECUTOR

    executor_name = configuration.get('core', 'EXECUTOR')
    DEFAULT_EXECUTOR = _get_executor(executor_name)

    log = LoggingMixin().log
    log.info("Using executor %s", executor_name)

    return DEFAULT_EXECUTOR
def handle_failure_retry(context):
    ti = context['ti']
    cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

    if cmd_id is not None:
        cmd = Command.find(cmd_id)
        if cmd is not None:
            log = LoggingMixin().log
            if cmd.status == 'done':
                log.info('Command ID: %s has succeeded, hence marking this '
                         'TI as Success.', cmd_id)
                ti.state = State.SUCCESS
            elif cmd.status == 'running':
                log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                cmd.cancel()
def get_val(self):
    log = LoggingMixin().log
    if self._val and self.is_encrypted:
        try:
            fernet = get_fernet()
            return fernet.decrypt(bytes(self._val, 'utf-8')).decode()
        except InvalidFernetToken:
            log.error("Can't decrypt _val for key={}, invalid token "
                      "or value".format(self.key))
            return None
        except Exception:
            log.error("Can't decrypt _val for key={}, FERNET_KEY "
                      "configuration missing".format(self.key))
            return None
    else:
        return self._val
def get_one(cls,
            execution_date,
            key=None,
            task_id=None,
            dag_id=None,
            include_prior_dates=False,
            session=None):
    """
    Retrieve an XCom value, optionally meeting certain criteria.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: XCom value
    """
    filters = []
    if key:
        filters.append(cls.key == key)
    if task_id:
        filters.append(cls.task_id == task_id)
    if dag_id:
        filters.append(cls.dag_id == dag_id)
    if include_prior_dates:
        filters.append(cls.execution_date <= execution_date)
    else:
        filters.append(cls.execution_date == execution_date)

    query = (
        session.query(cls.value).filter(and_(*filters))
        .order_by(cls.execution_date.desc(), cls.timestamp.desc()))

    result = query.first()
    if result:
        enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
        if enable_pickling:
            return pickle.loads(result.value)
        else:
            try:
                return json.loads(result.value.decode('UTF-8'))
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not deserialize the XCOM value from JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise
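# A hedged, ORM-free sketch of the serialization contract shared by set() and
# get_one() above: with pickling disabled, a value must survive JSON encoding
# to UTF-8 bytes and back; otherwise it must be pickleable.
import json
import pickle

def xcom_roundtrip(value, enable_pickling=False):
    if enable_pickling:
        return pickle.loads(pickle.dumps(value))
    stored = json.dumps(value).encode('UTF-8')  # what set() writes
    return json.loads(stored.decode('UTF-8'))   # what get_one() reads

assert xcom_roundtrip({'rows': 42}) == {'rows': 42}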
def filter_for_ignored_ext(result, ignored_ext, ignore_copying):
    """
    Will filter the result, if instructed to do so, to remove entries matching
    the ignored extensions.

    :param result: (list) of dicts returned by Snakebite ls
    :param ignored_ext: (list) of ignored extensions
    :param ignore_copying: (bool) shall we ignore ?
    :return: (list) of dicts which were not removed
    """
    if ignore_copying:
        log = LoggingMixin().log
        regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext)
        ignored_extensions_regex = re.compile(regex_builder)
        log.debug(
            'Filtering result for ignored extensions: %s in files %s',
            ignored_extensions_regex.pattern, map(lambda x: x['path'], result)
        )
        result = [x for x in result if not ignored_extensions_regex.match(x['path'])]
        log.debug('HdfsSensor.poke: after ext filter result is %s', result)
    return result
def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func):
    log = LoggingMixin().log

    for i in range(0, max_n):
        try:
            response = request.execute()
            if is_error_func(response):
                raise ValueError(
                    'The response contained an error: {}'.format(response)
                )
            elif is_done_func(response):
                log.info('Operation is done: %s', response)
                return response
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
        except HttpError as e:
            if e.resp.status != 429:
                log.info('Something went wrong. Not retrying: %s', format(e))
                raise
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
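# A hedged usage sketch: _FakeRequest is a hypothetical stand-in for the
# googleapiclient request object, mimicking the .execute() interface the helper
# polls. max_n bounds the attempts; each retry sleeps 2**i seconds plus
# sub-second jitter.
class _FakeRequest(object):
    def __init__(self):
        self.calls = 0

    def execute(self):
        self.calls += 1
        return {'done': self.calls >= 3, 'error': None}

op = _poll_with_exponential_delay(
    _FakeRequest(),
    max_n=9,
    is_done_func=lambda resp: resp['done'],
    is_error_func=lambda resp: resp['error'] is not None,
)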
def _to_timestamp(cls, col):
    """
    Convert a column of a dataframe to UNIX timestamps if applicable

    :param col: A Series object representing a column of a dataframe.
    """
    # Try to convert the column to datetimes. The column MUST have a four-digit
    # year somewhere in the string. There should be a better way to do this,
    # but just letting pandas try to convert every column without a format
    # caused it to convert floats as well; for example, a column of integers
    # between 0 and 10 was turned into timestamps. If the column cannot be
    # converted, just return the original column untouched.
    try:
        col = pd.to_datetime(col)
    except ValueError:
        log = LoggingMixin().log
        log.warning(
            "Could not convert field to timestamps: %s",
            col.name
        )
        return col

    # Now convert the newly created datetimes into timestamps. We have to be
    # careful here because NaT cannot be converted to a timestamp, so we have
    # to return NaN.
    converted = []
    for i in col:
        try:
            converted.append(i.timestamp())
        except ValueError:
            converted.append(pd.np.NaN)
        except AttributeError:
            converted.append(pd.np.NaN)

    # return a new series that maintains the same index as the original
    return pd.Series(converted, index=col.index)
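# A hedged demonstration of the conversion above (exact behavior depends on
# the pandas version; pd.np is a deprecated NumPy alias that newer pandas
# removed). A clean datetime column becomes float epoch seconds, with tz-naive
# values interpreted as UTC:
import pandas as pd

ts = pd.to_datetime(pd.Series(['2019-01-30 12:00:00'])).iloc[0]
ts.timestamp()  # -> 1548849600.0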
def execute_command(command_to_exec):
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command_to_exec)
    env = os.environ.copy()
    try:
        subprocess.check_call(command_to_exec, stderr=subprocess.STDOUT,
                              close_fds=True, env=env)
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)
        raise AirflowException('Celery command failed')
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This is an example dag for using the KubernetesPodOperator.
"""
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install 'apache-airflow[kubernetes]'
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'Airflow', 'start_date': days_ago(2)}

    dag = DAG(dag_id='airflow-k8s-test-dag',
              catchup=False,
              default_args=args,
              schedule_interval='* * * * *')

    k = KubernetesPodOperator(
        namespace='default',
import flask_login  # required for the LoginManager below

from airflow import models
from airflow.configuration import conf
from airflow.configuration import AirflowConfigException
from airflow.utils.db import provide_session

import traceback
import re

from airflow.utils.log.logging_mixin import LoggingMixin

LOGIN_MANAGER = flask_login.LoginManager()
LOGIN_MANAGER.login_view = 'airflow.login'  # Calls login() below
LOGIN_MANAGER.login_message = None

log = LoggingMixin().log


class AuthenticationError(Exception):
    pass


class LdapException(Exception):
    pass


def get_ldap_connection(dn=None, password=None):
    ldap_uri = conf.get("ldap", "uri")
    isSslEnable = False
def create_airflow_rest_connection():
    from airflow.contrib.auth.backends.password_auth import PasswordUser
    import base64
    import os

    session = settings.Session()
    exists = session.query(models.User).filter(
        models.User.username == 'application').scalar()

    if exists is None:
        LoggingMixin().log.info("creating 'application' user for mini-BRS...")

        # create 'application' user
        random_key = str(base64.urlsafe_b64encode(os.urandom(32)))
        user = PasswordUser(models.User())
        user.username = '******'
        user.email = '*****@*****.**'
        user.password = random_key
        session.add(user)
        session.commit()
        session.close()

        # create 'application' airflow connection
        rest = Connection(
            conn_id='rest',
            login='******',
            password=random_key
        )

        session = settings.Session()
        session.add(rest)
        session.commit()
        session.close()

        # create 'admin' user
        # admin_password = str(base64.urlsafe_b64encode(os.urandom(32)))
        config_parser = configuration.AirflowConfigParser()
        config_parser.read(
            configuration.get_airflow_config(
                configuration.get_airflow_home()
            )
        )

        u = config_parser.get(
            section='core',
            key='username'
        )
        p = config_parser.get(
            section='core',
            key='password'
        )

        user = PasswordUser(models.User())
        user.username = u
        user.email = '*****@*****.**'
        user.password = p
        user.superuser = True

        session = settings.Session()
        session.add(user)
        session.commit()
        session.close()

        config_parser.remove_option(
            section='core',
            option='username'
        )
        config_parser.remove_option(
            section='core',
            option='password'
        )

        # use a context manager so the config file is always closed
        with open(configuration.get_airflow_config(
                configuration.get_airflow_home()), 'w') as config_file:
            config_parser.write(config_file)
def _upload(self, context):
    # dropbox Connection details
    try:
        credentials_dropbox = BaseHook.get_connection(self.storage_conn_id)
        self.dropbox_access_token = credentials_dropbox.password
    except AirflowException as e:
        raise DropboxConnectionNotFoundException

    if self.is_storage_available(self.dropbox_access_token):
        try:
            LoggingMixin().log.info("Dropbox Storage available")

            l_file_path = self.file_name.replace('.csv', '.json')
            file_name = l_file_path[l_file_path.rfind('/') + 1:]

            dt_current = datetime.strptime(self.execution_date[:19], "%Y-%m-%dT%H:%M:%S")
            exec_hour = str(dt_current.hour)
            exec_minute = str(dt_current.minute)
            exec_second = str(dt_current.second)

            # midnight runs belong to the previous day's folder
            if exec_hour == '0' and exec_minute == '0' and exec_second == '0':
                dt_current = dt_current - timedelta(days=1)
                r_file_path = '{}/{}/{}/{}/{}'.format(
                    '/mbrs', 'Servicenow', self.table,
                    '{}-{}-{}'.format(dt_current.year, dt_current.month, dt_current.day),
                    file_name)
            else:
                r_file_path = '{}/{}/{}/{}/{}'.format(
                    '/mbrs', 'Servicenow', self.table,
                    '{}-{}-{}'.format(dt_current.year, dt_current.month, dt_current.day),
                    file_name)

            LoggingMixin().log.info("Running dropbox upload process...")

            try:
                file_size = os.path.getsize(l_file_path)
                CHUNK_SIZE = 4 * 1024 * 1024
                dbx = dropbox.Dropbox(self.dropbox_access_token, timeout=600)

                if file_size <= CHUNK_SIZE:
                    with open(l_file_path, 'rb') as f:
                        dbx.files_upload(
                            f.read(), r_file_path,
                            mode=dropbox.files.WriteMode.overwrite)
                    return True
                else:
                    with open(l_file_path, 'rb') as f:
                        upload_session_start_result = dbx.files_upload_session_start(
                            f.read(CHUNK_SIZE))
                        cursor = dropbox.files.UploadSessionCursor(
                            session_id=upload_session_start_result.session_id,
                            offset=f.tell())
                        commit = dropbox.files.CommitInfo(path=r_file_path)

                        while f.tell() < file_size:
                            if (file_size - f.tell()) <= CHUNK_SIZE:
                                print(
                                    dbx.files_upload_session_finish(
                                        f.read(CHUNK_SIZE), cursor, commit))
                            else:
                                dbx.files_upload_session_append_v2(
                                    f.read(CHUNK_SIZE), cursor)
                                cursor.offset = f.tell()
                    return True
            except Exception as e:
                LoggingMixin().log.error(
                    "ServiceNow2DropBoxTransOperator : exception in dropbox upload for token : {} {}"
                    .format(self.dropbox_access_token, e))
                return False
        except Exception as e:
            print(e)
    else:
        LoggingMixin().log.info("Dropbox Storage not available")
        return False
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from builtins import object
import imp
import inspect
import os
import re
import sys

import pkg_resources

from airflow import configuration
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

import_errors = {}


class AirflowPluginException(Exception):
    pass


class AirflowPlugin(object):
    name = None
    operators = []
    sensors = []
    hooks = []
    executors = []
    macros = []
def list_py_file_paths(directory, safe_mode=True, include_examples=None):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    if include_examples is None:
        include_examples = conf.getboolean('core', 'LOAD_EXAMPLES')
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [
                        re.compile(p) for p in f.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(p.search(os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as fp:
                            content = fp.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
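# A hedged sketch of the .airflowignore behavior these walkers implement: each
# non-empty line is a regex matched against full paths, and patterns cascade
# into subdirectories. Hypothetical layout:
#
#   dags/.airflowignore    contains the line:  vendor
#   dags/vendor/lib.py     -> skipped (path matches the 'vendor' pattern)
#   dags/etl/load_dag.py   -> kept, provided the file mentions both b'DAG'
#                             and b'airflow' when safe_mode is on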
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Sentry Integration"""
from functools import wraps

from airflow.configuration import conf
from airflow.utils.db import provide_session
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.state import State

log = LoggingMixin().log


class DummySentry:
    """
    Blank class for Sentry.
    """

    @classmethod
    def add_tagging(cls, task_instance):
        """
        Blank function for tagging.
        """

    @classmethod
    def add_breadcrumbs(cls, task_instance, session=None):
        """
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import ssl

from airflow import configuration
from airflow.exceptions import AirflowConfigException, AirflowException
from airflow.utils.log.logging_mixin import LoggingMixin


def _broker_supports_visibility_timeout(url):
    return url.startswith("redis://") or url.startswith("sqs://")


log = LoggingMixin().log

broker_url = configuration.conf.get('celery', 'BROKER_URL')

broker_transport_options = configuration.conf.getsection(
    'celery_broker_transport_options')
if 'visibility_timeout' not in broker_transport_options:
    if _broker_supports_visibility_timeout(broker_url):
        broker_transport_options['visibility_timeout'] = 21600

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],
    'event_serializer': 'json',
    'worker_prefetch_multiplier': 1,
from __future__ import print_function
from __future__ import unicode_literals

import logging
import logging.config
import os
import sys

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.pool import NullPool

from airflow import configuration as conf
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log


class DummyStatsLogger(object):
    @classmethod
    def incr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def decr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass
def get_connection(cls, conn_id):
    conn = random.choice(cls.get_connections(conn_id))
    if conn.host:
        log = LoggingMixin().log
        log.info("Using connection to: %s", conn.host)
    return conn
def list_py_file_paths(directory, safe_mode=True,
                       include_examples=conf.getboolean('core', 'LOAD_EXAMPLES')):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [
                        p for p in f.read().split('\n') if p
                    ]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(
                    re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        # separate name so the loop variable ``f`` stays intact
                        # for the exception log below
                        with open(file_path, 'rb') as fp:
                            content = fp.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(
            list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
# under the License.
from hdfs import InsecureClient, HdfsError

from airflow import configuration
from airflow.exceptions import AirflowException
from airflow.hooks.base_hook import BaseHook
from airflow.utils.log.logging_mixin import LoggingMixin

_kerberos_security_mode = configuration.conf.get("core", "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().log
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
        self.proxy_user = proxy_user
# under the License.
"""Manages all plugins."""
# noinspection PyDeprecation
import imp  # pylint: disable=deprecated-module
import inspect
import os
import re
import sys
from typing import Any, Callable, Dict, List, Optional, Set, Type

import pkg_resources

from airflow import settings
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

import_errors = {}


class AirflowPluginException(Exception):
    """Exception when loading plugin."""


class AirflowPlugin:
    """Class used to define AirflowPlugin."""

    name: Optional[str] = None
    operators: List[Any] = []
    sensors: List[Any] = []
    hooks: List[Any] = []
    executors: List[Any] = []
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG
from datetime import datetime, timedelta

log = LoggingMixin().log

try:
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {
        "owner": "Robin",
        "start_date": datetime(2019, 1, 30),
        "retries": 2,
        "retry_delay": timedelta(minutes=50),
        "email": ["*****@*****.**"],
        "pool": "occupeye_pool"
    }

    dag = DAG(
        dag_id="occupeye_aggregator",
        default_args=args,
        schedule_interval='0 5 * * *',
    )

    surveys_to_s3 = KubernetesPodOperator(
        namespace="airflow",
        image="quay.io/mojanalytics/airflow-occupeye-dashboard-aggregation:latest",
        cmds=["bash", "-c"],
        arguments=["Rscript main.R"],
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {
        'owner': 'airflow',
        'start_date': days_ago(2)
    }

    dag = DAG(
        dag_id='example_kubernetes_operator',
        default_args=args,
        schedule_interval=None)
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl

from airflow import configuration
from airflow.exceptions import AirflowConfigException, AirflowException
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

broker_transport_options = configuration.getsection('celery_broker_transport_options')
if broker_transport_options is None:
    broker_transport_options = {'visibility_timeout': 21600}

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],
    'event_serializer': 'json',
    'worker_prefetch_multiplier': 1,
    'task_acks_late': True,
    'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'),
    'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'),
    'broker_url': configuration.get('celery', 'BROKER_URL'),
    'broker_transport_options': broker_transport_options,
    'result_backend': configuration.get('celery', 'RESULT_BACKEND'),
import shlex
import subprocess
import sys
import warnings
from base64 import b64encode
from collections import OrderedDict
# Ignored Mypy on configparser because it thinks the configparser module has no _UNSET attribute
from configparser import _UNSET, ConfigParser, NoOptionError, NoSectionError  # type: ignore

from cryptography.fernet import Fernet
from zope.deprecation import deprecated

from airflow.exceptions import AirflowConfigException
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

# show Airflow's deprecation warnings
warnings.filterwarnings(
    action='default', category=DeprecationWarning, module='airflow')
warnings.filterwarnings(
    action='default', category=PendingDeprecationWarning, module='airflow')


def expand_env_var(env_var):
    """
    Expands (potentially nested) env vars by repeatedly applying `expandvars`
    and `expanduser` until interpolation stops having any effect.
    """
    if not env_var:
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Default celery configuration."""
import ssl

from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log

# broker_url = conf.get('celery', 'BROKER_URL')
broker_url = 'pyamqp://*****:*****@rabbit01:5672/airflow'
log.info('Using broker_url ' + broker_url)

# result_backend = conf.get('celery', 'RESULT_BACKEND')
result_backend = "db+mysql://root:3point142@maria01:3306/airflow"
log.info('Using result_backend ' + result_backend)

default_queue = "celery.inbound"
log.info('Using default_queue ' + default_queue)

worker_concurrency = "16"
log.info('Using worker_concurrency ' + worker_concurrency)
# See the License for the specific language governing permissions and
# limitations under the License.
from airflow.hooks.base_hook import BaseHook
from airflow import configuration
from hdfs import InsecureClient, HdfsError
from airflow.utils.log.logging_mixin import LoggingMixin

_kerberos_security_mode = configuration.get("core", "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().log
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise

from airflow.exceptions import AirflowException


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
import warnings  # used by the filterwarnings calls below

import six
from future import standard_library
from six import iteritems

from airflow.utils.log.logging_mixin import LoggingMixin

standard_library.install_aliases()

from builtins import str
from collections import OrderedDict

from six.moves import configparser

from airflow.exceptions import AirflowConfigException

log = LoggingMixin().log

# show Airflow's deprecation warnings
warnings.filterwarnings(
    action='default', category=DeprecationWarning, module='airflow')
warnings.filterwarnings(
    action='default', category=PendingDeprecationWarning, module='airflow')

if six.PY3:
    ConfigParser = configparser.ConfigParser
else:
    ConfigParser = configparser.SafeConfigParser


def generate_fernet_key():
    try:
from datetime import datetime
from functools import wraps
from urllib.parse import urlparse

from airflow.settings import Stats
from airflow.utils.log.logging_mixin import LoggingMixin

from requests import PreparedRequest
from requests import Session

from airflow_metrics.utils.fn_utils import get_calling_operator
from airflow_metrics.utils.fn_utils import once
from airflow_metrics.utils.hook_utils import HookManager

LOG = LoggingMixin().log

BLACKLIST = {
    'api.datadoghq.com',
}


def attach_request_meta(ctx, *args, **kwargs):
    if len(args) >= 2 and isinstance(args[1], PreparedRequest):
        request = args[1]
        url = request.url
    else:
        LOG.info('No url found for request')
        return

    ctx['url'] = url

    domain = urlparse(url).netloc
    if domain in BLACKLIST:
def get_connection(cls, conn_id):
    # type: (str) -> Connection
    conn = random.choice(list(cls.get_connections(conn_id)))
    if conn.host:
        log = LoggingMixin().log
        log.info("Using connection to: %s", conn.debug_info())
    return conn
def create_dags():
    global dag_creation_dates
    global new_dags
    global email_notify_required

    new_dags = []
    dag_creation_dates = json.loads(Variable.get(key='dag_creation_dates'))
    email_notify_required = is_email_notification_required()

    try:
        for table in config.get('tables'):
            with open(configuration.get_airflow_home()
                      + '/dags/templates/main.py.jinja2') as file_:
                template = Template(file_.read())

            if dag_creation_dates.get(table) is not None:
                start_date = dag_creation_dates.get(table)
            else:
                start_date = get_start_date(config.get('start_date'))
                dag_creation_dates[table] = str(start_date)

            output = template.render(
                data={
                    'dag_id': table,
                    'frequency': config.get('frequency'),
                    'storage_type': storage_type,
                    'start_date': start_date,
                    'email_required': email_notify_required
                }
            )

            with open(configuration.get_airflow_home()
                      + '/dags/generated/dag_'
                      + '{}'.format(table).replace(' ', '_')
                      + '.py', 'w') as f:
                f.write(output)
                new_dags.append('dag_' + '{}'.format(table).replace(' ', '_') + '.py')

        if len(r_config) != 0:
            for table in r_config:
                for exec_date in r_config.get(table):
                    execution_date = str(exec_date).replace(' ', 'T')[0:19]
                    with open(configuration.get_airflow_home()
                              + '/dags/templates/recovery_template.py.jinja2') as file_:
                        template = Template(file_.read())

                    output = template.render(
                        data={
                            'dag_id': table,
                            'frequency': config.get('frequency'),
                            'storage_type': storage_type,
                            'execution_date': execution_date
                        }
                    )

                    with open(configuration.get_airflow_home()
                              + '/dags/generated/r_dag_'
                              + '{}_{}'.format(table, execution_date).replace(' ', '_')
                              + '.py', 'w') as f:
                        f.write(output)
                        e = '{}'.format(execution_date).replace(' ', 'T')
                        new_dags.append(
                            'r_dag_' + '{}_{}'.format(table, e).replace(' ', '_') + '.py')

        md_dag_ids = settings.Session.query(Dags.dag_id, Dags.fileloc).all()

        for record in md_dag_ids:
            (d_id, loc) = record
            filename = loc[str(loc).rfind('/') + 1:]
            if filename == 'dag_generator.py' or filename == 'dag_cleanup.py':
                continue
            if filename not in new_dags:
                try:
                    if os.path.exists(str(loc)):
                        os.remove(str(loc))
                    else:
                        LoggingMixin().log.warning("{} file doesn't exist!".format(filename))
                    requests.delete(
                        url="http://{}:8080/api/experimental/dags/{}".format(
                            socket.gethostbyname(socket.gethostname()),
                            str(d_id)
                        ),
                        auth=(rest.login, rest.password)
                    )
                    dag_creation_dates.pop(d_id)
                except Exception as e:
                    LoggingMixin().log.error(str(e))

        Variable.set(key='dag_creation_dates', value=json.dumps(dag_creation_dates))

    except AirflowException:
        raise ConfigVariableNotFoundException()
from datetime import datetime, timedelta
from os import path

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.log.logging_mixin import LoggingMixin

from helpers import SqlQueries
from operators import (DataQualityOperator, DdlRedshiftOperator,
                       DataQualityValidator, LoadFactOperator,
                       LoadDimensionOperator)
from stage_s3_to_redshift_and_validate_subdag import stage_s3_to_redshift_dag

log = LoggingMixin().log

# AWS_KEY = os.environ.get('AWS_KEY')
# AWS_SECRET = os.environ.get('AWS_SECRET')

AIRFLOW_AWS_CREDENTIALS_ID = "aws_credentials"
AIRFLOW_REDSHIFT_CONN_ID = "redshift"

# S3_BUCKET = "udacity-dend"
S3_BUCKET = "victor-nano-sparkify-raw-data-us-west-2"
S3_LOGS_KEY = "log_data"
S3_SONGS_KEY = "song_data"
LOG_JSONPATH = "log_json_path.json"

default_args = {
    'owner': 'Victor Costa',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 11),
    'retries': 3,
    'result_serializer': 'pickle',
    'worker_prefetch_multiplier': 1,
    'task_acks_late': True,
    'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'),
    'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'),
    'broker_url': configuration.get('celery', 'BROKER_URL'),
    'broker_transport_options': {'visibility_timeout': 21600},
    'result_backend': configuration.get('celery', 'CELERY_RESULT_BACKEND'),
    'worker_concurrency': configuration.getint('celery', 'CELERYD_CONCURRENCY'),
}

celery_ssl_active = False
try:
    celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE')
except AirflowConfigException as e:
    log = LoggingMixin().log
    log.warning("Celery Executor will run without SSL")

try:
    if celery_ssl_active:
        broker_use_ssl = {'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                          'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                          'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                          'cert_reqs': ssl.CERT_REQUIRED}
        DEFAULT_CELERY_CONFIG['broker_use_ssl'] = broker_use_ssl
except AirflowConfigException as e:
    raise AirflowException('AirflowConfigException: CELERY_SSL_ACTIVE is True, '
                           'please ensure CELERY_SSL_KEY, '
                           'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
except Exception as e:
    raise AirflowException('Exception: There was an unknown Celery SSL Error. '
# under the License.
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from functools import wraps

import os
import contextlib

from airflow import settings
from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log


@contextlib.contextmanager
def create_session():
    """
    Contextmanager that will create and teardown a session.
    """
    session = settings.Session()
    try:
        yield session
        session.expunge_all()
        session.commit()
    except Exception:
        session.rollback()
        raise