import os
import re
import zipfile

from airflow.configuration import conf
from airflow.utils.log.logging_mixin import LoggingMixin


def list_py_file_paths(directory, safe_mode=True, include_examples=None):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    if include_examples is None:
        # Resolve the default lazily so the config is not read at import time.
        include_examples = conf.getboolean('core', 'LOAD_EXAMPLES')
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as fp:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [p for p in fp.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d for d in dirs
                if not any(re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        # Use a distinct name for the handle so the loop
                        # variable ``f`` is not shadowed.
                        with open(file_path, 'rb') as fp:
                            content = fp.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
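# Hedged usage sketch for the function above: scan a dags folder without
# loading the bundled example DAGs. The path is invented for illustration.
dag_files = list_py_file_paths('/opt/airflow/dags', safe_mode=True,
                               include_examples=False)
for path in dag_files:
    print(path)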
from typing import Dict

import attr
import jinja2
from cattr import structure, unstructure

from airflow.models.base import Operator
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.module_loading import import_string

ENV = jinja2.Environment()

PIPELINE_OUTLETS = "pipeline_outlets"
PIPELINE_INLETS = "pipeline_inlets"
AUTO = "auto"

log = LoggingMixin().log


@attr.s(auto_attribs=True)
class Metadata:
    """Class for serialized entities."""

    type_name: str = attr.ib()
    data: Dict = attr.ib()


def _get_instance(meta: Metadata):
    """Instantiate an object from Metadata."""
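# Hedged sketch (not from the original module): Metadata round-trips through
# cattr's structure/unstructure; the payload dict here is invented.
meta = Metadata(type_name="airflow.models.connection.Connection",
                data={"conn_id": "rest"})
raw = unstructure(meta)              # -> plain dict, e.g. for JSON storage
restored = structure(raw, Metadata)  # back to a Metadata instance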
# Classmethod excerpt from a BaseHook-style hook; assumes ``import random``
# and LoggingMixin are available at module level.
def get_connection(cls, conn_id):
    # Several connections may share the same conn_id; one is picked at
    # random to spread load across them.
    conn = random.choice(cls.get_connections(conn_id))
    if conn.host:
        log = LoggingMixin().log
        log.info("Using connection to: %s", conn.host)
    return conn
import datetime
import logging
from typing import TYPE_CHECKING, Dict, Optional, Union

import pendulum
from dateutil import relativedelta

import airflow
from airflow.exceptions import AirflowException
from airflow.models.baseoperator import BaseOperator
from airflow.models.connection import Connection
from airflow.models.dag import DAG
from airflow.serialization.enums import DagAttributeTypes as DAT, Encoding
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.www.utils import get_python_source

LOG = LoggingMixin().log

# Serialization failure returns 'failed'.
FAILED = 'serialization_failed'


class Serialization:
    """Serialization provides utils for serialization."""

    # JSON primitive types.
    _primitive_types = (int, bool, float, str)

    # Time types.
    # datetime.date and datetime.time are converted to strings.
    _datetime_types = (datetime.datetime,)
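# Sketch (an assumption, not code from the original class): how the type
# tuples above are typically used to branch while serializing a value.
def _serialize_scalar(value):
    if isinstance(value, Serialization._primitive_types):
        return value  # JSON primitives pass through unchanged
    if isinstance(value, Serialization._datetime_types):
        return value.isoformat()  # time types become strings
    return FAILED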
import json
from datetime import datetime, timedelta

import pendulum
from airflow.exceptions import AirflowException
from airflow.hooks.base_hook import BaseHook
from airflow.models import Variable
from airflow.utils.log.logging_mixin import LoggingMixin

# Project-specific helpers assumed importable from the surrounding package:
# is_empty, one_month_ago, ServiceNowHook, InvalidArguments,
# ConfigVariableNotFoundException, ServiceNowConnectionNotFoundException.


def fetch_servicenow_record_count(table_name, execution_date, **kwargs):
    """
    Call the ServiceNow API for a particular table and time period and get
    the count of records in that window.

    :param table_name: table for which the record count is fetched
    :param execution_date: airflow execution date of the dag
    :return: the task_id of the downstream task to follow (branching)
    """
    # check for None first, then for empty values
    if table_name is None or execution_date is None:
        raise InvalidArguments("table_name, execution_date can't be None")
    if is_empty(table_name) or is_empty(execution_date):
        raise InvalidArguments("table_name, execution_date can't be empty")

    log = LoggingMixin().log
    try:
        try:
            # Load configuration data
            config = json.loads(Variable.get("config"))
            frequency = config['frequency']
            execution_datetime = datetime.strptime(execution_date[:19],
                                                   "%Y-%m-%dT%H:%M:%S")

            if frequency == 'hourly':
                freq_param = timedelta(hours=-1)
            elif frequency == 'daily':
                freq_param = timedelta(days=-1)
            elif frequency == 'monthly':
                freq_param = timedelta(days=-1 * one_month_ago(execution_date))
            elif frequency == 'half-hourly':
                freq_param = timedelta(minutes=-30)
            else:
                freq_param = timedelta(hours=-1)

            to_time = datetime(
                year=execution_datetime.year,
                month=execution_datetime.month,
                day=execution_datetime.day,
                hour=execution_datetime.hour,
                minute=execution_datetime.minute,
                second=execution_datetime.second,
                tzinfo=pendulum.timezone("UTC"))
            from_time = to_time + freq_param
        except KeyError:
            raise ConfigVariableNotFoundException()

        try:
            credentials_snow = BaseHook.get_connection("servicenow_default")
            login = credentials_snow.login
            password = credentials_snow.password
            host = credentials_snow.host
        except AirflowException:
            raise ServiceNowConnectionNotFoundException()

        service_now_hook = ServiceNowHook(host=host, login=login,
                                          password=password)
        response = service_now_hook.api_call(
            route='/api/now/stats/{}'.format(table_name),
            accept='application/json',
            query_params={
                'sysparm_count': 'true',
                'sysparm_query':
                    "sys_updated_onBETWEENjavascript:gs.dateGenerate('{}','{}')"
                    "@javascript:gs.dateGenerate('{}','{}')".format(
                        str(from_time.date()), str(from_time.time()),
                        str(to_time.date()), str(to_time.time()))
            })
        log.info("response: %s", response)

        count_of_records = int(json.loads(response)['result']['stats']['count'])

        log.info("Getting count from: %s to: %s", from_time, to_time)
        log.info("Total number of records: %s", count_of_records)

        if count_of_records == 0:
            return 'count_is_zero'
        elif count_of_records > config['threshold']:
            return 'count_exceeds_threshold'
        else:
            return 'count_within_threshold'

    except Exception as e:
        # Record the failure in an Airflow Variable so a recovery dag can
        # pick it up, then re-raise.
        kwargs['ti'].xcom_push(key='exception', value=str(e))

        instance = kwargs['task_instance']
        dag_id = str(instance.dag_id)
        task_id = str(instance.task_id)
        msg = str(e)
        execution_date = str(instance.execution_date).replace('T', ' ')[0:19]
        run_id = str(kwargs['run_id'])

        key = '{}${}'.format(execution_date, dag_id)
        value = {
            'dag_id': dag_id,
            'execution_date': execution_date,
            'task_id': task_id,
            'run_id': run_id,
            'error_msg': msg
        }
        Variable.set(key=key, value=json.dumps(value))
        raise
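# Hedged usage sketch: the callable above returns a task_id, which is the
# contract of BranchPythonOperator. The dag object and table name are
# assumptions for illustration.
from airflow.operators.python_operator import BranchPythonOperator

branch = BranchPythonOperator(
    task_id='fetch_record_count',
    python_callable=lambda **ctx: fetch_servicenow_record_count(
        'incident', ctx['ts'], **ctx),  # 'incident' is an example table
    provide_context=True,
    dag=dag,  # assumes a surrounding DAG definition
)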
# Classmethod excerpt (typed variant of the snippet above); assumes
# ``import random`` and LoggingMixin at module level.
def get_connection(cls, conn_id):
    # type: (str) -> Connection
    conn = random.choice(list(cls.get_connections(conn_id)))
    if conn.host:
        log = LoggingMixin().log
        log.info("Using connection to: %s", conn.debug_info())
    return conn
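# Hedged usage sketch: the method is reached through BaseHook, so any hook
# subclass inherits the random choice among same-named connections.
from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('servicenow_default')  # conn_id assumed
print(conn.host, conn.login)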
# Method excerpt from ServiceNow2DropBoxTransOperator; assumes the usual
# imports (os, dropbox, datetime/timedelta, BaseHook, AirflowException,
# LoggingMixin) and the custom DropboxConnectionNotFoundException.
def _upload(self, context):
    # Dropbox connection details
    try:
        credentials_dropbox = BaseHook.get_connection(self.storage_conn_id)
        self.dropbox_access_token = credentials_dropbox.password
    except AirflowException:
        raise DropboxConnectionNotFoundException()

    if not self.is_storage_available(self.dropbox_access_token):
        LoggingMixin().log.info("Dropbox storage not available")
        return False

    try:
        LoggingMixin().log.info("Dropbox storage available")

        l_file_path = self.file_name.replace('.csv', '.json')
        file_name = l_file_path[l_file_path.rfind('/') + 1:]

        dt_current = datetime.strptime(self.execution_date[:19],
                                       "%Y-%m-%dT%H:%M:%S")
        # A midnight execution belongs to the previous day's folder; both
        # branches of the original if/else built the same path otherwise.
        if dt_current.hour == 0 and dt_current.minute == 0 \
                and dt_current.second == 0:
            dt_current = dt_current - timedelta(days=1)

        r_file_path = '{}/{}/{}/{}/{}'.format(
            '/mbrs', 'Servicenow', self.table,
            '{}-{}-{}'.format(dt_current.year, dt_current.month,
                              dt_current.day),
            file_name)

        LoggingMixin().log.info("Running dropbox upload process...")
        try:
            file_size = os.path.getsize(l_file_path)
            CHUNK_SIZE = 4 * 1024 * 1024
            dbx = dropbox.Dropbox(self.dropbox_access_token, timeout=600)

            if file_size <= CHUNK_SIZE:
                # Small file: single-shot upload.
                with open(l_file_path, 'rb') as f:
                    dbx.files_upload(f.read(), r_file_path,
                                     mode=dropbox.files.WriteMode.overwrite)
                return True

            # Large file: chunked upload session.
            with open(l_file_path, 'rb') as f:
                upload_session_start_result = dbx.files_upload_session_start(
                    f.read(CHUNK_SIZE))
                cursor = dropbox.files.UploadSessionCursor(
                    session_id=upload_session_start_result.session_id,
                    offset=f.tell())
                commit = dropbox.files.CommitInfo(path=r_file_path)
                while f.tell() < file_size:
                    if (file_size - f.tell()) <= CHUNK_SIZE:
                        dbx.files_upload_session_finish(
                            f.read(CHUNK_SIZE), cursor, commit)
                    else:
                        dbx.files_upload_session_append_v2(
                            f.read(CHUNK_SIZE), cursor)
                        cursor.offset = f.tell()
            return True
        except Exception as e:
            # Avoid logging the access token itself.
            LoggingMixin().log.error(
                "ServiceNow2DropBoxTransOperator: exception in dropbox "
                "upload: %s", e)
            return False
    except Exception as e:
        LoggingMixin().log.error(str(e))
        return False
from airflow import configuration, models, settings
from airflow.models import Connection
from airflow.utils.log.logging_mixin import LoggingMixin


def create_airflow_rest_connection():
    from airflow.contrib.auth.backends.password_auth import PasswordUser
    import base64
    import os

    session = settings.Session()
    exists = session.query(models.User).filter(
        models.User.username == 'application').scalar()

    if exists is None:
        LoggingMixin().log.info("creating 'application' user for mini-BRS...")

        # create 'application' user
        random_key = base64.urlsafe_b64encode(os.urandom(32)).decode()
        user = PasswordUser(models.User())
        user.username = 'application'  # matches the existence check above
        user.email = '*****@*****.**'
        user.password = random_key
        session.add(user)
        session.commit()
        session.close()

        # create 'application' airflow connection
        rest = Connection(
            conn_id='rest',
            login='application',
            password=random_key
        )

        session = settings.Session()
        session.add(rest)
        session.commit()
        session.close()

        # create 'admin' user
        # admin_password = str(base64.urlsafe_b64encode(os.urandom(32)))
        config_parser = configuration.AirflowConfigParser()
        config_parser.read(
            configuration.get_airflow_config(
                configuration.get_airflow_home()
            )
        )
        u = config_parser.get(section='core', key='username')
        p = config_parser.get(section='core', key='password')

        user = PasswordUser(models.User())
        user.username = u
        user.email = '*****@*****.**'
        user.password = p
        user.superuser = True

        session = settings.Session()
        session.add(user)
        session.commit()
        session.close()

        # remove the bootstrap credentials from airflow.cfg once consumed
        config_parser.remove_option(section='core', option='username')
        config_parser.remove_option(section='core', option='password')
        with open(configuration.get_airflow_config(
                configuration.get_airflow_home()), 'w') as f:
            config_parser.write(f)
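# Hedged sketch: how the 'rest' connection created above is typically
# consumed later (dag_generator does this to call the experimental API).
# The URL is an assumption for illustration.
import requests
from airflow.hooks.base_hook import BaseHook

rest = BaseHook.get_connection('rest')
requests.get('http://localhost:8080/api/experimental/test',
             auth=(rest.login, rest.password))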
import json
import os
import socket

import requests
from jinja2 import Template

from airflow import configuration, settings
from airflow.exceptions import AirflowException
from airflow.models import Variable
from airflow.utils.log.logging_mixin import LoggingMixin

# Module-level names assumed from the surrounding dag_generator module:
# config, r_config, storage_type, rest, Dags, get_start_date,
# is_email_notification_required, ConfigVariableNotFoundException.


def create_dags():
    global dag_creation_dates
    global new_dags
    global email_notify_required

    new_dags = []
    dag_creation_dates = json.loads(Variable.get(key='dag_creation_dates'))
    email_notify_required = is_email_notification_required()

    try:
        for table in config.get('tables'):
            with open(configuration.get_airflow_home()
                      + '/dags/templates/main.py.jinja2') as file_:
                template = Template(file_.read())

            if dag_creation_dates.get(table) is not None:
                start_date = dag_creation_dates.get(table)
            else:
                start_date = get_start_date(config.get('start_date'))
                dag_creation_dates[table] = str(start_date)

            output = template.render(
                data={
                    'dag_id': table,
                    'frequency': config.get('frequency'),
                    'storage_type': storage_type,
                    'start_date': start_date,
                    'email_required': email_notify_required
                }
            )

            dag_file = 'dag_' + '{}'.format(table).replace(' ', '_') + '.py'
            with open(configuration.get_airflow_home()
                      + '/dags/generated/' + dag_file, 'w') as f:
                f.write(output)
            new_dags.append(dag_file)

        if len(r_config) != 0:
            for table in r_config:
                for exec_date in r_config.get(table):
                    execution_date = str(exec_date).replace(' ', 'T')[0:19]
                    with open(configuration.get_airflow_home()
                              + '/dags/templates/recovery_template.py.jinja2'
                              ) as file_:
                        template = Template(file_.read())

                    output = template.render(
                        data={
                            'dag_id': table,
                            'frequency': config.get('frequency'),
                            'storage_type': storage_type,
                            'execution_date': execution_date
                        }
                    )

                    r_dag_file = ('r_dag_' + '{}_{}'.format(
                        table, execution_date).replace(' ', '_') + '.py')
                    with open(configuration.get_airflow_home()
                              + '/dags/generated/' + r_dag_file, 'w') as f:
                        f.write(output)
                    new_dags.append(r_dag_file)

        # Remove generated dag files (and their metadata entries) that are
        # no longer wanted.
        md_dag_ids = settings.Session.query(Dags.dag_id, Dags.fileloc).all()

        for record in md_dag_ids:
            (d_id, loc) = record
            filename = loc[str(loc).rfind('/') + 1:]
            if filename in ('dag_generator.py', 'dag_cleanup.py'):
                continue
            if filename not in new_dags:
                try:
                    if os.path.exists(str(loc)):
                        os.remove(str(loc))
                    else:
                        LoggingMixin().log.warning(
                            "{} file doesn't exist!".format(filename))
                    requests.delete(
                        url="http://{}:8080/api/experimental/dags/{}".format(
                            socket.gethostbyname(socket.gethostname()),
                            str(d_id)
                        ),
                        auth=(rest.login, rest.password)
                    )
                    dag_creation_dates.pop(d_id)
                except Exception as e:
                    LoggingMixin().log.error(str(e))

        Variable.set(key='dag_creation_dates',
                     value=json.dumps(dag_creation_dates))

    except AirflowException:
        raise ConfigVariableNotFoundException()
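# Minimal render sketch, matching the Template(...).render(data={...}) calls
# above; the template string here is invented for illustration.
from jinja2 import Template

t = Template("dag_id = '{{ data.dag_id }}'  # runs {{ data.frequency }}")
print(t.render(data={'dag_id': 'incident', 'frequency': 'hourly'}))
# -> dag_id = 'incident'  # runs hourly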