Code example #1
        def _str(s):
            # cloudant-python doesn't support unicode.
            if isinstance(s, unicode):
                log = LoggingMixin().log
                log.debug(
                    'cloudant-python does not support unicode. Encoding %s as '
                    'ascii using "ignore".', s)
                return s.encode('ascii', 'ignore')

            return s
Code example #2
    def handle_failure_retry(context):
        ti = context['ti']
        cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

        if cmd_id is not None:
            cmd = Command.find(cmd_id)
            if cmd is not None:
                if cmd.status == 'running':
                    log = LoggingMixin().log
                    log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                    cmd.cancel()
Code example #3
    def set(
            cls,
            key,
            value,
            execution_date,
            task_id,
            dag_id,
            session=None):
        """保存中间结果
        Store an XCom value.
        TODO: "pickling" has been deprecated and JSON is preferred.
              "pickling" will be removed in Airflow 2.0.
        :return: None
        """
        # Clear all object instances from the session
        session.expunge_all()

        enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
        # Serialize the intermediate result
        if enable_pickling:
            value = pickle.dumps(value)
        else:
            try:
                # Note the encoding conversion
                value = json.dumps(value).encode('UTF-8')
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not serialize the XCOM value into JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise

        # remove any duplicate XComs
        # Delete existing results with the same key
        session.query(cls).filter(
            cls.key == key,
            cls.execution_date == execution_date,
            cls.task_id == task_id,
            cls.dag_id == dag_id).delete()
        session.commit()

        # insert new XCom
        session.add(XCom(
            key=key,
            value=value,
            execution_date=execution_date,
            task_id=task_id,
            dag_id=dag_id))
        session.commit()
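A minimal usage sketch for the setter above, assuming an open SQLAlchemy session (in Airflow this is normally injected by the @provide_session decorator); the key, task and dag names below are placeholders:

    from datetime import datetime

    XCom.set(
        key='qbol_cmd_id',                    # placeholder key
        value={'rows': 42},                   # any JSON- or pickle-serializable value
        execution_date=datetime(2018, 1, 1),
        task_id='run_query',                  # placeholder task
        dag_id='example_dag',                 # placeholder dag
        session=session)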
Code example #4
File: __init__.py  Project: fengzhongzhu1621/xAirflow
def GetDefaultExecutor():
    """Creates a new instance of the configured executor if none exists and returns it"""
    global DEFAULT_EXECUTOR

    if DEFAULT_EXECUTOR is not None:
        return DEFAULT_EXECUTOR

    executor_name = configuration.conf.get('core', 'EXECUTOR')

    DEFAULT_EXECUTOR = _get_executor(executor_name)

    log = LoggingMixin().log
    log.info("Using executor %s", executor_name)

    return DEFAULT_EXECUTOR
Code example #5
    def get_one(cls,
                execution_date,
                key=None,
                task_id=None,
                dag_id=None,
                include_prior_dates=False,
                session=None):
        """获取一条中间结果
        Retrieve an XCom value, optionally meeting certain criteria.
        TODO: "pickling" has been deprecated and JSON is preferred.
              "pickling" will be removed in Airflow 2.0.
        :return: XCom value
        """
        # Build the search filters
        filters = []
        if key:
            filters.append(cls.key == key)
        if task_id:
            filters.append(cls.task_id == task_id)
        if dag_id:
            filters.append(cls.dag_id == dag_id)
        if include_prior_dates:
            filters.append(cls.execution_date <= execution_date)
        else:
            filters.append(cls.execution_date == execution_date)

        query = (
            session.query(cls.value).filter(and_(*filters))
                   .order_by(cls.execution_date.desc(), cls.timestamp.desc()))

        # Fetch the most recent record
        result = query.first()
        if result:
            enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
            if enable_pickling:
                return pickle.loads(result.value)
            else:
                try:
                    # Note the encoding conversion
                    return json.loads(result.value.decode('UTF-8'))
                except ValueError:
                    log = LoggingMixin().log
                    log.error("Could not deserialize the XCOM value from JSON. "
                              "If you are using pickles instead of JSON "
                              "for XCOM, then you need to enable pickle "
                              "support for XCOM in your airflow config.")
                    raise
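A matching retrieval sketch, again assuming an injected session; it returns the deserialized value of the most recent matching row, or None when nothing matches (the identifiers are the same placeholders as above):

    from datetime import datetime

    value = XCom.get_one(
        execution_date=datetime(2018, 1, 1),
        key='qbol_cmd_id',
        task_id='run_query',
        dag_id='example_dag',
        session=session)
    # value == {'rows': 42} if the set() call above ran for the same date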
Code example #6
    def set_dirty(dag_id, session=None):
        """
        :param dag_id: the dag_id to mark dirty
        :param session: database session
        :return:
        """
        # Create a row for every state of the dag, inserting any states
        # missing from the stats table into the DB
        DagStat.create(dag_id=dag_id, session=session)

        try:
            # Take row locks on all state rows for the given dag
            stats = session.query(DagStat).filter(
                DagStat.dag_id == dag_id
            ).with_for_update().all()

            # Set the dirty flag
            for stat in stats:
                stat.dirty = True
            session.commit()
        except Exception as e:
            session.rollback()
            log = LoggingMixin().log
            log.warning("Could not update dag stats for %s", dag_id)
            log.exception(e)
Code example #7
File: email.py  Project: fengzhongzhu1621/xAirflow
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    """发送邮件 ."""
    log = LoggingMixin().log

    SMTP_HOST = configuration.conf.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.conf.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.conf.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.conf.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.conf.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.conf.get('smtp', 'SMTP_PASSWORD')
    except XToolConfigException:
        log.debug(
            "No user/password found for SMTP, so logging in with no authentication."
        )

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST,
                             SMTP_PORT) if SMTP_SSL else smtplib.SMTP(
                                 SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
Code example #8
File: send_email.py  Project: P79N6A/xTool
def send_MIME_email(e_from,
                    e_to,
                    mime_msg,
                    host,
                    port,
                    ssl=False,
                    starttls=False,
                    user=None,
                    password=None,
                    dryrun=False):
    """发送邮件 ."""
    log = LoggingMixin().log

    if not user or not password:
        log.debug(
            "No user/password found for SMTP, so logging in with no authentication."
        )

    if not dryrun:
        s = smtplib.SMTP_SSL(host, port) if ssl else smtplib.SMTP(host, port)
        if starttls:
            s.starttls()
        if user and password:
            s.login(user, password)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
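A usage sketch for this transport-agnostic variant; the host and credentials are placeholders, and the message is built with the standard library's email package:

    from email.mime.text import MIMEText

    msg = MIMEText('Task run_query failed.')
    msg['Subject'] = 'Airflow alert'
    msg['From'] = 'airflow@example.com'
    msg['To'] = 'ops@example.com'

    send_MIME_email('airflow@example.com', ['ops@example.com'], msg,
                    host='smtp.example.com', port=587, starttls=True,
                    user='smtp_user', password='smtp_pass')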
Code example #9
File: __init__.py  Project: fengzhongzhu1621/xAirflow
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        # Use the configured web authentication backend
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except (AirflowConfigException, XToolConfigException):
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated*  behavior of importing airflow_login")
            auth_backend = "airflow_login"

    # Import the authentication module
    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
Code example #10
def execute_command(command):
    """airflow worker 执行shell命令 ."""
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command)
    env = os.environ.copy()
    try:
        # When the celery worker receives the message, run the shell command it contains
        subprocess.check_call(command,
                              shell=True,
                              stderr=subprocess.STDOUT,
                              close_fds=True,
                              env=env)
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)
        raise AirflowException('Celery command failed')
Code example #11
 def wrapper(*args, **kwargs):
     metrics = {}
     metrics['begin_datetime'] = datetime.now()
     metrics['end_datetime'] = None
     metrics['log'] = LoggingMixin().log
     on_pre_execution(**metrics)
     try:
         return f(*args, **kwargs)
     except Exception as e:
         metrics['error'] = e
         raise
     finally:
         metrics['end_datetime'] = datetime.now()
         on_post_execution(**metrics)
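A sketch of how such a wrapper is typically wired up as a decorator, assuming on_pre_execution and on_post_execution are the callback hooks referenced in the body above (the enclosing decorator itself is not shown in the snippet, so the name action_logging is illustrative):

    import functools

    def action_logging(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            ...  # body as above
        return wrapper

    @action_logging
    def run_backfill(*args, **kwargs):
        """Pre/post hooks now fire around every call, even when it raises."""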
Code example #12
    def _to_timestamp(cls, col):
        """
        Convert a column of a dataframe to UNIX timestamps if applicable

        :param col:     A Series object representing a column of a dataframe.
        """
        # Try to convert the column to datetimes. The column MUST have a
        # four-digit year somewhere in the string. There should be a better
        # way to do this, but letting pandas try to convert every column
        # without a format caused it to convert floats as well (for example,
        # a column of integers between 0 and 10 was turned into timestamps).
        # If the column cannot be converted, just return it untouched.
        try:
            col = pd.to_datetime(col)
        except ValueError:
            log = LoggingMixin().log
            log.warning("Could not convert field to timestamps: %s", col.name)
            return col

        # Now convert the newly created datetimes into timestamps. We have
        # to be careful here because NaT cannot be converted to a timestamp,
        # so we have to return NaN instead.
        converted = []
        for i in col:
            try:
                converted.append(i.timestamp())
            except ValueError:
                converted.append(pd.np.NaN)
            except AttributeError:
                converted.append(pd.np.NaN)

        # return a new series that maintains the same index as the original
        return pd.Series(converted, index=col.index)
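A usage sketch, assuming _to_timestamp is a classmethod on some operator class (called SomeOperator here purely for illustration):

    import pandas as pd

    col = pd.Series(['2018-01-01 10:00:00', None], name='created_at')
    converted = SomeOperator._to_timestamp(col)
    # converted[0] is a float UNIX timestamp; converted[1] is NaN (from NaT)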
Code example #13
def _post_sendgrid_mail(mail_data):
    log = LoggingMixin().log
    sg = sendgrid.SendGridAPIClient(apikey=os.environ.get('SENDGRID_API_KEY'))
    response = sg.client.mail.send.post(request_body=mail_data)
    # 2xx status code.
    if 200 <= response.status_code < 300:
        log.info('Email with subject %s is successfully sent to recipients: %s',
                 mail_data['subject'], mail_data['personalizations'])
    else:
        log.warning('Failed to send out email with subject %s, status code: %s',
                    mail_data['subject'], response.status_code)
Code example #14
    def update(dag_ids=None, dirty_only=True, session=None):
        """更新dag每个状态的dag_run的数量,并设置dirty为False
        Updates the stats for dirty/out-of-sync dags

        :param dag_ids: dag_ids to be updated
        :type dag_ids: list
        :param dirty_only: only updated for marked dirty, defaults to True
        :type dirty_only: bool
        :param session: db session to use
        :type session: Session
        """
        try:
            qry = session.query(DagStat)
            if dag_ids:
                qry = qry.filter(DagStat.dag_id.in_(set(dag_ids)))
            # Fetch only the dirty rows
            if dirty_only:
                qry = qry.filter(DagStat.dirty == True) # noqa

            # Take row-level locks
            qry = qry.with_for_update().all()

            # Collect the set of dag ids
            dag_ids = set([dag_stat.dag_id for dag_stat in qry])

            # avoid querying with an empty IN clause
            if not dag_ids:
                session.commit()
                return

            # Count the dag_runs in each state for every dag
            begin_time = datetime.now() - timedelta(
                days=configuration.getint('core', 'sql_query_history_days'))
            dagstat_states = set(itertools.product(dag_ids, State.dag_states))
            qry = (
                session.query(DagRun.dag_id, DagRun.state, func.count('*'))
                .filter(DagRun.dag_id.in_(dag_ids))
                .filter(DagRun.execution_date > begin_time)
                .group_by(DagRun.dag_id, DagRun.state)
            )
            counts = {(dag_id, state): count for dag_id, state, count in qry}

            # Update the dag_run count for each state of every dag_id
            for dag_id, state in dagstat_states:
                count = counts.get((dag_id, state), 0)
                session.merge(
                    DagStat(dag_id=dag_id, state=state, count=count, dirty=False)
                )

            session.commit()
        except Exception as e:
            session.rollback()
            log = LoggingMixin().log
            log.warning("Could not update dag stat table")
            log.exception(e)
Code example #15
 def get_query_results(self):
     log = LoggingMixin().log
     if self.cmd is not None:
         cmd_id = self.cmd.id
         log.info("command id: " + str(cmd_id))
         query_result_buffer = StringIO()
         self.cmd.get_results(fp=query_result_buffer, inline=True, delim=COL_DELIM)
         query_result = query_result_buffer.getvalue()
         query_result_buffer.close()
         return query_result
     else:
         log.info("Qubole command not found")
Code example #16
    def handle_failure_retry(context):
        ti = context['ti']
        cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

        if cmd_id is not None:
            cmd = Command.find(cmd_id)
            if cmd is not None:
                log = LoggingMixin().log
                if cmd.status == 'done':
                    log.info(
                        'Command ID: %s has succeeded, hence marking this '
                        'TI as Success.', cmd_id)
                    ti.state = State.SUCCESS
                elif cmd.status == 'running':
                    log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                    cmd.cancel()
Code example #17
    def filter_for_filesize(result, size=None):
        """
        Will test the filepath result and test if its size is at least self.filesize

        :param result: a list of dicts returned by Snakebite ls
        :param size: the file size in MB a file should be at least to trigger True
        :return: (bool) depending on the matching criteria
        """
        if size:
            log = LoggingMixin().log
            log.debug('Filtering for file size >= %s in files: %s', size,
                      map(lambda x: x['path'], result))
            size *= settings.MEGABYTE
            result = [x for x in result if x['length'] >= size]
            log.debug('HdfsSensor.poke: after size filter result is %s',
                      result)
        return result
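A quick sketch with fabricated Snakebite-style entries, assuming this is a static helper on HdfsSensor (as the debug messages suggest) and that settings.MEGABYTE == 1024 ** 2; 'length' is in bytes:

    result = [
        {'path': '/data/part-0000', 'length': 5 * 1024 ** 2},  # 5 MB, kept
        {'path': '/data/part-0001', 'length': 512},            # 512 B, dropped
    ]
    kept = HdfsSensor.filter_for_filesize(result, size=1)  # threshold: 1 MB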
Code example #18
 def get_val(self):
     """获得常量的值 ."""
     log = LoggingMixin().log
     if self._val and self.is_encrypted:
         try:
             # Decrypt the stored value
             fernet_key = configuration.conf.get('core', 'FERNET_KEY')
             fernet = get_fernet(fernet_key)
             return fernet.decrypt(bytes(self._val, 'utf-8')).decode()
         except InvalidFernetToken:
             # Return None when decryption fails
             log.error("Can't decrypt _val for key={}, invalid token "
                       "or value".format(self.key))
             return None
         except Exception:
             log.error("Can't decrypt _val for key={}, FERNET_KEY "
                       "configuration missing".format(self.key))
             return None
     else:
         return self._val
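A round-trip sketch of the underlying Fernet mechanics using the cryptography package directly; the key normally comes from the FERNET_KEY option in the [core] section of airflow.cfg:

    from cryptography.fernet import Fernet

    fernet_key = Fernet.generate_key()      # stand-in for the configured FERNET_KEY
    fernet = Fernet(fernet_key)
    token = fernet.encrypt(b'my-secret')    # what would be stored in _val
    print(fernet.decrypt(token).decode())   # -> 'my-secret'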
Code example #19
def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func):
    log = LoggingMixin().log

    for i in range(0, max_n):
        try:
            response = request.execute()
            if is_error_func(response):
                raise ValueError(
                    'The response contained an error: {}'.format(response)
                )
            elif is_done_func(response):
                log.info('Operation is done: %s', response)
                return response
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
        except errors.HttpError as e:
            if e.resp.status != 429:
                log.info('Something went wrong. Not retrying: %s', e)
                raise
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
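A minimal sketch with a stubbed request object; in real use the request would be a Google API client call whose execute() returns the operation status, and the two predicates below are illustrative:

    class FakeRequest:
        def execute(self):
            return {'done': True, 'response': 'ok'}

    result = _poll_with_exponential_delay(
        request=FakeRequest(),
        max_n=5,
        is_done_func=lambda resp: resp.get('done'),
        is_error_func=lambda resp: 'error' in resp)
    # returns {'done': True, 'response': 'ok'} on the first iteration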
Code example #20
    def create(dag_id, session=None):
        """将统计表中不存在的状态插入到db中

        Creates the missing states the stats table for the dag specified

        :param dag_id: dag id of the dag to create stats for
        :param session: database session
        :return:
        """
        # Fetch the states already present in DagStat
        qry = session.query(DagStat).filter(DagStat.dag_id == dag_id).all()
        states = {dag_stat.state for dag_stat in qry}
        # Iterate over all states and find those missing from the database
        states_not_found = set(State.dag_states) - states
        for state in states_not_found:
            try:
                session.merge(DagStat(dag_id=dag_id, state=state))
                session.commit()
            except Exception as e:
                session.rollback()
                log = LoggingMixin().log
                log.warning("Could not create stat record")
                log.exception(e)
Code example #21
    def filter_for_ignored_ext(result, ignored_ext, ignore_copying):
        """
        Will filter if instructed to do so the result to remove matching criteria

        :param result: (list) of dicts returned by Snakebite ls
        :param ignored_ext: (list) of ignored extensions
        :param ignore_copying: (bool) shall we ignore ?
        :return: (list) of dicts which were not removed
        """
        if ignore_copying:
            log = LoggingMixin().log
            regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext)
            ignored_extensions_regex = re.compile(regex_builder)
            log.debug(
                'Filtering result for ignored extensions: %s in files %s',
                ignored_extensions_regex.pattern,
                map(lambda x: x['path'], result))
            result = [
                x for x in result
                if not ignored_extensions_regex.match(x['path'])
            ]
            log.debug('HdfsSensor.poke: after ext filter result is %s', result)
        return result
Code example #22
# specific language governing permissions and limitations
# under the License.

import ssl

from airflow import configuration
from airflow.exceptions import AirflowConfigException, AirflowException
from xTool.exceptions import XToolConfigException
from xTool.utils.log.logging_mixin import LoggingMixin


def _broker_supports_visibility_timeout(url):
    return url.startswith("redis://") or url.startswith("sqs://")


log = LoggingMixin().log

broker_url = configuration.conf.get('celery', 'BROKER_URL')
# If a task is not acknowledged within this window after being sent, it is re-dispatched to another worker
broker_transport_options = configuration.conf.getsection(
    'celery_broker_transport_options')
if 'visibility_timeout' not in broker_transport_options:
    if _broker_supports_visibility_timeout(broker_url):
        broker_transport_options = {'visibility_timeout': 21600}

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],  # accepted message content types
    'event_serializer': 'json',  # serialization format for event messages
    'worker_prefetch_multiplier': 1,  # each worker prefetches only one message at a time
Code example #23
import six
from six import iteritems
import warnings
from zope.deprecation import deprecated as _deprecated

from xTool.crypto.fernet import generate_fernet_key
from xTool.exceptions import AirflowConfigException
from xTool.utils.helpers import expand_env_var
from xTool.utils.configuration import read_config_file
from xTool.utils.configuration import XToolConfigParser
from xTool.utils.file import mkdir_p
from xTool.utils.log.logging_mixin import LoggingMixin

standard_library.install_aliases()

log = LoggingMixin().log

# Control how deprecation warnings are emitted
# show Airflow's deprecation warnings
warnings.filterwarnings(action='default',
                        category=DeprecationWarning,
                        module='airflow')
warnings.filterwarnings(action='default',
                        category=PendingDeprecationWarning,
                        module='airflow')


def parameterized_config(template):
    """使用全局变量和局部变量渲染模版字符串
    Generates a configuration from the provided template + variables defined in
    current scope
Code example #24
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from xTool.utils.dates import days_ago
from xTool.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'airflow', 'start_date': days_ago(2)}

    dag = DAG(dag_id='example_kubernetes_operator',
              default_args=args,
              schedule_interval=None)

    k = KubernetesPodOperator(namespace='default',
                              image="ubuntu:16.04",
                              cmds=["bash", "-cx"],
Code example #25
File: file.py  Project: P79N6A/xTool
def list_py_file_paths(directory,
                       followlinks=True,
                       ignore_filename='.ignore',
                       file_ext='.py',
                       safe_mode=False,
                       safe_filters=(b'xTool', b'XTool')):
    """递归遍历目录,返回匹配规则的文件列表
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
    contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns = []
        # Walk the directory tree recursively, following symlinks
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            # Read the ignore file if one is present
            ignore_file = [f for f in files if f == ignore_filename]
            if ignore_file:
                with open(os.path.join(root, ignore_file[0]), 'r') as f:
                    patterns += [p.strip() for p in f.read().split('\n') if p]
            for f in files:
                try:
                    # Build the file's absolute path
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    # Check the file extension
                    mod_name, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Apply the ignore patterns
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition: the file must contain all of the
                    # safe_filters strings.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in safe_filters])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
Code example #26
# specific language governing permissions and limitations
# under the License.
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from airflow import configuration
from xTool.utils.log.logging_mixin import LoggingMixin
from xTool.utils.module_loading import prepare_classpath
from xTool.utils.module_loading import make_module
from xTool.plugins_manager import XToolPlugin
from xTool.plugins_manager import import_plugins

log = LoggingMixin().log


class AirflowPlugin(XToolPlugin):
    pass


# Resolve the plugins directory
plugins_folder = configuration.conf.get('core', 'plugins_folder')
if not plugins_folder:
    plugins_folder = configuration.conf.get('core', 'airflow_home') + '/plugins'
# Add the plugins directory to the system path
prepare_classpath(plugins_folder)

# Import the plugins
plugins = import_plugins(plugins_folder)
Code example #27
def list_py_file_paths(
    directory,
    followlinks=True,
    ignore_filename='.ignore',
    file_ext='.py',
    safe_mode=False,
    safe_filters=(b'xTool', b'XTool')):
    """递归遍历目录,返回匹配规则的文件列表
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
    contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        # Walk the directory tree recursively, following symlinks
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            patterns = patterns_by_dir.get(root, [])
            # Read the ignore file if one is present
            ignore_file = os.path.join(root, ignore_filename)
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + \
                        [p for p in f.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    # Build the file's absolute path
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    # Check the file extension
                    mod_name, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Apply the ignore patterns
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition: the file must contain all of the
                    # safe_filters strings.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in safe_filters])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
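A usage sketch with a hypothetical DAGs folder; with safe_mode enabled, files that do not contain every safe_filters string are skipped:

    dag_files = list_py_file_paths('/path/to/dags',
                                   safe_mode=True,
                                   safe_filters=(b'DAG', b'airflow'))
    for path in dag_files:
        print(path)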
Code example #28
# under the License.

from hdfs import InsecureClient, HdfsError

from airflow import configuration
from airflow.exceptions import AirflowException
from airflow.hooks.base_hook import BaseHook
from xTool.utils.log.logging_mixin import LoggingMixin

_kerberos_security_mode = configuration.conf.get("core",
                                                 "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().log
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
        self.proxy_user = proxy_user
Code example #29
File: plugins_manager.py  Project: P79N6A/xTool
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from builtins import object
import imp
import inspect
import os
import re
import sys

from xTool import configuration
from xTool.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log


class XToolPluginException(Exception):
    pass


class XToolPlugin(object):
    name = None
    operators = []
    sensors = []
    hooks = []
    executors = []
    macros = []
    admin_views = []
    flask_blueprints = []
Code example #30
 def get_connection(cls, conn_id):
     conn = random.choice(cls.get_connections(conn_id))
     if conn.host:
         log = LoggingMixin().log
         log.info("Using connection to: %s", conn.host)
     return conn
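A usage sketch, assuming this classmethod lives on Airflow's BaseHook (the snippet only shows cls); the connection id is a placeholder, and one entry is picked at random when several connections share the same id:

    conn = BaseHook.get_connection('my_postgres')
    print(conn.host, conn.port)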