Beispiel #1
0
    def set_dirty(dag_id, session=None):
        """Mark every per-state stat row of the given DAG as dirty.

        :param dag_id: the dag_id to mark dirty
        :param session: database session
        :return:
        """
        # Ensure a stat row exists for every possible state before locking.
        DagStat.create(dag_id=dag_id, session=session)

        try:
            # Acquire row-level locks on all stat rows of this DAG, then
            # flag each one as dirty so a later update() refreshes it.
            dag_stats = (
                session.query(DagStat)
                .filter(DagStat.dag_id == dag_id)
                .with_for_update()
                .all()
            )
            for dag_stat in dag_stats:
                dag_stat.dirty = True
            session.commit()
        except Exception as e:
            session.rollback()
            log = LoggingMixin().log
            log.warning("Could not update dag stats for %s", dag_id)
            log.exception(e)
Beispiel #2
0
    def update(dag_ids=None, dirty_only=True, session=None):
        """Refresh per-state DagRun counts for dags and reset their dirty flag.

        Updates the stats for dirty/out-of-sync dags

        :param dag_ids: dag_ids to be updated
        :type dag_ids: list
        :param dirty_only: only updated for marked dirty, defaults to True
        :type dirty_only: bool
        :param session: db session to use
        :type session: Session
        """
        try:
            qry = session.query(DagStat)
            if dag_ids:
                qry = qry.filter(DagStat.dag_id.in_(set(dag_ids)))
            # Restrict to rows previously flagged dirty, if requested.
            if dirty_only:
                qry = qry.filter(DagStat.dirty == True) # noqa

            # Take row-level locks on the selected stat rows.
            qry = qry.with_for_update().all()

            # Collect the dag ids actually selected (may be fewer than requested).
            dag_ids = set([dag_stat.dag_id for dag_stat in qry])

            # avoid querying with an empty IN clause
            if not dag_ids:
                session.commit()
                return

            # Count dag runs per (dag_id, state) inside the configured history
            # window.
            # NOTE(review): datetime.now() is naive — presumably
            # DagRun.execution_date is stored naive as well; confirm.
            begin_time = datetime.now() - timedelta(days=configuration.getint('core', 'sql_query_history_days'))
            dagstat_states = set(itertools.product(dag_ids, State.dag_states))
            qry = (
                session.query(DagRun.dag_id, DagRun.state, func.count('*'))
                .filter(DagRun.dag_id.in_(dag_ids))
                .filter(DagRun.execution_date > begin_time)
                .group_by(DagRun.dag_id, DagRun.state)
            )
            counts = {(dag_id, state): count for dag_id, state, count in qry}

            # Write back a count for every (dag_id, state) pair — states with
            # no runs get 0 — and clear the dirty flag in the same merge.
            for dag_id, state in dagstat_states:
                count = counts.get((dag_id, state), 0)
                session.merge(
                    DagStat(dag_id=dag_id, state=state, count=count, dirty=False)
                )

            session.commit()
        except Exception as e:
            session.rollback()
            log = LoggingMixin().log
            log.warning("Could not update dag stat table")
            log.exception(e)
Beispiel #3
0
def execute_command(command):
    """Execute a shell command received by an airflow Celery worker.

    :param command: shell command string taken from the Celery message
    :raises AirflowException: if the command exits with a non-zero status
    """
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command)
    worker_env = os.environ.copy()
    try:
        # Run the message's command through the shell, folding stderr
        # into stdout.
        # NOTE(review): shell=True executes an arbitrary shell string;
        # the command must come from a trusted queue.
        subprocess.check_call(
            command,
            shell=True,
            stderr=subprocess.STDOUT,
            close_fds=True,
            env=worker_env,
        )
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)
        raise AirflowException('Celery command failed')
Beispiel #4
0
    def create(dag_id, session=None):
        """Insert the per-state stat rows that are missing for a DAG.

        Creates the missing states the stats table for the dag specified

        :param dag_id: dag id of the dag to create stats for
        :param session: database session
        :return:
        """
        # States already recorded in the stats table for this DAG.
        existing_states = {
            row.state
            for row in session.query(DagStat).filter(DagStat.dag_id == dag_id).all()
        }
        # Insert one row per state that is not yet present; each insert is
        # committed independently so one failure does not block the rest.
        for missing_state in set(State.dag_states) - existing_states:
            try:
                session.merge(DagStat(dag_id=dag_id, state=missing_state))
                session.commit()
            except Exception as e:
                session.rollback()
                log = LoggingMixin().log
                log.warning("Could not create stat record")
                log.exception(e)
Beispiel #5
0
            # Add any XToolPlugin subclasses found in the loaded module to
            # the plugins list.
            # Iterate over the module's attribute values.
            for obj in list(m.__dict__.values()):
                # Only classes that subclass XToolPlugin — excluding the
                # base class itself — count as plugins.
                if (
                        inspect.isclass(obj) and
                        issubclass(obj, XToolPlugin) and
                        obj is not XToolPlugin):
                    # Check that the subclass defines the required 'name'
                    # static attribute.
                    obj.validate()
                    # Register the plugin class once (no duplicates).
                    if obj not in plugins:
                        plugins.append(obj)

        except Exception as e:
            log.exception(e)
            log.error('Failed to import plugin %s', filepath)


def make_module(name, objects):
    """Dynamically create a module.

    :param name: module name (lowercased before use)
    :param objects: list of objects the module should contain
    """
    log.debug('Creating module %s', name)
    name = name.lower()
    # Create a new, empty module object.
    module = imp.new_module(name)
    # Record the plugin name (last dotted component) on the module.
    module._name = name.split('.')[-1]
Beispiel #6
0
def list_py_file_paths(
    directory,
    followlinks=True,
    ignore_filename='.ignore',
    file_ext='.py',
    safe_mode=False,
    safe_filters=(b'xTool', b'XTool')):
    """Recursively traverse a directory and collect matching Python files.
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param followlinks: whether os.walk should follow symlinks
    :param ignore_filename: per-directory file holding regex ignore patterns
    :param file_ext: extension a file must have (zip archives always pass)
    :param safe_mode: whether to use a heuristic to determine whether a file
    contains Airflow DAG definitions
    :param safe_filters: byte strings that must ALL appear in a file for it
        to pass the safe_mode heuristic
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        # Walk recursively, following symlinked directories if requested.
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            patterns = patterns_by_dir.get(root, [])
            # Load ignore patterns declared in this directory, if any.
            ignore_file = os.path.join(root, ignore_filename)
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as ignore_fh:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + \
                        [p for p in ignore_fh.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for file_name in files:
                try:
                    # Absolute path of the candidate file.
                    file_path = os.path.join(root, file_name)
                    if not os.path.isfile(file_path):
                        continue
                    # Extension check: keep matching files and zip archives.
                    _, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Skip files matching any accumulated ignore pattern.
                    if any(re.findall(p, file_path) for p in patterns):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition: all safe_filters tokens must
                    # appear in the raw file bytes.
                    # BUG FIX: the file handle no longer shadows the loop
                    # variable, so the except clause logs the file name
                    # instead of a closed file object.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as content_fh:
                            content = content_fh.read()
                            might_contain_dag = all(
                                s in content for s in safe_filters)

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", file_name)
    return file_paths
Beispiel #7
0
def list_py_file_paths(directory,
                       followlinks=True,
                       ignore_filename='.ignore',
                       file_ext='.py',
                       safe_mode=False,
                       safe_filters=(b'xTool', b'XTool')):
    """Recursively traverse a directory and collect matching Python files.
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param followlinks: whether os.walk should follow symlinks
    :param ignore_filename: per-directory file holding regex ignore patterns
    :param file_ext: extension a file must have (zip archives always pass)
    :param safe_mode: whether to use a heuristic to determine whether a file
    contains Airflow DAG definitions
    :param safe_filters: byte strings that must ALL appear in a file for it
        to pass the safe_mode heuristic
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns = []
        # Walk recursively, following symlinked directories if requested.
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            # Accumulate ignore patterns declared in this directory, if any.
            # BUG FIX: the ignore file is now read via a context manager so
            # the handle is closed even if reading raises.
            if ignore_filename in files:
                with open(os.path.join(root, ignore_filename), 'r') as ignore_fh:
                    patterns += [p.strip() for p in ignore_fh.read().split('\n') if p]
            for file_name in files:
                try:
                    # Absolute path of the candidate file.
                    file_path = os.path.join(root, file_name)
                    if not os.path.isfile(file_path):
                        continue
                    # Extension check: keep matching files and zip archives.
                    _, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Skip files matching any accumulated ignore pattern.
                    if any(re.findall(p, file_path) for p in patterns):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition: all safe_filters tokens must
                    # appear in the raw file bytes.
                    # BUG FIX: the file handle no longer shadows the loop
                    # variable, so the except clause logs the file name
                    # instead of a closed file object.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as content_fh:
                            content = content_fh.read()
                            might_contain_dag = all(
                                s in content for s in safe_filters)

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", file_name)
    return file_paths