Example #1
@contextmanager
def _capture_task_logs(ti):
    """Manage logging context for a task run

    - Replace the root logger configuration with the airflow.task configuration
      so we can capture logs from any custom loggers used in the task.

    - Redirect stdout and stderr to the task instance log, as INFO and WARNING
      level messages, respectively.

    """
    modify = not settings.DONOT_MODIFY_HANDLERS

    if modify:
        root_logger = logging.getLogger()
        task_logger = logging.getLogger('airflow.task')

        orig_level = root_logger.level
        root_logger.setLevel(task_logger.level)
        orig_handlers = root_logger.handlers.copy()
        root_logger.handlers[:] = task_logger.handlers

    try:
        info_writer = StreamLogWriter(ti.log, logging.INFO)
        warning_writer = StreamLogWriter(ti.log, logging.WARNING)

        with redirect_stdout(info_writer), redirect_stderr(warning_writer):
            yield

    finally:
        if modify:
            # Restore the root logger to its original state.
            root_logger.setLevel(orig_level)
            root_logger.handlers[:] = orig_handlers
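
To see the pattern from Example #1 in isolation, here is a small self-contained sketch using only the standard library: swap the root logger's handlers and level for those of a target logger, redirect stdout into that logger, and restore everything afterwards. The names capture_to_logger, _LoggerWriter and the demo.task logger are invented for illustration and are not part of Airflow; Airflow's StreamLogWriter plays roughly the role that _LoggerWriter plays here.

import logging
import sys
from contextlib import contextmanager, redirect_stdout


class _LoggerWriter:
    """Minimal file-like object that forwards each written line to a logger."""

    def __init__(self, logger, level):
        self.logger = logger
        self.level = level

    def write(self, message):
        if message.strip():
            self.logger.log(self.level, message.rstrip())

    def flush(self):
        pass


@contextmanager
def capture_to_logger(target_name):
    """Route stdout and any custom loggers to `target_name` while the block runs."""
    target = logging.getLogger(target_name)
    root = logging.getLogger()

    # Swap the root logger's level and handlers for the target's, as Example #1 does.
    orig_level, orig_handlers = root.level, root.handlers.copy()
    root.setLevel(target.level)
    root.handlers[:] = target.handlers
    try:
        with redirect_stdout(_LoggerWriter(target, logging.INFO)):
            yield
    finally:
        # Restore the root logger so later code logs to the original handlers.
        root.setLevel(orig_level)
        root.handlers[:] = orig_handlers


if __name__ == "__main__":
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("[captured] %(levelname)s %(message)s"))
    demo = logging.getLogger("demo.task")
    demo.addHandler(handler)
    demo.setLevel(logging.INFO)
    demo.propagate = False  # mirror airflow.task, which does not propagate to root

    with capture_to_logger("demo.task"):
        print("stdout line")                                   # via redirect_stdout
        logging.getLogger("some.custom.logger").info("hello")  # via the swapped root handlers
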
Example #2
def task_run(args, dag=None):
    """Runs a single task instance"""
    if dag:
        args.dag_id = dag.dag_id

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT: we have to use the NullPool; otherwise each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        with db.create_session() as session:
            print(f'Loading pickle id {args.pickle}')
            dag_pickle = session.query(DagPickle).filter(
                DagPickle.id == args.pickle).first()
            if not dag_pickle:
                raise AirflowException("Who hid the pickle!? [missing pickle]")
            dag = dag_pickle.pickle

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.refresh_from_db()

    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()
    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run(args, dag, ti)
    else:
        with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
            _run(args, dag, ti)
    logging.shutdown()
Example #3
    def test_flush(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)

        msg = "test_message"

        log.write(msg)
        self.assertEqual(log._buffer, msg)

        log.flush()
        logger.log.assert_called_once_with(1, msg)

        self.assertEqual(log._buffer, "")
Example #4
def task_run(args, dag=None):
    """Runs a single task instance"""

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT: we have to use the NullPool; otherwise each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if dag and args.pickle:
        raise AirflowException(
            "You cannot use the --pickle option when using DAG.cli() method.")
    elif args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.refresh_from_db()

    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()
    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
            _run_task_by_selected_method(args, dag, ti)
    logging.shutdown()
Example #5
    def test_flush(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)

        msg = "test_message"

        log.write(msg)
        assert log._buffer == msg

        log.flush()
        logger.log.assert_called_once_with(1, msg)

        assert log._buffer == ""
Example #6
    def test_write(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)

        msg = "test_message"
        log.write(msg)

        self.assertEqual(log._buffer, msg)

        log.write(" \n")
        logger.log.assert_called_once_with(1, msg)

        self.assertEqual(log._buffer, "")
Example #7
    def _run_file_processor(
        result_channel: MultiprocessingConnection,
        parent_channel: MultiprocessingConnection,
        file_path: str,
        pickle_dags: bool,
        dag_ids: Optional[List[str]],
        thread_name: str,
        callback_requests: List[CallbackRequest],
    ) -> None:
        """
        Process the given file.

        :param result_channel: the connection to use for passing back the result
        :type result_channel: multiprocessing.Connection
        :param parent_channel: the parent end of the channel to close in the child
        :type parent_channel: multiprocessing.Connection
        :param file_path: the file to process
        :type file_path: str
        :param pickle_dags: whether to pickle the DAGs found in the file and
            save them to the DB
        :type pickle_dags: bool
        :param dag_ids: if specified, only examine DAG ID's that are
            in this list
        :type dag_ids: list[str]
        :param thread_name: the name to use for the process that is launched
        :type thread_name: str
        :param callback_requests: failure callbacks to execute
        :type callback_requests: List[airflow.utils.callback_requests.CallbackRequest]
        """
        # This helper runs in the newly created process
        log: logging.Logger = logging.getLogger("airflow.processor")

        # Since we share all open FDs from the parent, we need to close the parent side of the pipe here in
        # the child, else it won't get closed properly until we exit.
        log.info("Closing parent pipe")

        parent_channel.close()
        del parent_channel

        set_context(log, file_path)
        setproctitle(f"airflow scheduler - DagFileProcessor {file_path}")

        try:
            # redirect stdout/stderr to log
            with redirect_stdout(StreamLogWriter(log, logging.INFO)), \
                    redirect_stderr(StreamLogWriter(log, logging.WARN)), \
                    Stats.timer() as timer:
                # Re-configure the ORM engine as there are issues with multiple processes
                settings.configure_orm()

                # Change the thread name to differentiate log lines. This is
                # really a separate process, but changing the name of the
                # process doesn't work, so changing the thread name instead.
                threading.current_thread().name = thread_name

                log.info("Started process (PID=%s) to work on %s", os.getpid(),
                         file_path)
                dag_file_processor = DagFileProcessor(dag_ids=dag_ids, log=log)
                result: Tuple[int, int] = dag_file_processor.process_file(
                    file_path=file_path,
                    pickle_dags=pickle_dags,
                    callback_requests=callback_requests,
                )
                result_channel.send(result)
            log.info("Processing %s took %.3f seconds", file_path,
                     timer.duration)
        except Exception:  # pylint: disable=broad-except
            # Log exceptions through the logging framework.
            log.exception("Got an exception! Propagating...")
            raise
        finally:
            # We re-initialized the ORM within this Process above so we need to
            # tear it down manually here
            settings.dispose_orm()

            result_channel.close()
Example #8
def task_run(args, dag=None):
    """Runs a single task instance"""

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT: we have to use the NullPool; otherwise each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if dag and args.pickle:
        raise AirflowException("You cannot use the --pickle option when using DAG.cli() method.")
    elif args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()

    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        if settings.DONOT_MODIFY_HANDLERS:
            with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                    redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
                _run_task_by_selected_method(args, dag, ti)
        else:
            # Get all the Handlers from 'airflow.task' logger
            # Add these handlers to the root logger so that we can get logs from
            # any custom loggers defined in the DAG
            airflow_logger_handlers = logging.getLogger('airflow.task').handlers
            root_logger = logging.getLogger()
            root_logger_handlers = root_logger.handlers.copy()  # copy: the live list is mutated below

            # Remove all handlers from Root Logger to avoid duplicate logs
            for handler in root_logger_handlers:
                root_logger.removeHandler(handler)

            for handler in airflow_logger_handlers:
                root_logger.addHandler(handler)
            root_logger.setLevel(logging.getLogger('airflow.task').level)

            with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                    redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
                _run_task_by_selected_method(args, dag, ti)

            # Restore the original handlers, as the Celery worker process can call
            # this command multiple times; without resetting them, logs from the
            # next task would go to the wrong place.
            for handler in airflow_logger_handlers:
                root_logger.removeHandler(handler)
            for handler in root_logger_handlers:
                root_logger.addHandler(handler)

    logging.shutdown()
Example #9
    def test_encoding(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)
        assert log.encoding is None
Example #10
    def test_isatty(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)
        assert not log.isatty()
Example #11
    def test_iobase_compatibility(self):
        log = StreamLogWriter(None, 1)

        assert not log.closed
        # has no specific effect
        log.close()
    def test_isatty(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)
        self.assertFalse(log.isatty())
    def test_encoding(self):
        logger = mock.MagicMock()
        logger.log = mock.MagicMock()

        log = StreamLogWriter(logger, 1)
        self.assertFalse(log.encoding)
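
Taken together, the test methods above pin down the writer's contract: write() buffers until a line ending arrives, flush() emits the buffer via logger.log(level, ...), encoding is None, isatty() is False, and close() is a harmless no-op. The class below is not Airflow's implementation, only a minimal reconstruction consistent with those assertions; the name BufferedLogWriter and the demo at the bottom are invented here.

import logging


class BufferedLogWriter:
    """Minimal stand-in for a stream-to-logger writer, matching the tests above."""

    encoding = None  # the tests expect `log.encoding is None`

    def __init__(self, logger, level):
        self.logger = logger
        self.level = level
        self._buffer = ""

    def write(self, message):
        # Buffer until a line ending arrives, then emit the whole line.
        if not message.endswith("\n"):
            self._buffer += message
        else:
            self._buffer += message.rstrip()
            self.flush()

    def flush(self):
        if self._buffer:
            self.logger.log(self.level, self._buffer)
            self._buffer = ""

    def isatty(self):
        return False  # never a terminal, so callers skip ANSI colors etc.

    def close(self):
        pass  # IOBase compatibility: nothing to release

    @property
    def closed(self):
        return False  # always reported as open


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    writer = BufferedLogWriter(logging.getLogger("demo"), logging.INFO)
    writer.write("hello ")
    writer.write("world\n")  # the newline triggers flush -> "INFO hello world"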