Example #1
    def test_timeout_kills_job(self):
        def submit_side_effect(_1, _2, _3, _4):
            time.sleep(10)
        job_id = 1
        with patch(HOOK) as mock_hook:
            mock_hook = mock_hook()
            mock_hook.submit.side_effect = submit_side_effect
            mock_hook.create_job_template().build.return_value = {
                'job': {
                    'reference': {
                        'jobId': job_id
                    }
                }
            }

            task = DataProcJobBaseOperator(
                task_id=TASK_ID,
                region=GCP_REGION,
                execution_timeout=datetime.timedelta(seconds=1),
                dag=self.dag
            )
            task.create_job_template()

            with self.assertRaises(AirflowTaskTimeout):
                task.run(start_date=make_aware(DEFAULT_DATE), end_date=make_aware(DEFAULT_DATE))
            mock_hook.cancel.assert_called_once_with(mock.ANY, job_id, GCP_REGION)
Example #2
 def test_make_aware(self):
     self.assertEqual(
         timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30),
                             EAT),
         datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT))
     with self.assertRaises(ValueError):
         timezone.make_aware(
             datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT), EAT)
Example #3
    def setUp(self) -> None:
        self.dag_id = "test_logging_dag"
        self.task_id = "test_task"
        self.dag_path = os.path.join(ROOT_FOLDER, "dags",
                                     "test_logging_in_dag.py")
        reset(self.dag_id)
        self.execution_date = timezone.make_aware(datetime(2017, 1, 1))
        self.execution_date_str = self.execution_date.isoformat()
        self.task_args = [
            'tasks', 'run', self.dag_id, self.task_id, '--local',
            self.execution_date_str
        ]
        self.log_dir = conf.get('logging', 'base_log_folder')
        self.log_filename = f"{self.dag_id}/{self.task_id}/{self.execution_date_str}/1.log"
        self.ti_log_file_path = os.path.join(self.log_dir, self.log_filename)
        self.parser = cli_parser.get_parser()

        root = self.root_logger = logging.getLogger()
        self.root_handlers = root.handlers.copy()
        self.root_filters = root.filters.copy()
        self.root_level = root.level

        try:
            os.remove(self.ti_log_file_path)
        except OSError:
            pass
Example #4
 def populate_obj(self, item):
     # TODO: This is probably better done as a custom field type so we can
     # set TZ at parse time
     super().populate_obj(item)
     item.execution_date = timezone.make_aware(item.execution_date)
     if item.conf:
         item.conf = json.loads(item.conf)
Example #5
    def test_task_states_for_dag_run(self):

        dag2 = DagBag().dags['example_python_operator']

        task2 = dag2.get_task(task_id='print_the_context')
        default_date2 = timezone.make_aware(datetime(2016, 1, 9))
        ti2 = TaskInstance(task2, default_date2)

        ti2.set_state(State.SUCCESS)
        ti_start = ti2.start_date
        ti_end = ti2.end_date

        with redirect_stdout(io.StringIO()) as stdout:
            task_command.task_states_for_dag_run(
                self.parser.parse_args([
                    'tasks', 'states_for_dag_run', 'example_python_operator',
                    default_date2.isoformat()
                ]))
        actual_out = stdout.getvalue()

        formatted_rows = [
            ('example_python_operator', '2016-01-09 00:00:00+00:00',
             'print_the_context', 'success', ti_start, ti_end)
        ]

        expected = tabulate(
            formatted_rows,
            ['dag', 'exec_date', 'task', 'state', 'start_date', 'end_date'],
            tablefmt="fancy_grid")

        # Check that prints, and log messages, are shown
        self.assertEqual(expected.replace("\n", ""),
                         actual_out.replace("\n", ""))
Example #6
    def test_task_states_for_dag_run(self):

        dag2 = DagBag().dags['example_python_operator']
        task2 = dag2.get_task(task_id='print_the_context')
        default_date2 = timezone.make_aware(datetime(2016, 1, 9))
        dag2.clear()

        ti2 = TaskInstance(task2, default_date2)

        ti2.set_state(State.SUCCESS)
        ti_start = ti2.start_date
        ti_end = ti2.end_date

        with redirect_stdout(io.StringIO()) as stdout:
            task_command.task_states_for_dag_run(
                self.parser.parse_args([
                    'tasks',
                    'states-for-dag-run',
                    'example_python_operator',
                    default_date2.isoformat(),
                    '--output',
                    "json",
                ]))
        actual_out = json.loads(stdout.getvalue())

        assert len(actual_out) == 1
        assert actual_out[0] == {
            'dag_id': 'example_python_operator',
            'execution_date': '2016-01-09T00:00:00+00:00',
            'task_id': 'print_the_context',
            'state': 'success',
            'start_date': ti_start.isoformat(),
            'end_date': ti_end.isoformat(),
        }
Example #7
    def __init__(self,
                 dag_directory: str,
                 max_runs: int,
                 processor_factory: Callable[[str, List[Any]], AbstractDagFileProcessorProcess],
                 processor_timeout: timedelta,
                 signal_conn: Connection,
                 async_mode: bool = True):
        self._file_paths: List[str] = []
        self._file_path_queue: List[str] = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._async_mode = async_mode
        self._parsing_start_time: Optional[datetime] = None

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
            self.log.warning(
                "Cannot use more than 1 thread (max_threads = %d) "
                "when using sqlite; setting parallelism to 1.", self._parallelism
            )
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # How many seconds we wait for tasks to heartbeat before marking them as zombies.
        self._zombie_threshold_secs = (
            conf.getint('scheduler', 'scheduler_zombie_task_threshold'))
        # Map from file path to the processor
        self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

        self._num_run = 0

        # Map from file path to stats about the file
        self._file_stats: Dict[str, DagFileStat] = {}

        self._last_zombie_query_time = None
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        self._zombies: List[SimpleTaskInstance] = []
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')
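A pattern worth noting in this constructor: the "last time X happened" fields are seeded with an aware epoch (timezone.make_aware(datetime.fromtimestamp(0))) so the very first interval check always fires. A stdlib-only sketch of the same idea, with an illustrative interval:

from datetime import datetime, timedelta, timezone as dt_tz

last_refresh = datetime.fromtimestamp(0, tz=dt_tz.utc)  # aware epoch sentinel
if datetime.now(dt_tz.utc) - last_refresh > timedelta(minutes=5):
    last_refresh = datetime.now(dt_tz.utc)  # fires immediately on the first pass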
Example #8
0
 def _get_prev(self, current: DateTime) -> DateTime:
     """Get the first schedule before specified time, with DST fixed."""
     naive = make_naive(current, self._timezone)
     cron = croniter(self._expression, start_time=naive)
     scheduled = cron.get_prev(datetime.datetime)
     if not self._should_fix_dst:
         return convert_to_utc(make_aware(scheduled, self._timezone))
     delta = naive - scheduled
     return convert_to_utc(current.in_timezone(self._timezone) - delta)
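The make_naive -> croniter -> make_aware round-trip above can be exercised without Airflow. A rough sketch with stdlib zoneinfo and croniter (the timezone and cron expression are placeholders, and replace(tzinfo=...) is only an approximation of Airflow's make_aware, which also handles DST folds):

import datetime
from zoneinfo import ZoneInfo
from croniter import croniter

tz = ZoneInfo("Europe/Paris")  # illustrative timezone
current = datetime.datetime(2021, 3, 29, 3, 0, tzinfo=tz)
naive = current.replace(tzinfo=None)                        # ~ make_naive(current, tz)
scheduled = croniter("0 2 * * *", start_time=naive).get_prev(datetime.datetime)
aware = scheduled.replace(tzinfo=tz)                        # ~ make_aware(scheduled, tz)
print(aware.astimezone(datetime.timezone.utc))              # ~ convert_to_utc(...)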
Example #9
def pull_xcom_call(**kwargs):
    # XCom.get_many returns every value written before this date, using the
    # XCom class directly (without the task context object)
    xcom_values_from_class = XCom.get_many(
        execution_date=make_aware(datetime(2020, 4, 27, 14, 50, 00, 00)),
        dag_ids=["write_to_xcom"],
        include_prior_dates=True)
    print('XCom.get_many')
    print(xcom_values_from_class)

    get_xcom_with_ti = kwargs['ti'].xcom_pull(dag_id="write_to_xcom",
                                              include_prior_dates=True)
    print('ti.xcom_pull with include_prior_dates')
    print(get_xcom_with_ti)
Example #10
 def test_cli_list_dag_runs(self):
     dag_command.dag_trigger(self.parser.parse_args([
         'dags', 'trigger', 'example_bash_operator', ]))
     args = self.parser.parse_args(['dags',
                                    'list_runs',
                                    '--dag-id',
                                    'example_bash_operator',
                                    '--no-backfill',
                                    '--start-date',
                                    DEFAULT_DATE.isoformat(),
                                    '--end-date',
                                    timezone.make_aware(datetime.max).isoformat()])
     dag_command.dag_list_dag_runs(args)
Example #11
 def setUp(self) -> None:
     self.dag_id = "test_logging_dag"
     self.task_id = "test_task"
     reset(self.dag_id)
     self.execution_date_str = timezone.make_aware(datetime(2017, 1, 1)).isoformat()
     self.log_dir = conf.get('logging', 'base_log_folder')
     self.log_filename = f"{self.dag_id}/{self.task_id}/{self.execution_date_str}/1.log"
     self.ti_log_file_path = os.path.join(self.log_dir, self.log_filename)
     self.parser = cli_parser.get_parser()
     try:
         os.remove(self.ti_log_file_path)
     except OSError:
         pass
Example #12
 def __init__(self, *, target_time: Union[str, datetime.datetime],
              **kwargs) -> None:
     super().__init__(**kwargs)
     if isinstance(target_time, datetime.datetime):
         if timezone.is_naive(target_time):
             target_time = timezone.make_aware(target_time)
         self.target_time = target_time
     elif isinstance(target_time, str):
         self.target_time = timezone.parse(target_time)
     else:
         raise TypeError(
             f"Expected str or datetime.datetime type for target_time. Got {type(target_time)}"
         )
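What the constructor above guarantees, shown with a couple of hedged calls (the timezone attached by make_aware when none is given is whatever the installation is configured with, UTC by default):

import datetime
from airflow.utils import timezone

naive = datetime.datetime(2021, 1, 1, 12, 0)
print(timezone.make_aware(naive))                    # gains the configured default tz
print(timezone.parse("2021-01-01T12:00:00+02:00"))   # str inputs go through timezone.parse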
Example #13
    def test_kill_timed_out_processors_kill(self, mock_kill, mock_pid):
        mock_pid.return_value = 1234
        manager = DagFileProcessorManager(
            dag_directory='directory',
            max_runs=1,
            processor_factory=MagicMock().return_value,
            processor_timeout=timedelta(seconds=5),
            signal_conn=MagicMock(),
            async_mode=True)

        processor = DagFileProcessorProcess('abc.txt', False, [], [])
        processor._start_time = timezone.make_aware(datetime.min)
        manager._processors = {'abc.txt': processor}
        manager._kill_timed_out_processors()
        mock_kill.assert_called_once_with()
Example #14
    def test_kill_timed_out_processors_no_kill(self, mock_dag_file_processor,
                                               mock_pid):
        mock_pid.return_value = 1234
        manager = DagFileProcessorManager(
            dag_directory='directory',
            file_paths=['abc.txt'],
            max_runs=1,
            processor_factory=MagicMock().return_value,
            processor_timeout=timedelta(seconds=5),
            signal_conn=MagicMock(),
            dag_ids=[],
            pickle_dags=False,
            async_mode=True)

        processor = DagFileProcessor('abc.txt', False, [], [])
        processor._start_time = timezone.make_aware(datetime.max)
        manager._processors = {'abc.txt': processor}
        manager._kill_timed_out_processors()
        mock_dag_file_processor.kill.assert_not_called()
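Why these two tests pick datetime.min and datetime.max: an aware datetime.min makes the processor look like it started arbitrarily long ago (always beyond the 5-second timeout), while datetime.max makes it look like it has not started yet. The same check, sketched with the stdlib (the timeout value mirrors the tests):

from datetime import datetime, timedelta, timezone as dt_tz

start_time = datetime.min.replace(tzinfo=dt_tz.utc)   # "started long ago"
timed_out = datetime.now(dt_tz.utc) - start_time > timedelta(seconds=5)
assert timed_out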
Example #15
    def __init__(self,
                 dag_directory: str,
                 max_runs: int,
                 processor_factory: Callable[
                     [str, List[FailureCallbackRequest]],
                     AbstractDagFileProcessorProcess
                 ],
                 processor_timeout: timedelta,
                 signal_conn: MultiprocessingConnection,
                 async_mode: bool = True):
        super().__init__()
        self._file_paths: List[str] = []
        self._file_path_queue: List[str] = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._async_mode = async_mode
        self._parsing_start_time: Optional[datetime] = None

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
            self.log.warning(
                "Cannot use more than 1 thread (max_threads = %d) "
                "when using sqlite; setting parallelism to 1.", self._parallelism
            )
            self._parallelism = 1

        # How often to mark DAGs as inactive and delete their serializations
        # if they haven't been processed recently.
        self._dag_cleanup_interval = conf.getint('scheduler', 'dag_cleanup_interval')
        self._min_serialized_dag_update_interval = conf.getint('core',
                                                               'min_serialized_dag_update_interval')
        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # How many seconds we wait for tasks to heartbeat before marking them as zombies.
        self._zombie_threshold_secs = (
            conf.getint('scheduler', 'scheduler_zombie_task_threshold'))

        # Should store dag file source in a database?
        self.store_dag_code = conf.getboolean('core', 'store_dag_code', fallback=False)
        # Map from file path to the processor
        self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

        self._num_run = 0

        # Map from file path to stats about the file
        self._file_stats: Dict[str, DagFileStat] = {}

        self._last_zombie_query_time = None
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # Last time we ran DAG cleanup
        self.last_dag_cleanup_time = timezone.utcnow()
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

        # Mapping file name and callbacks requests
        self._callback_to_execute: Dict[str, List[FailureCallbackRequest]] = defaultdict(list)

        self._log = logging.getLogger('airflow.processor_manager')
Example #16
from argparse import Namespace
from datetime import datetime
from airflow import settings
import airflow.bin.cli as cli
from airflow.bin.cli import get_num_ready_workers_running, run, get_dag
from airflow.models import TaskInstance
from airflow.utils import timezone
from airflow.utils.state import State
from airflow.settings import Session
from airflow import models
from tests.compat import mock

import os

dag_folder_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])

DEFAULT_DATE = timezone.make_aware(datetime(2015, 1, 1))
TEST_DAG_FOLDER = os.path.join(
    os.path.dirname(dag_folder_path), 'dags')
TEST_DAG_ID = 'unit_tests'


def reset(dag_id):
    session = Session()
    tis = session.query(models.TaskInstance).filter_by(dag_id=dag_id)
    tis.delete()
    session.commit()
    session.close()


def create_mock_args(
    task_id,
Example #17
def date_range(
    start_date: datetime,
    end_date: Optional[datetime] = None,
    num: Optional[int] = None,
    delta: Optional[Union[str, timedelta, relativedelta]] = None,
) -> List[datetime]:
    """
    Get a list of dates based on a start date, an end date, and a delta. The
    delta can be anything that can be added to `datetime.datetime`, or a cron
    expression as a `str`.

    .. code-block:: pycon

        >>> from airflow.utils.dates import date_range
        >>> from datetime import datetime, timedelta
        >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
        [datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 2, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 3, 0, 0, tzinfo=Timezone('UTC'))]
        >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta="0 0 * * *")
        [datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 2, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 3, 0, 0, tzinfo=Timezone('UTC'))]
        >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
        [datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 2, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 3, 1, 0, 0, tzinfo=Timezone('UTC'))]

    :param start_date: anchor date to start the series from
    :param end_date: right boundary for the date range
    :param num: as an alternative to end_date, you can specify the number of
        entries you want in the range. This number can be negative; the
        output will always be sorted regardless
    :param delta: step length. It can be datetime.timedelta or cron expression as string
    """
    warnings.warn(
        "`airflow.utils.dates.date_range()` is deprecated. Please use `airflow.timetables`.",
        category=DeprecationWarning,
        stacklevel=2,
    )

    if not delta:
        return []
    if end_date:
        if start_date > end_date:
            raise Exception("Wait. start_date needs to be before end_date")
        if num:
            raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    time_zone = start_date.tzinfo

    abs_delta: Union[timedelta, relativedelta]
    if isinstance(delta, str):
        delta_iscron = True
        if timezone.is_localized(start_date):
            start_date = timezone.make_naive(start_date, time_zone)
        cron = croniter(cron_presets.get(delta, delta), start_date)
    elif isinstance(delta, timedelta):
        abs_delta = abs(delta)
    elif isinstance(delta, relativedelta):
        abs_delta = abs(delta)
    else:
        raise Exception(
            "Wait. delta must be either datetime.timedelta or cron expression as str"
        )

    dates = []
    if end_date:
        if timezone.is_naive(start_date) and not timezone.is_naive(end_date):
            end_date = timezone.make_naive(end_date, time_zone)
        while start_date <= end_date:  # type: ignore
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += abs_delta
    else:
        num_entries: int = num  # type: ignore
        for _ in range(abs(num_entries)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron and num_entries > 0:
                start_date = cron.get_next(datetime)
            elif delta_iscron:
                start_date = cron.get_prev(datetime)
            elif num_entries > 0:
                start_date += abs_delta
            else:
                start_date -= abs_delta

    return sorted(dates)
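The deprecation warning points at airflow.timetables as the replacement. A hedged sketch of what that looks like on a DAG (class path as of Airflow 2.2; check your version):

import pendulum
from airflow import DAG
from airflow.timetables.interval import CronDataIntervalTimetable

with DAG(
    dag_id="example_timetable",  # hypothetical DAG id
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    timetable=CronDataIntervalTimetable("0 0 * * *", timezone="UTC"),
) as dag:
    pass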
Example #18
def parse_datetime_f(value):
    if not isinstance(value, dt.datetime):
        return value

    return timezone.make_aware(value)
Example #19
    def __init__(
        self,
        dag_directory: str,
        max_runs: int,
        processor_factory: Callable[[str, List[CallbackRequest]], AbstractDagFileProcessorProcess],
        processor_timeout: timedelta,
        signal_conn: MultiprocessingConnection,
        dag_ids: Optional[List[str]],
        pickle_dags: bool,
        async_mode: bool = True,
    ):
        super().__init__()
        self._file_paths: List[str] = []
        self._file_path_queue: List[str] = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._pickle_dags = pickle_dags
        self._dag_ids = dag_ids
        self._async_mode = async_mode
        self._parsing_start_time: Optional[int] = None

        self._parallelism = conf.getint('scheduler', 'parsing_processes')
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
            self.log.warning(
                "Cannot use more than 1 thread (parsing_processes = %d) "
                "when using sqlite; setting parallelism to 1.",
                self._parallelism,
            )
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler', 'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler', 'print_stats_interval')
        # How many seconds we wait for tasks to heartbeat before marking them as zombies.
        self._zombie_threshold_secs = conf.getint('scheduler', 'scheduler_zombie_task_threshold')

        # Should store dag file source in a database?
        self.store_dag_code = STORE_DAG_CODE
        # Map from file path to the processor
        self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

        self._num_run = 0

        # Map from file path to stats about the file
        self._file_stats: Dict[str, DagFileStat] = {}

        self._last_zombie_query_time = None
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
        # Last time stats were printed
        self.last_stat_print_time = 0
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

        # Mapping file name and callbacks requests
        self._callback_to_execute: Dict[str, List[CallbackRequest]] = defaultdict(list)

        self._log = logging.getLogger('airflow.processor_manager')

        self.waitables: Dict[Any, Union[MultiprocessingConnection, AbstractDagFileProcessorProcess]] = {
            self._signal_conn: self._signal_conn,
        }
Example #20
    def __init__(
        self,
        dag_directory: Union[str, "pathlib.Path"],
        max_runs: int,
        processor_timeout: timedelta,
        dag_ids: Optional[List[str]],
        pickle_dags: bool,
        signal_conn: Optional[MultiprocessingConnection] = None,
        async_mode: bool = True,
    ):
        super().__init__()
        self._file_paths: List[str] = []
        self._file_path_queue: List[str] = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        # signal_conn is None for dag_processor_standalone mode.
        self._direct_scheduler_conn = signal_conn
        self._pickle_dags = pickle_dags
        self._dag_ids = dag_ids
        self._async_mode = async_mode
        self._parsing_start_time: Optional[int] = None

        # Set the signal conn to non-blocking mode, so that attempting to
        # send when the buffer is full raises an error rather than hanging
        # forever (this is to avoid deadlocks!)
        #
        # Don't do this in sync_mode, as we _need_ the DagParsingStat sent to
        # continue the scheduler
        if self._async_mode and self._direct_scheduler_conn is not None:
            os.set_blocking(self._direct_scheduler_conn.fileno(), False)

        self._parallelism = conf.getint('scheduler', 'parsing_processes')
        if (conf.get_mandatory_value('database',
                                     'sql_alchemy_conn').startswith('sqlite')
                and self._parallelism > 1):
            self.log.warning(
                "Cannot use more than 1 thread (parsing_processes = %d) "
                "when using sqlite; setting parallelism to 1.",
                self._parallelism,
            )
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')

        # Map from file path to the processor
        self._processors: Dict[str, DagFileProcessorProcess] = {}

        self._num_run = 0

        # Map from file path to stats about the file
        self._file_stats: Dict[str, DagFileStat] = {}

        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.make_aware(
            datetime.fromtimestamp(0))
        # Last time stats were printed
        self.last_stat_print_time = 0
        # Last time we cleaned up DAGs which are no longer in files
        self.last_deactivate_stale_dags_time = timezone.make_aware(
            datetime.fromtimestamp(0))
        # How often to check for DAGs which are no longer in files
        self.deactivate_stale_dags_interval = conf.getint(
            'scheduler', 'deactivate_stale_dags_interval')
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout
        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        # Mapping file name and callbacks requests
        self._callback_to_execute: Dict[
            str, List[CallbackRequest]] = defaultdict(list)

        self._log = logging.getLogger('airflow.processor_manager')

        self.waitables: Dict[Any, Union[MultiprocessingConnection, DagFileProcessorProcess]] = (
            {self._direct_scheduler_conn: self._direct_scheduler_conn}
            if self._direct_scheduler_conn is not None
            else {}
        )
Example #21
 def test_make_aware(self):
     self.assertEqual(
         timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30), EAT),
         datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT))
     with self.assertRaises(ValueError):
         timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT), EAT)
Example #22
 def clean(self, value):
     dt = super(UtcDateTimeFilterMixin, self).clean(value)
     return timezone.make_aware(dt, timezone=timezone.utc)
Example #23
def round_time(dt, delta, start_date=timezone.make_aware(datetime.min)):
    """
    Returns the datetime of the form start_date + i * delta
    which is closest to dt for any non-negative integer i.
    Note that delta may be a datetime.timedelta or a dateutil.relativedelta
    >>> round_time(datetime(2015, 1, 1, 6), timedelta(days=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 1, 2), relativedelta(months=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 9, 16, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 16, 0, 0)
    >>> round_time(datetime(2015, 9, 15, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 15, 0, 0)
    >>> round_time(datetime(2015, 9, 14, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    >>> round_time(datetime(2015, 9, 13, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    """

    if isinstance(delta, six.string_types):
        # It's cron based, so it's easy
        tz = start_date.tzinfo
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
        prev = cron.get_prev(datetime)
        if prev == start_date:
            return timezone.make_aware(start_date, tz)
        else:
            return timezone.make_aware(prev, tz)

    # Ignore the microseconds of dt
    dt -= timedelta(microseconds=dt.microsecond)

    # We are looking for a datetime in the form start_date + i * delta
    # which is as close as possible to dt. Since delta could be a relative
    # delta we don't know its exact length in seconds so we cannot rely on
    # division to find i. Instead we employ a binary search algorithm, first
    # finding an upper and lower limit and then bisecting the interval until
    # we have found the closest match.

    # We first search an upper limit for i for which start_date + upper * delta
    # exceeds dt.
    upper = 1
    while start_date + upper * delta < dt:
        # To speed up finding an upper limit we grow this exponentially by a
        # factor of 2
        upper *= 2

    # Since upper is the first value for which start_date + upper * delta
    # exceeds dt, upper // 2 is below dt and therefore forms a lower limit
    # for the i we are looking for
    lower = upper // 2

    # We now continue to bisect the interval between
    # start_date + lower * delta and start_date + upper * delta
    # until we find the closest value
    while True:
        # Invariant: start + lower * delta < dt <= start + upper * delta
        # If start_date + (lower + 1)*delta exceeds dt, then either lower or
        # lower+1 has to be the solution we are searching for
        if start_date + (lower + 1) * delta >= dt:
            # Check if start_date + (lower + 1)*delta or
            # start_date + lower*delta is closer to dt and return the solution
            if (
                    (start_date + (lower + 1) * delta) - dt <=
                    dt - (start_date + lower * delta)):
                return start_date + (lower + 1) * delta
            else:
                return start_date + lower * delta

        # We bisect the interval and either replace the lower or upper
        # limit with the candidate
        candidate = lower + (upper - lower) // 2
        if start_date + candidate * delta >= dt:
            upper = candidate
        else:
            lower = candidate
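The exponential-growth-plus-bisection search in round_time, reduced to a stand-alone sketch with a plain timedelta (the same search is what makes relativedelta steps work, since those have no fixed length in seconds):

from datetime import datetime, timedelta

anchor = datetime(2015, 9, 14)
step = timedelta(days=1)
target = datetime(2015, 9, 16, 7)

upper = 1
while anchor + upper * step < target:   # grow exponentially to bracket target
    upper *= 2
lower = upper // 2
while upper - lower > 1:                # bisect the bracket
    mid = (lower + upper) // 2
    if anchor + mid * step >= target:
        upper = mid
    else:
        lower = mid
# Pick whichever neighbour is closer to the target.
best = min((lower, upper), key=lambda i: abs(anchor + i * step - target))
print(anchor + best * step)             # 2015-09-16 00:00:00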
Example #24
def date_range(start_date, end_date=None, num=None, delta=None):  # pylint: disable=too-many-branches
    """
    Get a list of dates based on a start date, an end date, and a delta. The
    delta can be anything that can be added to `datetime.datetime`, or a cron
    expression as a `str`

    .. code-block:: python

        date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
            [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0),
            datetime.datetime(2016, 1, 3, 0, 0)]
        date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
            [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0),
            datetime.datetime(2016, 1, 3, 0, 0)]
        date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
            [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)]

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: as an alternative to end_date, you can specify the number of
        entries you want in the range. This number can be negative; the
        output will always be sorted regardless
    :type num: int
    :param delta: step length. It can be datetime.timedelta or cron expression as string
    :type delta: datetime.timedelta or str
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()
    if delta in cron_presets:
        delta = cron_presets.get(delta)

    delta_iscron = False
    time_zone = start_date.tzinfo

    if isinstance(delta, str):
        delta_iscron = True
        if timezone.is_localized(start_date):
            start_date = timezone.make_naive(start_date, time_zone)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)
    else:
        raise Exception(
            "Wait. delta must be either datetime.timedelta or cron expression as str"
        )

    dates = []
    if end_date:
        if timezone.is_naive(start_date) and not timezone.is_naive(end_date):
            end_date = timezone.make_naive(end_date, time_zone)
        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron and num > 0:
                start_date = cron.get_next(datetime)
            elif delta_iscron:
                start_date = cron.get_prev(datetime)
            elif num > 0:
                start_date += delta
            else:
                start_date -= delta

    return sorted(dates)
Example #25
def round_time(dt, delta, start_date=timezone.make_aware(datetime.min)):
    """
    Returns the datetime of the form start_date + i * delta
    which is closest to dt for any non-negative integer i.

    Note that delta may be a datetime.timedelta or a dateutil.relativedelta

    >>> round_time(datetime(2015, 1, 1, 6), timedelta(days=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 1, 2), relativedelta(months=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 9, 16, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 16, 0, 0)
    >>> round_time(datetime(2015, 9, 15, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 15, 0, 0)
    >>> round_time(datetime(2015, 9, 14, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    >>> round_time(datetime(2015, 9, 13, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    """

    if isinstance(delta, six.string_types):
        # It's cron based, so it's easy
        tz = start_date.tzinfo
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
        prev = cron.get_prev(datetime)
        if prev == start_date:
            return timezone.make_aware(start_date, tz)
        else:
            return timezone.make_aware(prev, tz)

    # Ignore the microseconds of dt
    dt -= timedelta(microseconds=dt.microsecond)

    # We are looking for a datetime in the form start_date + i * delta
    # which is as close as possible to dt. Since delta could be a relative
    # delta we don't know its exact length in seconds so we cannot rely on
    # division to find i. Instead we employ a binary search algorithm, first
    # finding an upper and lower limit and then bisecting the interval until
    # we have found the closest match.

    # We first search an upper limit for i for which start_date + upper * delta
    # exceeds dt.
    upper = 1
    while start_date + upper * delta < dt:
        # To speed up finding an upper limit we grow this exponentially by a
        # factor of 2
        upper *= 2

    # Since upper is the first value for which start_date + upper * delta
    # exceeds dt, upper // 2 is below dt and therefore forms a lower limit
    # for the i we are looking for
    lower = upper // 2

    # We now continue to bisect the interval between
    # start_date + lower * delta and start_date + upper * delta
    # until we find the closest value
    while True:
        # Invariant: start + lower * delta < dt <= start + upper * delta
        # If start_date + (lower + 1)*delta exceeds dt, then either lower or
        # lower+1 has to be the solution we are searching for
        if start_date + (lower + 1) * delta >= dt:
            # Check if start_date + (lower + 1)*delta or
            # start_date + lower*delta is closer to dt and return the solution
            if ((start_date + (lower + 1) * delta) - dt <= dt -
                (start_date + lower * delta)):
                return start_date + (lower + 1) * delta
            else:
                return start_date + lower * delta

        # We bisect the interval and either replace the lower or upper
        # limit with the candidate
        candidate = lower + (upper - lower) // 2
        if start_date + candidate * delta >= dt:
            upper = candidate
        else:
            lower = candidate
Example #26
 def clean(self, value):
     dt = super(UtcDateTimeFilterMixin, self).clean(value)
     if isinstance(dt, list):
         return [timezone.make_aware(d, timezone=timezone.utc) for d in dt]
     return timezone.make_aware(dt, timezone=timezone.utc)
Example #27
def date_range(start_date, end_date=None, num=None, delta=None):
    """
    Get a list of dates based on a start date, an end date, and a delta. The
    delta can be anything that can be added to ``datetime.datetime``
    or a cron expression as a ``str``

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: as an alternative to end_date, you can specify the number of
        entries you want in the range. This number can be negative; the
        output will always be sorted regardless
    :type num: int

    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)]
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    # Capture the timezone up front: the timedelta branch below also needs it
    # when re-localizing naive dates (otherwise tz would be unbound there).
    tz = start_date.tzinfo
    if isinstance(delta, six.string_types):
        delta_iscron = True
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)
    dates = []
    if end_date:
        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                if num > 0:
                    start_date = cron.get_next(datetime)
                else:
                    start_date = cron.get_prev(datetime)
            else:
                if num > 0:
                    start_date += delta
                else:
                    start_date -= delta
    return sorted(dates)
Example #28
 def populate_obj(self, item):
     # TODO: This is probably better done as a custom field type so we can
     # set TZ at parse time
     super(DagRunForm, self).populate_obj(item)
     item.execution_date = timezone.make_aware(item.execution_date)
Example #29
    def prepare_file_path_queue(self):
        """Generate more file paths to process. Result are saved in _file_path_queue."""
        self._parsing_start_time = time.perf_counter()
        # If the file path is already being processed, or if a file was
        # processed recently, wait until the next batch
        file_paths_in_progress = self._processors.keys()
        now = timezone.utcnow()

        # Sort the file paths by the parsing order mode
        list_mode = conf.get("scheduler", "file_parsing_sort_mode")

        files_with_mtime = {}
        file_paths = []
        is_mtime_mode = list_mode == "modified_time"

        file_paths_recently_processed = []
        for file_path in self._file_paths:

            if is_mtime_mode:
                try:
                    files_with_mtime[file_path] = os.path.getmtime(file_path)
                except FileNotFoundError:
                    self.log.warning("Skipping processing of missing file: %s",
                                     file_path)
                    continue
                file_modified_time = timezone.make_aware(
                    datetime.fromtimestamp(files_with_mtime[file_path]))
            else:
                file_paths.append(file_path)
                file_modified_time = None

            # Find file paths that were recently processed to exclude them
            # from being added to file_path_queue
            # unless they were modified recently and parsing mode is "modified_time"
            # in which case we don't honor "self._file_process_interval" (min_file_process_interval)
            last_finish_time = self.get_last_finish_time(file_path)
            if (last_finish_time is not None
                    and (now - last_finish_time).total_seconds() <
                    self._file_process_interval
                    and not (is_mtime_mode and file_modified_time and
                             (file_modified_time > last_finish_time))):
                file_paths_recently_processed.append(file_path)

        # Sort file paths via last modified time
        if is_mtime_mode:
            file_paths = sorted(files_with_mtime,
                                key=files_with_mtime.get,
                                reverse=True)
        elif list_mode == "alphabetical":
            file_paths = sorted(file_paths)
        elif list_mode == "random_seeded_by_host":
            # Shuffle the list seeded by hostname so multiple schedulers can work on different
            # set of files. Since we set the seed, the sort order will remain same per host
            random.Random(get_hostname()).shuffle(file_paths)

        files_paths_at_run_limit = [
            file_path for file_path, stat in self._file_stats.items()
            if stat.run_count == self._max_runs
        ]

        file_paths_to_exclude = set(file_paths_in_progress).union(
            file_paths_recently_processed, files_paths_at_run_limit)

        # Do not convert the following list to set as set does not preserve the order
        # and we need to maintain the order of file_paths for `[scheduler] file_parsing_sort_mode`
        files_paths_to_queue = [
            file_path for file_path in file_paths
            if file_path not in file_paths_to_exclude
        ]

        for file_path, processor in self._processors.items():
            self.log.debug(
                "File path %s is still being processed (started: %s)",
                processor.file_path,
                processor.start_time.isoformat(),
            )

        self.log.debug("Queuing the following files for processing:\n\t%s",
                       "\n\t".join(files_paths_to_queue))

        for file_path in files_paths_to_queue:
            if file_path not in self._file_stats:
                self._file_stats[file_path] = DagFileStat(
                    num_dags=0,
                    import_errors=0,
                    last_finish_time=None,
                    last_duration=None,
                    run_count=0)

        self._file_path_queue.extend(files_paths_to_queue)
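The "modified_time" ordering above boils down to a dict sort on mtimes, with make_aware turning the raw timestamps into aware datetimes. A stdlib-only sketch (the glob pattern is hypothetical):

import glob
import os
from datetime import datetime, timezone as dt_tz

files_with_mtime = {}
for path in glob.glob("dags/*.py"):  # hypothetical DAG folder
    try:
        files_with_mtime[path] = os.path.getmtime(path)
    except FileNotFoundError:
        continue  # file vanished between listing and stat
ordered = sorted(files_with_mtime, key=files_with_mtime.get, reverse=True)
mtimes = [datetime.fromtimestamp(files_with_mtime[p], tz=dt_tz.utc) for p in ordered]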
Example #30
def date_range(
        start_date,
        end_date=None,
        num=None,
        delta=None):
    """
    Get a list of dates based on a start date, an end date, and a delta. The
    delta can be anything that can be added to ``datetime.datetime``
    or a cron expression as a ``str``

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: as an alternative to end_date, you can specify the number of
        entries you want in the range. This number can be negative; the
        output will always be sorted regardless
    :type num: int
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0),
     datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0),
     datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0),
     datetime.datetime(2016, 3, 1, 0, 0)]
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    tz = start_date.tzinfo
    if isinstance(delta, six.string_types):
        delta_iscron = True
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)
    dates = []
    if end_date:
        if timezone.is_naive(start_date) and not timezone.is_naive(end_date):
            end_date = timezone.make_naive(end_date, tz)
        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                if num > 0:
                    start_date = cron.get_next(datetime)
                else:
                    start_date = cron.get_prev(datetime)
            else:
                if num > 0:
                    start_date += delta
                else:
                    start_date -= delta
    return sorted(dates)
Example #31
import os
import unittest
from datetime import datetime

from airflow import settings
from airflow.cli import cli_parser
from airflow.cli.commands import dag_command
from airflow.exceptions import AirflowException
from airflow.models import DagBag, DagModel, DagRun
from airflow.utils import timezone
from airflow.utils.session import create_session
from airflow.utils.state import State
from airflow.utils.types import DagRunType
from tests.test_utils.config import conf_vars
from tests.test_utils.db import clear_db_dags, clear_db_runs

dag_folder_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])

DEFAULT_DATE = timezone.make_aware(datetime(2015, 1, 1))
TEST_DAG_FOLDER = os.path.join(os.path.dirname(dag_folder_path), 'dags')
TEST_DAG_ID = 'unit_tests'

EXAMPLE_DAGS_FOLDER = os.path.join(
    os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))),
    "airflow/example_dags")


class TestCliDags(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.dagbag = DagBag(include_examples=True)
        cls.dagbag.sync_to_db()
        cls.parser = cli_parser.get_parser()
Example #32
 def test_make_aware(self):
     assert timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30), EAT) == datetime.datetime(
         2011, 9, 1, 13, 20, 30, tzinfo=EAT
     )
     with pytest.raises(ValueError):
         timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT), EAT)