def test_timeout_kills_job(self):
    def submit_side_effect(_1, _2, _3, _4):
        time.sleep(10)

    job_id = 1
    with patch(HOOK) as mock_hook:
        mock_hook = mock_hook()
        mock_hook.submit.side_effect = submit_side_effect
        mock_hook.create_job_template().build.return_value = {
            'job': {'reference': {'jobId': job_id}}
        }

        task = DataProcJobBaseOperator(
            task_id=TASK_ID,
            region=GCP_REGION,
            execution_timeout=datetime.timedelta(seconds=1),
            dag=self.dag,
        )
        task.create_job_template()

        with self.assertRaises(AirflowTaskTimeout):
            task.run(start_date=make_aware(DEFAULT_DATE), end_date=make_aware(DEFAULT_DATE))
        mock_hook.cancel.assert_called_once_with(mock.ANY, job_id, GCP_REGION)
def test_make_aware(self):
    self.assertEqual(
        timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30), EAT),
        datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT))
    with self.assertRaises(ValueError):
        timezone.make_aware(
            datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT), EAT)
def setUp(self) -> None:
    self.dag_id = "test_logging_dag"
    self.task_id = "test_task"
    self.dag_path = os.path.join(ROOT_FOLDER, "dags", "test_logging_in_dag.py")
    reset(self.dag_id)
    self.execution_date = timezone.make_aware(datetime(2017, 1, 1))
    self.execution_date_str = self.execution_date.isoformat()
    self.task_args = ['tasks', 'run', self.dag_id, self.task_id, '--local', self.execution_date_str]
    self.log_dir = conf.get('logging', 'base_log_folder')
    self.log_filename = f"{self.dag_id}/{self.task_id}/{self.execution_date_str}/1.log"
    self.ti_log_file_path = os.path.join(self.log_dir, self.log_filename)
    self.parser = cli_parser.get_parser()

    root = self.root_logger = logging.getLogger()
    self.root_handlers = root.handlers.copy()
    self.root_filters = root.filters.copy()
    self.root_level = root.level

    try:
        os.remove(self.ti_log_file_path)
    except OSError:
        pass
def populate_obj(self, item):
    # TODO: This is probably better done as a custom field type so we can
    # set TZ at parse time
    super().populate_obj(item)
    item.execution_date = timezone.make_aware(item.execution_date)
    if item.conf:
        item.conf = json.loads(item.conf)
def test_task_states_for_dag_run(self):
    dag2 = DagBag().dags['example_python_operator']
    task2 = dag2.get_task(task_id='print_the_context')
    default_date2 = timezone.make_aware(datetime(2016, 1, 9))
    ti2 = TaskInstance(task2, default_date2)

    ti2.set_state(State.SUCCESS)
    ti_start = ti2.start_date
    ti_end = ti2.end_date

    with redirect_stdout(io.StringIO()) as stdout:
        task_command.task_states_for_dag_run(self.parser.parse_args([
            'tasks',
            'states_for_dag_run',
            'example_python_operator',
            default_date2.isoformat()]))
    actual_out = stdout.getvalue()

    formatted_rows = [
        ('example_python_operator',
         '2016-01-09 00:00:00+00:00',
         'print_the_context',
         'success',
         ti_start,
         ti_end)
    ]

    expected = tabulate(
        formatted_rows,
        ['dag', 'exec_date', 'task', 'state', 'start_date', 'end_date'],
        tablefmt="fancy_grid")

    # Check that prints, and log messages, are shown
    self.assertEqual(expected.replace("\n", ""), actual_out.replace("\n", ""))
def test_task_states_for_dag_run(self):
    dag2 = DagBag().dags['example_python_operator']
    task2 = dag2.get_task(task_id='print_the_context')
    default_date2 = timezone.make_aware(datetime(2016, 1, 9))
    dag2.clear()
    ti2 = TaskInstance(task2, default_date2)
    ti2.set_state(State.SUCCESS)
    ti_start = ti2.start_date
    ti_end = ti2.end_date

    with redirect_stdout(io.StringIO()) as stdout:
        task_command.task_states_for_dag_run(
            self.parser.parse_args([
                'tasks',
                'states-for-dag-run',
                'example_python_operator',
                default_date2.isoformat(),
                '--output',
                "json",
            ])
        )
    actual_out = json.loads(stdout.getvalue())

    assert len(actual_out) == 1
    assert actual_out[0] == {
        'dag_id': 'example_python_operator',
        'execution_date': '2016-01-09T00:00:00+00:00',
        'task_id': 'print_the_context',
        'state': 'success',
        'start_date': ti_start.isoformat(),
        'end_date': ti_end.isoformat(),
    }
def __init__(self,
             dag_directory: str,
             max_runs: int,
             processor_factory: Callable[[str, List[Any]], AbstractDagFileProcessorProcess],
             processor_timeout: timedelta,
             signal_conn: Connection,
             async_mode: bool = True):
    self._file_paths: List[str] = []
    self._file_path_queue: List[str] = []
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    self._processor_factory = processor_factory
    self._signal_conn = signal_conn
    self._async_mode = async_mode
    self._parsing_start_time: Optional[datetime] = None

    self._parallelism = conf.getint('scheduler', 'max_threads')
    if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
        self.log.warning(
            "Because we cannot use more than 1 thread (max_threads = "
            "%d ) when using sqlite. So we set parallelism to 1.",
            self._parallelism
        )
        self._parallelism = 1

    # Parse and schedule each file no faster than this interval.
    self._file_process_interval = conf.getint('scheduler', 'min_file_process_interval')
    # How often to print out DAG file processing stats to the log. Default to
    # 30 seconds.
    self.print_stats_interval = conf.getint('scheduler', 'print_stats_interval')
    # How many seconds do we wait for tasks to heartbeat before mark them as zombies.
    self._zombie_threshold_secs = (
        conf.getint('scheduler', 'scheduler_zombie_task_threshold'))

    # Map from file path to the processor
    self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

    self._num_run = 0

    # Map from file path to stats about the file
    self._file_stats: Dict[str, DagFileStat] = {}

    self._last_zombie_query_time = None
    # Last time that the DAG dir was traversed to look for files
    self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
    # Last time stats were printed
    self.last_stat_print_time = timezone.datetime(2000, 1, 1)
    # TODO: Remove magic number
    self._zombie_query_interval = 10
    self._zombies: List[SimpleTaskInstance] = []
    # How long to wait before timing out a process to parse a DAG file
    self._processor_timeout = processor_timeout

    # How often to scan the DAGs directory for new files. Default to 5 minutes.
    self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

    self._log = logging.getLogger('airflow.processor_manager')
def _get_prev(self, current: DateTime) -> DateTime:
    """Get the first schedule before specified time, with DST fixed."""
    naive = make_naive(current, self._timezone)
    cron = croniter(self._expression, start_time=naive)
    scheduled = cron.get_prev(datetime.datetime)
    if not self._should_fix_dst:
        return convert_to_utc(make_aware(scheduled, self._timezone))
    delta = naive - scheduled
    return convert_to_utc(current.in_timezone(self._timezone) - delta)
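For context, a minimal standalone sketch of the non-DST-fix branch above: attach the schedule's timezone to the naive cron result, then convert to UTC. This assumes the `airflow.utils.timezone` helpers used in this section and a pendulum timezone; the zone and dates below are illustrative only.

# Illustrative sketch, not the timetable's own code: compose make_aware() with
# convert_to_utc() the same way the non-DST branch of _get_prev() does.
import datetime

import pendulum
from airflow.utils.timezone import convert_to_utc, make_aware

tz = pendulum.timezone("Europe/Amsterdam")        # assumed schedule timezone
scheduled = datetime.datetime(2021, 3, 27, 2, 0)  # naive value, e.g. from croniter

aware_local = make_aware(scheduled, tz)  # 2021-03-27 02:00 +01:00 (CET, before the DST switch)
as_utc = convert_to_utc(aware_local)     # 2021-03-27 01:00 UTC
print(as_utc)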
def pull_xcom_call(**kwargs):
    # XCom.get_many (used directly, without the context object) returns all values
    # written before this date.
    get_many_xcom_values = XCom.get_many(
        execution_date=make_aware(datetime(2020, 4, 27, 14, 50, 0, 0)),  # 27 April 2020
        dag_ids=["write_to_xcom"],
        include_prior_dates=True)
    print('XCom.get_many')
    print(get_many_xcom_values)

    get_xcom_with_ti = kwargs['ti'].xcom_pull(dag_id="write_to_xcom", include_prior_dates=True)
    print('ti.xcom_pull with include_prior_dates')
    print(get_xcom_with_ti)
def test_cli_list_dag_runs(self):
    dag_command.dag_trigger(self.parser.parse_args([
        'dags', 'trigger', 'example_bash_operator',
    ]))
    args = self.parser.parse_args(['dags',
                                   'list_runs',
                                   '--dag-id',
                                   'example_bash_operator',
                                   '--no-backfill',
                                   '--start-date',
                                   DEFAULT_DATE.isoformat(),
                                   '--end-date',
                                   timezone.make_aware(datetime.max).isoformat()])
    dag_command.dag_list_dag_runs(args)
def setUp(self) -> None:
    self.dag_id = "test_logging_dag"
    self.task_id = "test_task"
    reset(self.dag_id)
    self.execution_date_str = timezone.make_aware(datetime(2017, 1, 1)).isoformat()
    self.log_dir = conf.get('logging', 'base_log_folder')
    self.log_filename = f"{self.dag_id}/{self.task_id}/{self.execution_date_str}/1.log"
    self.ti_log_file_path = os.path.join(self.log_dir, self.log_filename)
    self.parser = cli_parser.get_parser()

    try:
        os.remove(self.ti_log_file_path)
    except OSError:
        pass
def __init__(self, *, target_time: Union[str, datetime.datetime], **kwargs) -> None:
    super().__init__(**kwargs)
    if isinstance(target_time, datetime.datetime):
        if timezone.is_naive(target_time):
            target_time = timezone.make_aware(target_time)
        self.target_time = target_time
    elif isinstance(target_time, str):
        self.target_time = timezone.parse(target_time)
    else:
        raise TypeError(
            f"Expected str or datetime.datetime type for target_time. Got {type(target_time)}"
        )
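The constructor above normalizes `target_time` to an aware datetime. A minimal self-contained sketch of that normalization pattern, assuming only the `airflow.utils.timezone` helpers it already uses (the function name and sample values are illustrative, not part of the original operator):

# Sketch: normalize a naive/aware datetime or an ISO string to an aware datetime.
import datetime
from typing import Union

from airflow.utils import timezone


def normalize_target_time(target_time: Union[str, datetime.datetime]) -> datetime.datetime:
    """Return a timezone-aware datetime for either a datetime object or a parseable string."""
    if isinstance(target_time, datetime.datetime):
        # make_aware() raises ValueError on an already-aware value, so guard with is_naive().
        return timezone.make_aware(target_time) if timezone.is_naive(target_time) else target_time
    if isinstance(target_time, str):
        return timezone.parse(target_time)
    raise TypeError(f"Expected str or datetime.datetime, got {type(target_time)}")


# Hypothetical usage: both calls yield aware datetimes (using the configured default timezone).
print(normalize_target_time(datetime.datetime(2021, 1, 1)))
print(normalize_target_time("2021-01-01T00:00:00"))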
def test_kill_timed_out_processors_kill(self, mock_kill, mock_pid):
    mock_pid.return_value = 1234
    manager = DagFileProcessorManager(
        dag_directory='directory',
        max_runs=1,
        processor_factory=MagicMock().return_value,
        processor_timeout=timedelta(seconds=5),
        signal_conn=MagicMock(),
        async_mode=True)

    processor = DagFileProcessorProcess('abc.txt', False, [], [])
    processor._start_time = timezone.make_aware(datetime.min)
    manager._processors = {'abc.txt': processor}
    manager._kill_timed_out_processors()
    mock_kill.assert_called_once_with()
def test_kill_timed_out_processors_no_kill(self, mock_dag_file_processor, mock_pid):
    mock_pid.return_value = 1234
    manager = DagFileProcessorManager(
        dag_directory='directory',
        file_paths=['abc.txt'],
        max_runs=1,
        processor_factory=MagicMock().return_value,
        processor_timeout=timedelta(seconds=5),
        signal_conn=MagicMock(),
        dag_ids=[],
        pickle_dags=False,
        async_mode=True)

    processor = DagFileProcessor('abc.txt', False, [], [])
    processor._start_time = timezone.make_aware(datetime.max)
    manager._processors = {'abc.txt': processor}
    manager._kill_timed_out_processors()
    mock_dag_file_processor.kill.assert_not_called()
def __init__(self,
             dag_directory: str,
             max_runs: int,
             processor_factory: Callable[
                 [str, List[FailureCallbackRequest]],
                 AbstractDagFileProcessorProcess
             ],
             processor_timeout: timedelta,
             signal_conn: MultiprocessingConnection,
             async_mode: bool = True):
    super().__init__()
    self._file_paths: List[str] = []
    self._file_path_queue: List[str] = []
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    self._processor_factory = processor_factory
    self._signal_conn = signal_conn
    self._async_mode = async_mode
    self._parsing_start_time: Optional[datetime] = None

    self._parallelism = conf.getint('scheduler', 'max_threads')
    if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
        self.log.warning(
            "Because we cannot use more than 1 thread (max_threads = "
            "%d ) when using sqlite. So we set parallelism to 1.",
            self._parallelism
        )
        self._parallelism = 1

    # How often to mark DAGs as inactive and delete their serializations
    # if they haven't been processed recently.
    self._dag_cleanup_interval = conf.getint('scheduler', 'dag_cleanup_interval')
    self._min_serialized_dag_update_interval = conf.getint('core', 'min_serialized_dag_update_interval')

    # Parse and schedule each file no faster than this interval.
    self._file_process_interval = conf.getint('scheduler', 'min_file_process_interval')
    # How often to print out DAG file processing stats to the log. Default to
    # 30 seconds.
    self.print_stats_interval = conf.getint('scheduler', 'print_stats_interval')
    # How many seconds do we wait for tasks to heartbeat before mark them as zombies.
    self._zombie_threshold_secs = (
        conf.getint('scheduler', 'scheduler_zombie_task_threshold'))

    # Should store dag file source in a database?
    self.store_dag_code = conf.getboolean('core', 'store_dag_code', fallback=False)

    # Map from file path to the processor
    self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

    self._num_run = 0

    # Map from file path to stats about the file
    self._file_stats: Dict[str, DagFileStat] = {}

    self._last_zombie_query_time = None
    # Last time that the DAG dir was traversed to look for files
    self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
    # Last time stats were printed
    self.last_stat_print_time = timezone.datetime(2000, 1, 1)
    # Last time we ran DAG cleanup
    self.last_dag_cleanup_time = timezone.utcnow()
    # TODO: Remove magic number
    self._zombie_query_interval = 10
    # How long to wait before timing out a process to parse a DAG file
    self._processor_timeout = processor_timeout

    # How often to scan the DAGs directory for new files. Default to 5 minutes.
    self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

    # Mapping file name and callbacks requests
    self._callback_to_execute: Dict[str, List[FailureCallbackRequest]] = defaultdict(list)

    self._log = logging.getLogger('airflow.processor_manager')
from argparse import Namespace

from airflow import settings
import airflow.bin.cli as cli
from airflow.bin.cli import get_num_ready_workers_running, run, get_dag
from airflow.models import TaskInstance
from airflow.utils import timezone
from airflow.utils.state import State
from airflow.settings import Session
from airflow import models
from tests.compat import mock
import os

dag_folder_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])

DEFAULT_DATE = timezone.make_aware(datetime(2015, 1, 1))
TEST_DAG_FOLDER = os.path.join(
    os.path.dirname(dag_folder_path), 'dags')
TEST_DAG_ID = 'unit_tests'


def reset(dag_id):
    session = Session()
    tis = session.query(models.TaskInstance).filter_by(dag_id=dag_id)
    tis.delete()
    session.commit()
    session.close()


def create_mock_args(
    task_id,
def date_range(
    start_date: datetime,
    end_date: Optional[datetime] = None,
    num: Optional[int] = None,
    delta: Optional[Union[str, timedelta, relativedelta]] = None,
) -> List[datetime]:
    """
    Get a set of dates as a list based on a start, end and delta; delta can be
    something that can be added to `datetime.datetime` or a cron expression as a `str`

    .. code-block:: pycon

        >>> from airflow.utils.dates import date_range
        >>> from datetime import datetime, timedelta
        >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
        [datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 2, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 3, 0, 0, tzinfo=Timezone('UTC'))]
        >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta="0 0 * * *")
        [datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 2, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 1, 3, 0, 0, tzinfo=Timezone('UTC'))]
        >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
        [datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 2, 1, 0, 0, tzinfo=Timezone('UTC')),
        datetime.datetime(2016, 3, 1, 0, 0, tzinfo=Timezone('UTC'))]

    :param start_date: anchor date to start the series from
    :param end_date: right boundary for the date range
    :param num: alternatively to end_date, you can specify the number of
        entries you want in the range. This number can be negative,
        output will always be sorted regardless
    :param delta: step length. It can be datetime.timedelta or cron expression as string
    """
    warnings.warn(
        "`airflow.utils.dates.date_range()` is deprecated. Please use `airflow.timetables`.",
        category=DeprecationWarning,
        stacklevel=2,
    )
    if not delta:
        return []
    if end_date:
        if start_date > end_date:
            raise Exception("Wait. start_date needs to be before end_date")
        if num:
            raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    time_zone = start_date.tzinfo

    abs_delta: Union[timedelta, relativedelta]
    if isinstance(delta, str):
        delta_iscron = True
        if timezone.is_localized(start_date):
            start_date = timezone.make_naive(start_date, time_zone)
        cron = croniter(cron_presets.get(delta, delta), start_date)
    elif isinstance(delta, timedelta):
        abs_delta = abs(delta)
    elif isinstance(delta, relativedelta):
        abs_delta = abs(delta)
    else:
        raise Exception(
            "Wait. delta must be either datetime.timedelta or cron expression as str"
        )

    dates = []
    if end_date:
        if timezone.is_naive(start_date) and not timezone.is_naive(end_date):
            end_date = timezone.make_naive(end_date, time_zone)

        while start_date <= end_date:  # type: ignore
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += abs_delta
    else:
        num_entries: int = num  # type: ignore
        for _ in range(abs(num_entries)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron and num_entries > 0:
                start_date = cron.get_next(datetime)
            elif delta_iscron:
                start_date = cron.get_prev(datetime)
            elif num_entries > 0:
                start_date += abs_delta
            else:
                start_date -= abs_delta

    return sorted(dates)
def parse_datetime_f(value):
    if not isinstance(value, dt.datetime):
        return value

    return timezone.make_aware(value)
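A quick, self-contained sketch of how this filter behaves; the function is re-declared locally so the snippet runs on its own, and the sample values are illustrative only:

# Illustrative only: non-datetime values pass through unchanged, naive datetimes
# come back timezone-aware (using the configured default timezone).
import datetime as dt

from airflow.utils import timezone


def parse_datetime_f(value):
    if not isinstance(value, dt.datetime):
        return value
    return timezone.make_aware(value)


print(parse_datetime_f("not a datetime"))         # returned as-is
print(parse_datetime_f(dt.datetime(2017, 1, 1)))  # now carries tzinfo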
def __init__(
    self,
    dag_directory: str,
    max_runs: int,
    processor_factory: Callable[[str, List[CallbackRequest]], AbstractDagFileProcessorProcess],
    processor_timeout: timedelta,
    signal_conn: MultiprocessingConnection,
    dag_ids: Optional[List[str]],
    pickle_dags: bool,
    async_mode: bool = True,
):
    super().__init__()
    self._file_paths: List[str] = []
    self._file_path_queue: List[str] = []
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    self._processor_factory = processor_factory
    self._signal_conn = signal_conn
    self._pickle_dags = pickle_dags
    self._dag_ids = dag_ids
    self._async_mode = async_mode
    self._parsing_start_time: Optional[int] = None

    self._parallelism = conf.getint('scheduler', 'parsing_processes')
    if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
        self.log.warning(
            "Because we cannot use more than 1 thread (parsing_processes = "
            "%d ) when using sqlite. So we set parallelism to 1.",
            self._parallelism,
        )
        self._parallelism = 1

    # Parse and schedule each file no faster than this interval.
    self._file_process_interval = conf.getint('scheduler', 'min_file_process_interval')
    # How often to print out DAG file processing stats to the log. Default to
    # 30 seconds.
    self.print_stats_interval = conf.getint('scheduler', 'print_stats_interval')
    # How many seconds do we wait for tasks to heartbeat before mark them as zombies.
    self._zombie_threshold_secs = conf.getint('scheduler', 'scheduler_zombie_task_threshold')

    # Should store dag file source in a database?
    self.store_dag_code = STORE_DAG_CODE
    # Map from file path to the processor
    self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

    self._num_run = 0

    # Map from file path to stats about the file
    self._file_stats: Dict[str, DagFileStat] = {}

    self._last_zombie_query_time = None
    # Last time that the DAG dir was traversed to look for files
    self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
    # Last time stats were printed
    self.last_stat_print_time = 0

    # TODO: Remove magic number
    self._zombie_query_interval = 10
    # How long to wait before timing out a process to parse a DAG file
    self._processor_timeout = processor_timeout

    # How often to scan the DAGs directory for new files. Default to 5 minutes.
    self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

    # Mapping file name and callbacks requests
    self._callback_to_execute: Dict[str, List[CallbackRequest]] = defaultdict(list)

    self._log = logging.getLogger('airflow.processor_manager')

    self.waitables: Dict[Any, Union[MultiprocessingConnection, AbstractDagFileProcessorProcess]] = {
        self._signal_conn: self._signal_conn,
    }
def __init__(
    self,
    dag_directory: Union[str, "pathlib.Path"],
    max_runs: int,
    processor_timeout: timedelta,
    dag_ids: Optional[List[str]],
    pickle_dags: bool,
    signal_conn: Optional[MultiprocessingConnection] = None,
    async_mode: bool = True,
):
    super().__init__()
    self._file_paths: List[str] = []
    self._file_path_queue: List[str] = []
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    # signal_conn is None for dag_processor_standalone mode.
    self._direct_scheduler_conn = signal_conn
    self._pickle_dags = pickle_dags
    self._dag_ids = dag_ids
    self._async_mode = async_mode
    self._parsing_start_time: Optional[int] = None

    # Set the signal conn in to non-blocking mode, so that attempting to
    # send when the buffer is full errors, rather than hangs for-ever
    # attempting to send (this is to avoid deadlocks!)
    #
    # Don't do this in sync_mode, as we _need_ the DagParsingStat sent to
    # continue the scheduler
    if self._async_mode and self._direct_scheduler_conn is not None:
        os.set_blocking(self._direct_scheduler_conn.fileno(), False)

    self._parallelism = conf.getint('scheduler', 'parsing_processes')
    if (
        conf.get_mandatory_value('database', 'sql_alchemy_conn').startswith('sqlite')
        and self._parallelism > 1
    ):
        self.log.warning(
            "Because we cannot use more than 1 thread (parsing_processes = "
            "%d) when using sqlite. So we set parallelism to 1.",
            self._parallelism,
        )
        self._parallelism = 1

    # Parse and schedule each file no faster than this interval.
    self._file_process_interval = conf.getint('scheduler', 'min_file_process_interval')
    # How often to print out DAG file processing stats to the log. Default to
    # 30 seconds.
    self.print_stats_interval = conf.getint('scheduler', 'print_stats_interval')

    # Map from file path to the processor
    self._processors: Dict[str, DagFileProcessorProcess] = {}

    self._num_run = 0

    # Map from file path to stats about the file
    self._file_stats: Dict[str, DagFileStat] = {}

    # Last time that the DAG dir was traversed to look for files
    self.last_dag_dir_refresh_time = timezone.make_aware(datetime.fromtimestamp(0))
    # Last time stats were printed
    self.last_stat_print_time = 0
    # Last time we cleaned up DAGs which are no longer in files
    self.last_deactivate_stale_dags_time = timezone.make_aware(datetime.fromtimestamp(0))
    # How often to check for DAGs which are no longer in files
    self.deactivate_stale_dags_interval = conf.getint('scheduler', 'deactivate_stale_dags_interval')
    # How long to wait before timing out a process to parse a DAG file
    self._processor_timeout = processor_timeout
    # How often to scan the DAGs directory for new files. Default to 5 minutes.
    self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval')

    # Mapping file name and callbacks requests
    self._callback_to_execute: Dict[str, List[CallbackRequest]] = defaultdict(list)

    self._log = logging.getLogger('airflow.processor_manager')

    self.waitables: Dict[Any, Union[MultiprocessingConnection, DagFileProcessorProcess]] = (
        {
            self._direct_scheduler_conn: self._direct_scheduler_conn,
        }
        if self._direct_scheduler_conn is not None
        else {}
    )
def test_make_aware(self):
    self.assertEqual(
        timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30), EAT),
        datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT))

    with self.assertRaises(ValueError):
        timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT), EAT)
def clean(self, value):
    dt = super(UtcDateTimeFilterMixin, self).clean(value)
    return timezone.make_aware(dt, timezone=timezone.utc)
def round_time(dt, delta, start_date=timezone.make_aware(datetime.min)):
    """
    Returns the datetime of the form start_date + i * delta
    which is closest to dt for any non-negative integer i.

    Note that delta may be a datetime.timedelta or a dateutil.relativedelta

    >>> round_time(datetime(2015, 1, 1, 6), timedelta(days=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 1, 2), relativedelta(months=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 9, 16, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 16, 0, 0)
    >>> round_time(datetime(2015, 9, 15, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 15, 0, 0)
    >>> round_time(datetime(2015, 9, 14, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    >>> round_time(datetime(2015, 9, 13, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    """
    if isinstance(delta, six.string_types):
        # It's cron based, so it's easy
        tz = start_date.tzinfo
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
        prev = cron.get_prev(datetime)
        if prev == start_date:
            return timezone.make_aware(start_date, tz)
        else:
            return timezone.make_aware(prev, tz)

    # Ignore the microseconds of dt
    dt -= timedelta(microseconds=dt.microsecond)

    # We are looking for a datetime in the form start_date + i * delta
    # which is as close as possible to dt. Since delta could be a relative
    # delta we don't know its exact length in seconds so we cannot rely on
    # division to find i. Instead we employ a binary search algorithm, first
    # finding an upper and lower limit and then bisecting the interval until
    # we have found the closest match.

    # We first search an upper limit for i for which start_date + upper * delta
    # exceeds dt.
    upper = 1
    while start_date + upper * delta < dt:
        # To speed up finding an upper limit we grow this exponentially by a
        # factor of 2
        upper *= 2

    # Since upper is the first value for which start_date + upper * delta
    # exceeds dt, upper // 2 is below dt and therefore forms a lower limit
    # for the i we are looking for
    lower = upper // 2

    # We now continue to bisect the interval between
    # start_date + lower * delta and start_date + upper * delta
    # until we find the closest value
    while True:
        # Invariant: start + lower * delta < dt <= start + upper * delta
        # If start_date + (lower + 1)*delta exceeds dt, then either lower or
        # lower+1 has to be the solution we are searching for
        if start_date + (lower + 1) * delta >= dt:
            # Check if start_date + (lower + 1)*delta or
            # start_date + lower*delta is closer to dt and return the solution
            if (
                    (start_date + (lower + 1) * delta) - dt <=
                    dt - (start_date + lower * delta)):
                return start_date + (lower + 1) * delta
            else:
                return start_date + lower * delta

        # We bisect the interval and either replace the lower or upper
        # limit with the candidate
        candidate = lower + (upper - lower) // 2
        if start_date + candidate * delta >= dt:
            upper = candidate
        else:
            lower = candidate
def date_range(start_date, end_date=None, num=None, delta=None):  # pylint: disable=too-many-branches
    """
    Get a set of dates as a list based on a start, end and delta; delta
    can be something that can be added to `datetime.datetime`
    or a cron expression as a `str`

    .. code-block:: python

        date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
            [datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 1, 2, 0, 0),
            datetime.datetime(2016, 1, 3, 0, 0)]
        date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
            [datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 1, 2, 0, 0),
            datetime.datetime(2016, 1, 3, 0, 0)]
        date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
            [datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)]

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: alternatively to end_date, you can specify the number of
        entries you want in the range. This number can be negative,
        output will always be sorted regardless
    :type num: int
    :param delta: step length. It can be datetime.timedelta or cron expression as string
    :type delta: datetime.timedelta or str
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    if delta in cron_presets:
        delta = cron_presets.get(delta)

    delta_iscron = False
    time_zone = start_date.tzinfo
    if isinstance(delta, str):
        delta_iscron = True
        if timezone.is_localized(start_date):
            start_date = timezone.make_naive(start_date, time_zone)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)
    else:
        raise Exception(
            "Wait. delta must be either datetime.timedelta or cron expression as str"
        )

    dates = []
    if end_date:
        if timezone.is_naive(start_date) and not timezone.is_naive(end_date):
            end_date = timezone.make_naive(end_date, time_zone)

        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, time_zone))
            else:
                dates.append(start_date)

            if delta_iscron and num > 0:
                start_date = cron.get_next(datetime)
            elif delta_iscron:
                start_date = cron.get_prev(datetime)
            elif num > 0:
                start_date += delta
            else:
                start_date -= delta
    return sorted(dates)
def round_time(dt, delta, start_date=timezone.make_aware(datetime.min)):
    """
    Returns the datetime of the form start_date + i * delta
    which is closest to dt for any non-negative integer i.

    Note that delta may be a datetime.timedelta or a dateutil.relativedelta

    >>> round_time(datetime(2015, 1, 1, 6), timedelta(days=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 1, 2), relativedelta(months=1))
    datetime.datetime(2015, 1, 1, 0, 0)
    >>> round_time(datetime(2015, 9, 16, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 16, 0, 0)
    >>> round_time(datetime(2015, 9, 15, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 15, 0, 0)
    >>> round_time(datetime(2015, 9, 14, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    >>> round_time(datetime(2015, 9, 13, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
    datetime.datetime(2015, 9, 14, 0, 0)
    """
    if isinstance(delta, six.string_types):
        # It's cron based, so it's easy
        tz = start_date.tzinfo
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
        prev = cron.get_prev(datetime)
        if prev == start_date:
            return timezone.make_aware(start_date, tz)
        else:
            return timezone.make_aware(prev, tz)

    # Ignore the microseconds of dt
    dt -= timedelta(microseconds=dt.microsecond)

    # We are looking for a datetime in the form start_date + i * delta
    # which is as close as possible to dt. Since delta could be a relative
    # delta we don't know its exact length in seconds so we cannot rely on
    # division to find i. Instead we employ a binary search algorithm, first
    # finding an upper and lower limit and then bisecting the interval until
    # we have found the closest match.

    # We first search an upper limit for i for which start_date + upper * delta
    # exceeds dt.
    upper = 1
    while start_date + upper * delta < dt:
        # To speed up finding an upper limit we grow this exponentially by a
        # factor of 2
        upper *= 2

    # Since upper is the first value for which start_date + upper * delta
    # exceeds dt, upper // 2 is below dt and therefore forms a lower limit
    # for the i we are looking for
    lower = upper // 2

    # We now continue to bisect the interval between
    # start_date + lower * delta and start_date + upper * delta
    # until we find the closest value
    while True:
        # Invariant: start + lower * delta < dt <= start + upper * delta
        # If start_date + (lower + 1)*delta exceeds dt, then either lower or
        # lower+1 has to be the solution we are searching for
        if start_date + (lower + 1) * delta >= dt:
            # Check if start_date + (lower + 1)*delta or
            # start_date + lower*delta is closer to dt and return the solution
            if ((start_date + (lower + 1) * delta) - dt <=
                    dt - (start_date + lower * delta)):
                return start_date + (lower + 1) * delta
            else:
                return start_date + lower * delta

        # We bisect the interval and either replace the lower or upper
        # limit with the candidate
        candidate = lower + (upper - lower) // 2
        if start_date + candidate * delta >= dt:
            upper = candidate
        else:
            lower = candidate
def clean(self, value):
    dt = super(UtcDateTimeFilterMixin, self).clean(value)
    if isinstance(dt, list):
        return [timezone.make_aware(d, timezone=timezone.utc) for d in dt]
    return timezone.make_aware(dt, timezone=timezone.utc)
def date_range(start_date, end_date=None, num=None, delta=None):
    """
    Get a set of dates as a list based on a start, end and delta; delta
    can be something that can be added to ``datetime.datetime``
    or a cron expression as a ``str``

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: alternatively to end_date, you can specify the number of
        entries you want in the range. This number can be negative,
        output will always be sorted regardless
    :type num: int

    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)]
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    # Capture the timezone up front so naive dates can be made aware in either branch.
    tz = start_date.tzinfo
    if isinstance(delta, six.string_types):
        delta_iscron = True
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)

    dates = []
    if end_date:
        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                if num > 0:
                    start_date = cron.get_next(datetime)
                else:
                    start_date = cron.get_prev(datetime)
            else:
                if num > 0:
                    start_date += delta
                else:
                    start_date -= delta
    return sorted(dates)
def populate_obj(self, item):
    # TODO: This is probably better done as a custom field type so we can
    # set TZ at parse time
    super(DagRunForm, self).populate_obj(item)
    item.execution_date = timezone.make_aware(item.execution_date)
def prepare_file_path_queue(self):
    """Generate more file paths to process. Results are saved in _file_path_queue."""
    self._parsing_start_time = time.perf_counter()
    # If the file path is already being processed, or if a file was
    # processed recently, wait until the next batch
    file_paths_in_progress = self._processors.keys()
    now = timezone.utcnow()

    # Sort the file paths by the parsing order mode
    list_mode = conf.get("scheduler", "file_parsing_sort_mode")

    files_with_mtime = {}
    file_paths = []
    is_mtime_mode = list_mode == "modified_time"

    file_paths_recently_processed = []
    for file_path in self._file_paths:

        if is_mtime_mode:
            try:
                files_with_mtime[file_path] = os.path.getmtime(file_path)
            except FileNotFoundError:
                self.log.warning("Skipping processing of missing file: %s", file_path)
                continue
            file_modified_time = timezone.make_aware(
                datetime.fromtimestamp(files_with_mtime[file_path]))
        else:
            file_paths.append(file_path)
            file_modified_time = None

        # Find file paths that were recently processed to exclude them
        # from being added to file_path_queue
        # unless they were modified recently and parsing mode is "modified_time"
        # in which case we don't honor "self._file_process_interval" (min_file_process_interval)
        last_finish_time = self.get_last_finish_time(file_path)
        if (
            last_finish_time is not None
            and (now - last_finish_time).total_seconds() < self._file_process_interval
            and not (is_mtime_mode and file_modified_time and (file_modified_time > last_finish_time))
        ):
            file_paths_recently_processed.append(file_path)

    # Sort file paths via last modified time
    if is_mtime_mode:
        file_paths = sorted(files_with_mtime, key=files_with_mtime.get, reverse=True)
    elif list_mode == "alphabetical":
        file_paths = sorted(file_paths)
    elif list_mode == "random_seeded_by_host":
        # Shuffle the list seeded by hostname so multiple schedulers can work on different
        # set of files. Since we set the seed, the sort order will remain same per host
        random.Random(get_hostname()).shuffle(file_paths)

    files_paths_at_run_limit = [
        file_path for file_path, stat in self._file_stats.items() if stat.run_count == self._max_runs
    ]

    file_paths_to_exclude = set(file_paths_in_progress).union(
        file_paths_recently_processed, files_paths_at_run_limit
    )

    # Do not convert the following list to set as set does not preserve the order
    # and we need to maintain the order of file_paths for `[scheduler] file_parsing_sort_mode`
    files_paths_to_queue = [
        file_path for file_path in file_paths if file_path not in file_paths_to_exclude
    ]

    for file_path, processor in self._processors.items():
        self.log.debug(
            "File path %s is still being processed (started: %s)",
            processor.file_path,
            processor.start_time.isoformat(),
        )

    self.log.debug(
        "Queuing the following files for processing:\n\t%s", "\n\t".join(files_paths_to_queue)
    )

    for file_path in files_paths_to_queue:
        if file_path not in self._file_stats:
            self._file_stats[file_path] = DagFileStat(
                num_dags=0, import_errors=0, last_finish_time=None, last_duration=None, run_count=0
            )

    self._file_path_queue.extend(files_paths_to_queue)
def date_range(
        start_date,
        end_date=None,
        num=None,
        delta=None):
    """
    Get a set of dates as a list based on a start, end and delta; delta
    can be something that can be added to ``datetime.datetime``
    or a cron expression as a ``str``

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: alternatively to end_date, you can specify the number of
        entries you want in the range. This number can be negative,
        output will always be sorted regardless
    :type num: int

    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)]
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    if not end_date and not num:
        end_date = timezone.utcnow()

    delta_iscron = False
    tz = start_date.tzinfo
    if isinstance(delta, six.string_types):
        delta_iscron = True
        start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)

    dates = []
    if end_date:
        if timezone.is_naive(start_date):
            end_date = timezone.make_naive(end_date, tz)

        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                if num > 0:
                    start_date = cron.get_next(datetime)
                else:
                    start_date = cron.get_prev(datetime)
            else:
                if num > 0:
                    start_date += delta
                else:
                    start_date -= delta
    return sorted(dates)
from airflow import settings
from airflow.cli import cli_parser
from airflow.cli.commands import dag_command
from airflow.exceptions import AirflowException
from airflow.models import DagBag, DagModel, DagRun
from airflow.utils import timezone
from airflow.utils.session import create_session
from airflow.utils.state import State
from airflow.utils.types import DagRunType
from tests.test_utils.config import conf_vars
from tests.test_utils.db import clear_db_dags, clear_db_runs

dag_folder_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])

DEFAULT_DATE = timezone.make_aware(datetime(2015, 1, 1))
TEST_DAG_FOLDER = os.path.join(os.path.dirname(dag_folder_path), 'dags')
TEST_DAG_ID = 'unit_tests'
EXAMPLE_DAGS_FOLDER = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))),
    "airflow/example_dags")


class TestCliDags(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.dagbag = DagBag(include_examples=True)
        cls.dagbag.sync_to_db()
        cls.parser = cli_parser.get_parser()
def test_make_aware(self):
    assert timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30), EAT) == datetime.datetime(
        2011, 9, 1, 13, 20, 30, tzinfo=EAT
    )
    with pytest.raises(ValueError):
        timezone.make_aware(datetime.datetime(2011, 9, 1, 13, 20, 30, tzinfo=EAT), EAT)
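The three test variants above all exercise the same contract. A minimal standalone sketch of that contract, assuming `airflow.utils.timezone` and `pendulum`; the timezone constant is an illustrative stand-in for the tests' EAT fixture:

# Sketch of the make_aware() contract demonstrated by the tests above:
# a naive datetime gains the given timezone, an already-aware one raises ValueError.
import datetime

import pendulum
from airflow.utils import timezone

EAT = pendulum.timezone("Africa/Nairobi")  # illustrative stand-in for the tests' EAT constant

naive = datetime.datetime(2011, 9, 1, 13, 20, 30)
aware = timezone.make_aware(naive, EAT)
assert aware.tzinfo is not None

try:
    timezone.make_aware(aware, EAT)  # already aware, so this raises
except ValueError as err:
    print(f"refused to localize an aware datetime: {err}")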