def with_dependencies(self, dag_collection):
    """Perform a dry run to get upstream dependencies."""
    dependencies = []

    if self.is_python_script:
        # dry runs are not possible for Python script queries; skip them
        self.dependencies = dependencies
        return

    for table in self._get_referenced_tables():
        upstream_task = dag_collection.task_for_table(table[0], table[1], table[2])
        task_schedule_interval = dag_collection.dag_by_name(
            self.dag_name
        ).schedule_interval

        if upstream_task is not None:
            # ensure there are no duplicate dependencies;
            # manual dependency definitions overwrite automatically detected ones
            if not any(
                d.dag_name == upstream_task.dag_name
                and d.task_id == upstream_task.task_name
                for d in self.depends_on
            ):
                upstream_schedule_interval = dag_collection.dag_by_name(
                    upstream_task.dag_name
                ).schedule_interval
                execution_delta = schedule_interval_delta(
                    upstream_schedule_interval, task_schedule_interval
                )

                if execution_delta == "0s":
                    execution_delta = None

                dependencies.append(
                    TaskRef(
                        dag_name=upstream_task.dag_name,
                        task_id=upstream_task.task_name,
                        execution_delta=execution_delta,
                    )
                )
        else:
            # check whether the table matches a statically defined external task
            for task, patterns in EXTERNAL_TASKS.items():
                if any(fnmatchcase(f"{table[1]}.{table[2]}", p) for p in patterns):
                    # ensure there are no duplicate dependencies;
                    # manual dependency definitions overwrite automatically detected ones
                    if not any(
                        d.dag_name == task.dag_name and d.task_id == task.task_id
                        for d in self.depends_on + dependencies
                    ):
                        execution_delta = schedule_interval_delta(
                            task.schedule_interval, task_schedule_interval
                        )

                        if execution_delta:
                            dependencies.append(
                                TaskRef(
                                    dag_name=task.dag_name,
                                    task_id=task.task_id,
                                    execution_delta=execution_delta,
                                )
                            )
                    break  # stop after the first match

    self.dependencies = dependencies
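
# Illustrative sketch of an EXTERNAL_TASKS entry (hypothetical values; the
# real mapping is defined elsewhere in the repo). Keys are assumed to be
# TaskRef-like objects carrying a schedule_interval; values are fnmatch-style
# "dataset.table" patterns, matched case-sensitively by fnmatchcase above.
from fnmatch import fnmatchcase

EXTERNAL_TASKS_EXAMPLE = {
    TaskRef(
        dag_name="copy_deduplicate",
        task_id="copy_deduplicate_all",
        schedule_interval="0 1 * * *",
    ): ["*_stable.*"],
}

# a referenced table like "telemetry_stable.main_v4" (hypothetical name)
# would match the pattern above
assert fnmatchcase("telemetry_stable.main_v4", "*_stable.*")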
def _create_export_task(self, task, dag_collection):
    """Create a task that exports the query results as public JSON."""
    if not task.public_json:
        raise ValueError(f"Task {task.task_name} not marked as public JSON.")

    converter = cattr.Converter()
    task_dict = converter.unstructure(task)

    # remove table-specific attributes before cloning the task
    del task_dict["dataset"]
    del task_dict["table"]
    del task_dict["version"]
    del task_dict["project"]

    export_task = converter.structure(task_dict, Task)
    export_task.dag_name = self.name
    export_task.task_name = f"export_public_data_json_{export_task.task_name}"

    task_schedule_interval = dag_collection.dag_by_name(
        task.dag_name
    ).schedule_interval
    execution_delta = schedule_interval_delta(
        task_schedule_interval, self.schedule_interval
    )

    if execution_delta == "0s":
        execution_delta = None

    export_task.dependencies = [
        TaskRef(
            dag_name=task.dag_name,
            task_id=task.task_name,
            execution_delta=execution_delta,
        )
    ]

    return export_task
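
# Minimal sketch of the cattrs round trip used above, with a hypothetical
# attrs class: unstructure() flattens an instance into a plain dict, keys for
# fields that have defaults can be deleted, and structure() builds a fresh
# instance from what remains (deleted fields fall back to their defaults).
import attr
import cattr

@attr.s(auto_attribs=True)
class _ExampleTask:
    task_name: str
    dataset: str = ""

_converter = cattr.Converter()
_d = _converter.unstructure(_ExampleTask(task_name="query_v1", dataset="telemetry"))
del _d["dataset"]  # analogous to deleting dataset/table/version/project above
_clone = _converter.structure(_d, _ExampleTask)  # dataset falls back to ""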
def get_execution_delta(self, schedule_interval):
    """Determine execution_delta, via schedule_interval if necessary."""
    if self.execution_delta is not None:
        return self.execution_delta
    elif self.schedule_interval is not None and schedule_interval is not None:
        execution_delta = schedule_interval_delta(
            self.schedule_interval, schedule_interval
        )
        if execution_delta != "0s":
            return execution_delta
    return None
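
# Hedged usage sketch, assuming get_execution_delta is a TaskRef method, that
# TaskRef accepts a schedule_interval argument, and that execution_delta
# defaults to None (the dag_name and task_id below are hypothetical): an
# explicitly configured execution_delta always wins; otherwise the delta is
# derived from the two schedule intervals, with a zero delta normalized to None.
ref = TaskRef(
    dag_name="bqetl_core",
    task_id="telemetry_derived__core__v1",
    schedule_interval="0 1 * * *",
)
assert ref.get_execution_delta("0 2 * * *") == "3600s"
assert ref.get_execution_delta("0 1 * * *") is None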
def test_schedule_interval_delta(self):
    assert schedule_interval_delta("0 1 * * *", "0 2 * * *") == "3600s"
    assert schedule_interval_delta("0 2 * * *", "0 0 * * *") == "-7200s"
    assert schedule_interval_delta("* * * * *", "* * * * *") == "0s"
    assert schedule_interval_delta("0 1 * * *", "0 1 * * *") == "0s"
    assert schedule_interval_delta("daily", "0 1 * * *") == "3600s"
    assert schedule_interval_delta("daily", "0 0 * * *") == "0s"
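
# The assertions above pin down schedule_interval_delta's contract: the signed
# offset, in seconds, of the second schedule relative to the first, rendered
# as "<seconds>s", with "daily" treated as "0 0 * * *". A minimal sketch that
# covers only these once-per-day cron cases (the real implementation is more
# general; this helper name is an assumption):
def _schedule_interval_delta_sketch(upstream, downstream):
    def seconds_into_day(interval):
        if interval == "daily":
            interval = "0 0 * * *"
        minute, hour = interval.split()[:2]
        return (0 if hour == "*" else int(hour)) * 3600 + (
            0 if minute == "*" else int(minute)
        ) * 60

    return f"{seconds_into_day(downstream) - seconds_into_day(upstream)}s"

assert _schedule_interval_delta_sketch("0 1 * * *", "0 2 * * *") == "3600s"
assert _schedule_interval_delta_sketch("0 2 * * *", "0 0 * * *") == "-7200s"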