def job_to_dag(self, job):
    # type: (dict) -> Union[DAG, None]
    start_date = convert_to_utc(job.get("start_date", None))
    end_date = convert_to_utc(job.get("end_date", None))
    default_args = {
        "owner": job.get("create_user", None),
        "depends_on_past": job.get("depends_on_past", False),
        "start_date": start_date,
        "end_date": end_date,
    }
    job_name = clean_job_name(job["name"])
    dag = DAG(
        "dbnd_launcher__%s" % job_name,
        start_date=start_date,
        default_args=default_args,
        schedule_interval=job.get("schedule_interval", None),
        catchup=job.get("catchup", False),
    )

    # single "launcher" task that runs the scheduled command
    DbndSchedulerOperator(
        scheduled_cmd=job["cmd"],
        scheduled_job_name=job_name,
        scheduled_job_uid=job.get("uid", None),
        shell=config.getboolean("scheduler", "shell_cmd"),
        task_id="launcher",
        dag=dag,
        retries=job.get("retries", self.default_retries) or self.default_retries,
    )
    return dag
def job_to_dag(self, job):
    # type: (dict) -> Union[DAG, None]
    # convert_to_utc usage might be dangerous, as there is a function with
    # the same name in airflow; however, that one uses pendulum, not the
    # one from _vendorized
    default_args = {}
    if job.get("depends_on_past"):
        default_args["depends_on_past"] = job.get("depends_on_past")

    start_date = convert_to_utc(job.get("start_date"))
    if start_date:
        default_args["start_date"] = start_date
    if job.get("end_date"):
        default_args["end_date"] = convert_to_utc(job.get("end_date"))
    if job.get("owner"):
        default_args["owner"] = job.get("owner")

    job_name = clean_job_name(job["name"])
    dag = DAG(
        "%s" % job_name,
        start_date=start_date,
        default_args=default_args,
        schedule_interval=job.get("schedule_interval", None),
        catchup=job.get("catchup", False),
    )

    # allow a custom operator implementation, fall back to the default one
    custom_operator_class = self.custom_operator_class or DbndSchedulerOperator
    custom_operator_class(
        scheduled_cmd=job["cmd"],
        scheduled_job_name=job_name,
        extra_args=job.get("extra_args", None),
        with_name=False,
        scheduled_job_uid=job.get("uid", None),
        shell=config.getboolean("scheduler", "shell_cmd"),
        task_id="launcher",
        dag=dag,
        retries=job.get("retries") or self.default_retries,
    )
    return dag
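# A hedged sketch (class name hypothetical) of the custom-operator hook used
# above: any subclass of DbndSchedulerOperator that accepts the same keyword
# arguments can be assigned to self.custom_operator_class.
class AuditingSchedulerOperator(DbndSchedulerOperator):
    def execute(self, context):
        # e.g. emit an audit record before launching the scheduled command
        return super(AuditingSchedulerOperator, self).execute(context)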
def clean_name_dns1123(value, max_size, postfix=None):
    # type: (str, int, Optional[str]) -> str
    """
    Create a dns-1123 compatible name.

    @param value: the base value to transform
    @param max_size: the maximum length allowed for the output
    @param postfix: optional string to add to the end of the result value
    @return: dns-1123 compatible name
    """
    cleaned_value = clean_job_name(
        value=value,
        enabled_characters="-.",
        placeholder="-",
        max_size=max_size,
        postfix=postfix,
    )
    # remove any non-alphanumeric characters at the beginning and end of the cleaned value
    return strip_by(lambda c: not c.isalnum(), cleaned_value)
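# strip_by is used above but not shown in this section; a minimal sketch of
# the helper (an assumption, not the actual implementation): drop leading and
# trailing characters for which the predicate returns True.
def strip_by(predicate, value):
    # type: (Callable[[str], bool], str) -> str
    start = 0
    end = len(value)
    while start < end and predicate(value[start]):
        start += 1
    while end > start and predicate(value[end - 1]):
        end -= 1
    return value[start:end]


# e.g. strip_by(lambda c: not c.isalnum(), "--my.job--") == "my.job"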
def job_to_dag(self, job):
    # type: (dict) -> Union[DAG, None]
    default_args = {}
    if job.get("depends_on_past"):
        default_args["depends_on_past"] = job.get("depends_on_past")

    start_date = convert_to_utc(job.get("start_date"))
    if start_date:
        default_args["start_date"] = start_date
    if job.get("end_date"):
        default_args["end_date"] = convert_to_utc(job.get("end_date"))
    if job.get("owner"):
        default_args["owner"] = job.get("owner")

    job_name = clean_job_name(job["name"])
    dag = DAG(
        "%s" % job_name,
        start_date=start_date,
        default_args=default_args,
        schedule_interval=job.get("schedule_interval", None),
        catchup=job.get("catchup", False),
    )

    DbndSchedulerOperator(
        task_id="launcher",
        dag=dag,
        retries=job.get("retries") or self.default_retries,
        scheduled_cmd=job["cmd"],
        scheduled_job_name=job_name,
        with_name=False,
        scheduled_job_uid=job.get("uid", None),
        shell=config.getboolean("scheduler", "shell_cmd"),
    )
    return dag
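# A hedged illustration of the `job` dict this method expects, inferred from
# the keys read above; all values here are made up for the example.
example_job = {
    "name": "My Nightly Job",        # cleaned into the DAG id by clean_job_name
    "cmd": "dbnd run my_pipeline",   # command executed by the launcher task
    "uid": "scheduled-job-uid",      # illustrative placeholder
    "start_date": "2021-01-01",
    "schedule_interval": "@daily",
    "catchup": False,
    "retries": 3,
}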
def __init__(
    self,
    task,
    run,
    task_af_id=None,
    try_number=1,
    is_dynamic=None,
    task_engine=None,
):
    # type: (Task, DatabandRun, str, int, bool, EngineConfig) -> None
    # actually this is used as Task uid
    self.task = task  # type: Task
    self.run = run  # type: DatabandRun
    self.task_engine = task_engine
    self.try_number = try_number
    self.is_dynamic = is_dynamic if is_dynamic is not None else task.task_is_dynamic
    self.is_system = task.task_is_system
    self.task_af_id = task_af_id or self.task.task_id

    if task.ctrl.force_task_run_uid:
        self.task_run_uid = tr_uid = task.ctrl.force_task_run_uid
        if isinstance(tr_uid, TaskRunUidGen):
            self.task_run_uid = tr_uid.generate_task_run_uid(
                run=run, task=task, task_af_id=self.task_af_id
            )
    else:
        self.task_run_uid = get_uuid()

    # used by all kinds of submission controllers
    self.job_name = clean_job_name(self.task_af_id).lower()
    self.job_id = self.job_name + "_" + str(self.task_run_uid)[:8]

    # DNS-1123 subdomain name (k8s)
    self.job_id__dns1123 = clean_job_name_dns1123(
        "dbnd.{task_family}.{task_name}".format(
            task_family=self.task.task_meta.task_family,
            task_name=self.task.task_meta.task_name,
        ),
        postfix=".%s" % str(self.task_run_uid)[:8],
    )

    # custom per-task engine, or just use the one from the global env
    dbnd_local_root = (
        self.task_engine.dbnd_local_root or self.run.env.dbnd_local_root
    )
    self.local_task_run_root = (
        dbnd_local_root.folder(run.run_folder_prefix)
        .folder("tasks")
        .folder(self.task.task_id)
    )

    self._attempt_number = 1
    self.task_run_attempt_uid = get_uuid()
    self.attempt_folder = None
    self.meta_files = None
    self.log = None
    self.init_attempt()

    # TODO: inherit from parent task if disabled
    self.is_tracked = task._conf__tracked
    if self.is_tracked and self.run.is_tracked:
        tracking_store = self.run.context.tracking_store
    else:
        tracking_store = ConsoleStore()

    self.tracking_store = tracking_store
    self.tracker = TaskRunTracker(task_run=self, tracking_store=tracking_store)
    self.runner = TaskRunRunner(task_run=self)
    self.deploy = TaskSyncCtrl(task_run=self)
    self.task_tracker_url = self.tracker.task_run_url()
    self.external_resource_urls = dict()
    self.errors = []

    self.is_root = False
    self.is_reused = False
    self.is_skipped = False
    # task can be skipped as it's not required by any other task scheduled to run
    self.is_skipped_as_not_required = False

    self._airflow_context = None
    self._task_run_state = None

    self.start_time = None
    self.finished_time = None
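# A worked illustration (values hypothetical) of the identifiers built above,
# assuming clean_job_name behaves as in the tests below:
#   task_af_id   = "MyTask"
#   task_run_uid = a uuid whose first 8 characters are "1b9b4f96"
#   job_name     = "my_task"            # clean_job_name(...).lower()
#   job_id       = "my_task_1b9b4f96"   # job_name + "_" + uid[:8]
#   job_id__dns1123 starts with "dbnd." and ends with ".1b9b4f96", cleaned to
#   the DNS-1123 subdomain rules (lowercase alphanumerics, "-" and ".", with
#   alphanumeric characters at both ends).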
def test_clean_job_name_3(self):
    assert clean_job_name("AaBb[]1111", placeholder=r"-") == "aa-bb-1111"
def test_clean_job_name_1(self):
    assert clean_job_name("Aa[]1111") == "aa_1111"
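# clean_job_name itself is not shown in this section; a minimal sketch that is
# consistent with the two tests above (an assumption, not the real
# implementation; max_size/postfix handling omitted): lowercase, put the
# placeholder at CamelCase boundaries, and collapse every run of disallowed
# characters into one placeholder.
import re


def clean_job_name_sketch(value, enabled_characters="", placeholder="_"):
    # type: (str, str, str) -> str
    # break CamelCase: "AaBb" -> "Aa<placeholder>Bb"
    value = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", placeholder, value)
    # replace runs of characters that are neither alphanumeric nor enabled
    allowed = re.escape(enabled_characters + placeholder)
    value = re.sub(r"[^A-Za-z0-9%s]+" % allowed, placeholder, value)
    # collapse repeated placeholders and lowercase the result
    value = re.sub(re.escape(placeholder) + r"+", placeholder, value)
    return value.lower()


# matches the tests above:
#   clean_job_name_sketch("Aa[]1111") == "aa_1111"
#   clean_job_name_sketch("AaBb[]1111", placeholder="-") == "aa-bb-1111"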