Example #1
    def _generate_run_name(self,
                           af_context: Optional[AirflowTaskContext]) -> str:
        """
        If this is an Airflow run, generate a name reflecting that it still
        awaits a sync; otherwise generate a human-friendly name for the run.
        """
        if af_context is not None:
            return f"Airflow-run-await-sync_{self.run_uid}"
        return get_random_name(seed=self.run_uid)
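
The same rule can be exercised outside dbnd. The sketch below is a
hypothetical stand-alone version: uuid and random from the stdlib stand in
for dbnd's get_uuid and get_random_name helpers, and any object may play the
role of the Airflow context.

import random
import uuid
from typing import Optional


def generate_run_name(run_uid, af_context=None):
    # type: (uuid.UUID, Optional[object]) -> str
    """Stand-alone mirror of _generate_run_name (helper names are stand-ins)."""
    if af_context is not None:
        # Airflow runs get a placeholder name until they are synced.
        return "Airflow-run-await-sync_{}".format(run_uid)
    # Seed the RNG with the run uid so the "random" name is stable per run.
    rng = random.Random(str(run_uid))
    return "run-{:04d}".format(rng.randrange(10000))


print(generate_run_name(uuid.uuid4()))  # e.g. run-0412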
Example #2
    def run(self):
        """
        Generates bogus data and writes it into the :py:meth:`~.Streams.output` target.
        """
        logger.warning("Hey, this is the Streams task!")

        with self.output().open("w") as output:
            for _ in range(1000):
                output.write("{} {} {}\n".format(
                    random.randint(0, 999),
                    get_random_name(),
                    random.randint(0, 999),
                ))
            log_metric("lines", 1000)
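
run() only needs self.output() to return a target whose open("w") yields a
writable context manager. The FileTarget below is a hypothetical minimal
stand-in, not dbnd's actual Target API, but it is enough to exercise the
method's write loop.

class FileTarget(object):
    """Hypothetical minimal target: just wraps a local path."""

    def __init__(self, path):
        self.path = path

    def open(self, mode="r"):
        # The built-in file object already acts as a context manager.
        return open(self.path, mode)


target = FileTarget("streams_output.txt")
with target.open("w") as output:
    output.write("42 alice 7\n")  # same "<int> <name> <int>" line format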
Example #3
    def __init__(
        self,
        context,  # type: DatabandContext
        job_name,
        run_uid=None,  # type: Optional[UUID]
        scheduled_run_info=None,  # type: Optional[ScheduledRunInfo]
        existing_run=None,
        source=UpdateSource.dbnd,  # type: Optional[UpdateSource]
        af_context=None,
        is_orchestration=False,
    ):
        self.context = context
        s = self.context.settings  # type: DatabandSettings

        self.job_name = job_name

        self.description = s.run.description
        self.is_archived = s.run.is_archived
        self.source = source
        self.is_orchestration = is_orchestration

        self.existing_run = existing_run or False
        # This allows the scheduler to create a run that is then continued by
        # the actual run command, instead of producing two separate runs.
        if not run_uid and DBND_RUN_UID in os.environ:
            # We pop the value so that subprocesses spawned by this run create
            # their own runs, associated via the sub-runs mechanism, instead of
            # being fused into this run directly.
            run_uid = os.environ.pop(DBND_RUN_UID)
        if run_uid:
            self.run_uid = run_uid
            self.existing_run = True
        else:
            self.run_uid = get_uuid()

        # use the user-provided name if given, otherwise
        # generate a human-friendly name for the run
        self.name = s.run.name or get_random_name(seed=self.run_uid)
        self.execution_date = unique_execution_date()

        self.is_tracked = True

        # tracking/orchestration main task
        self.root_task = None  # type: Optional[Task]

        # task run that wraps execution (tracking or orchestration)
        self._driver_task_run = None

        # ORCHESTRATION: execution of the run
        self.run_executor = None  # type: Optional[RunExecutor]

        # dag_id and execution_date are used by Airflow;
        # should be deprecated (still used by DB tracking)
        self.dag_id = AD_HOC_DAG_PREFIX + self.job_name

        # RUN STATE
        self._run_state = None
        self.task_runs = []  # type: List[TaskRun]
        self.task_runs_by_id = {}
        self.task_runs_by_af_id = {}

        self.target_origin = TargetIdentitySourceMap()
        self.describe = RunBanner(self)
        self.tracker = RunTracker(self,
                                  tracking_store=self.context.tracking_store)

        # all run-context-specific state
        self.root_run_info = RootRunInfo.from_env(current_run=self)
        self.scheduled_run_info = scheduled_run_info or ScheduledRunInfo.from_env(
            self.run_uid)
        self.env = self.context.env
        self.run_folder_prefix = os.path.join(
            "log",
            self.execution_date.strftime("%Y-%m-%d"),
            "%s_%s_%s" % (
                self.execution_date.strftime("%Y-%m-%dT%H%M%S.%f"),
                self.job_name,
                self.name,
            ),
        )
        self.run_root = self.env.dbnd_root.folder(self.run_folder_prefix)
        self.run_local_root = self.env.dbnd_local_root.folder(
            self.run_folder_prefix)

        self.local_engine = build_engine_config(
            self.env.local_engine).clone(require_submit=False)

        self.dynamic_af_tasks_count = dict()
        self.af_context = af_context
        self.start_time = None
        self.finished_time = None
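
The DBND_RUN_UID handling above is a pop-then-reuse pattern over os.environ.
A generic sketch of the same idea follows; the environment key and the
resolve_run_uid helper are hypothetical, not dbnd's actual constants or API.

import os
import uuid

RUN_UID_ENV_KEY = "EXAMPLE_RUN_UID"  # assumed key name, for illustration only


def resolve_run_uid(run_uid=None):
    """Return (run_uid, existing_run), consuming any scheduler-provided uid."""
    if not run_uid and RUN_UID_ENV_KEY in os.environ:
        # Pop so that child processes mint their own run uids instead of
        # being fused into this run.
        run_uid = os.environ.pop(RUN_UID_ENV_KEY)
    if run_uid:
        return uuid.UUID(str(run_uid)), True
    return uuid.uuid4(), False


# A scheduler would export the pre-created uid before launching the command:
os.environ[RUN_UID_ENV_KEY] = str(uuid.uuid4())
print(resolve_run_uid())  # (UUID('...'), True)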
Example #4
    def __init__(
        self,
        context,
        task_or_task_name,
        run_uid=None,
        scheduled_run_info=None,
        send_heartbeat=True,
        existing_run=None,
        job_name=None,
    ):
        # type: (DatabandContext, Union[Task, str], Optional[UUID], Optional[ScheduledRunInfo], bool, Optional[bool], Optional[str]) -> None
        self.context = context
        s = self.context.settings  # type: DatabandSettings

        if isinstance(task_or_task_name, six.string_types):
            self.root_task_name = task_or_task_name
            self.root_task = None
        elif isinstance(task_or_task_name, Task):
            self.root_task_name = task_or_task_name.task_name
            self.root_task = task_or_task_name
        else:
            raise TypeError(
                "task_or_task_name must be a Task instance or a task name "
                "string, got %r" % (task_or_task_name,)
            )

        self.job_name = job_name or self.root_task_name

        self.name = s.run.name or get_random_name()
        self.description = s.run.description
        self.is_archived = s.run.is_archived

        # This allows the scheduler to create a run that is then continued by
        # the actual run command, instead of producing two separate runs.
        if not run_uid and DBND_RUN_UID in os.environ:
            # We pop the value so that subprocesses spawned by this run create
            # their own runs, associated via the sub-runs mechanism, instead of
            # being fused into this run directly.
            run_uid = os.environ.pop(DBND_RUN_UID)
        if run_uid:
            self.run_uid = run_uid
            self.existing_run = True
        else:
            self.run_uid = get_uuid()
            self.existing_run = False

        if existing_run is not None:
            self.existing_run = existing_run

        # This lets the scheduler create a run with partial information and
        # have the subprocess running the actual command fill in the details.
        self.resubmit_run = (DBND_RESUBMIT_RUN in os.environ
                             and os.environ.pop(DBND_RESUBMIT_RUN) == "true")

        # AIRFLOW: move into executor
        # dag_id, execution_date and run_id are used by Airflow
        self.dag_id = self.root_task_name
        self.execution_date = unique_execution_date()
        run_id = s.run.id
        if not run_id:
            # We need this name, otherwise Airflow's scheduler will try to
            # manage our local jobs (zombie cleanup and so on).
            run_id = "backfill_{0}_{1}".format(self.name,
                                               self.execution_date.isoformat())
        self.run_id = run_id

        self._template_vars = self._build_template_vars()

        self.is_tracked = True

        self.runtime_errors = []
        self._run_state = None
        self.task_runs = []  # type: List[TaskRun]
        self.task_runs_by_id = {}
        self.task_runs_by_af_id = {}

        self.target_origin = TargetIdentitySourceMap()
        self.describe = DescribeRun(self)
        self.tracker = RunTracker(self,
                                  tracking_store=self.context.tracking_store)

        # all run-context-specific state
        self.root_run_info = RootRunInfo.from_env(current_run=self)
        self.scheduled_run_info = scheduled_run_info or ScheduledRunInfo.from_env(
            self.run_uid)

        # now we can add the driver task
        self.driver_task_run = None  # type: Optional[TaskRun]
        self.root_task_run = None  # type: Optional[TaskRun]

        self.run_folder_prefix = os.path.join(
            "log",
            self.execution_date.strftime("%Y-%m-%d"),
            "%s_%s_%s" % (
                self.execution_date.strftime("%Y-%m-%dT%H%M%S.%f"),
                self.root_task_name,
                self.name,
            ),
        )

        self.run_config = self.context.settings.run  # type: RunConfig
        self.env = env = self.context.env

        self.local_engine = self._get_engine_config(env.local_engine)
        self.remote_engine = self._get_engine_config(env.remote_engine
                                                     or env.local_engine)

        self.submit_driver = (self.run_config.submit_driver
                              if self.run_config.submit_driver is not None else
                              env.submit_driver)
        self.submit_tasks = (self.run_config.submit_tasks
                             if self.run_config.submit_tasks is not None else
                             env.submit_tasks)
        self.task_executor_type, self.parallel = calculate_task_executor_type(
            self.submit_tasks, self.remote_engine, self.context.settings)

        self.sends_heartbeat = send_heartbeat
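
The submit_driver and submit_tasks assignments above are a plain "run config
overrides environment default" fallback. A reduced sketch (the function name
is hypothetical):

def resolve_flag(config_value, env_default):
    """Run-level config wins when explicitly set; otherwise the env default."""
    return config_value if config_value is not None else env_default


# e.g. submit_driver = resolve_flag(run_config.submit_driver, env.submit_driver)
print(resolve_flag(None, True), resolve_flag(False, True))  # True False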