Example #1
0
def project_init(ctx, overwrite, dbnd_home, dbnd_system):
    """Initialize the project structure"""

    from dbnd._core.errors import DatabandSystemError
    from dbnd import databand_lib_path

    # Prevent airflow from parsing its example DAGs while we initialize.
    os.environ["SKIP_DAGS_PARSING"] = "True"  # Exclude airflow dag examples

    conf_folder = databand_lib_path("conf/project_init")

    project_cfg = os.path.join(dbnd_home, "project.cfg")
    if os.path.exists(project_cfg):
        # Project already initialized: refuse unless --overwrite was given.
        if not overwrite:
            raise DatabandSystemError(
                "You are trying to re-initialize your project. You already have dbnd configuration at %s. "
                "You can force project-init by providing --overwrite flag. "
                "If you need to create/update database use `dbnd db init` instead"
                % dbnd_system
            )
        logger.warning(
            "You are re-initializing your project, all files at %s are going to be over written!"
            % dbnd_home
        )

    # Copy the bundled project skeleton into the target directory.
    copy_tree(conf_folder, dbnd_home)
    click.echo("Databand project has been initialized at %s" % dbnd_home)
    return
Example #2
0
    def _describe_task(self, task, describe_format=None, msg=None, color=None):
        """Render a single task as a colored description string.

        :param task: task to describe (its task_id/task_name/ctrl are used)
        :param describe_format: a DescribeFormat value; defaults to
            self.describe_format when not given
        :param msg: optional text appended to the banner title
        :param color: explicit terminal color; when None it is derived from
            the task's completion state
        :return: colored task id (short format) or a visualiser banner
            (long/verbose formats)
        :raises DatabandSystemError: when the format is not supported
        """
        describe_format = describe_format or self.describe_format
        describe_config = self.config  # type: DescribeConfig
        msg = msg or ""

        if color is None:
            # green = complete, cyan = not complete; skip the completeness
            # check entirely when no_checks is configured
            color = "white"
            if not describe_config.no_checks:
                color = "green" if self._get_task_complete(task) else "cyan"

        if describe_format == DescribeFormat.short:
            return colored(str(task.task_id), color)

        if (describe_format == DescribeFormat.long
                or describe_format == DescribeFormat.verbose):
            title = "%s - %s" % (task.task_name, task.task_id)
            if task.task_name != task.get_task_family():
                title += "(%s)" % task.get_task_family()
            if msg:
                title += ": %s" % msg
            return task.ctrl.visualiser.banner(
                title,
                color=color,
                verbose=describe_format == DescribeFormat.verbose)

        # BUG FIX: report the effective format (which may be the per-call
        # override), not self.describe_format which may differ from it
        raise DatabandSystemError("Not supported format mode %s" %
                                  describe_format)
Example #3
0
def project_init(ctx, no_init_db, overwrite, dbnd_home, dbnd_system):
    """Initialize the project structure and local db"""

    from dbnd._core.errors import DatabandSystemError
    from dbnd import databand_lib_path

    os.environ["SKIP_DAGS_PARSING"] = "True"  # Exclude airflow dag examples

    conf_folder = databand_lib_path("conf/project_init")
    # NOTE: removed unused locals `project_name` / `output_dir` — they were
    # computed from dbnd_home but never referenced

    if os.path.exists(os.path.join(dbnd_home, "project.cfg")):
        # Project already initialized: refuse unless --overwrite was given.
        if not overwrite:
            raise DatabandSystemError(
                "You are trying to re-initialize your project. You already have dbnd configuration at %s. "
                "You can force project-init by providing --overwrite flag. "
                "If you need to create/update database use `dbnd db init` instead"
                % dbnd_system)

        logger.warning(
            "You are re-initializing your project, all files at %s are going to be over written!"
            % dbnd_home)

    copy_tree(conf_folder, dbnd_home)
    click.echo("Databand project has been initialized at %s" % dbnd_home)
    # Reload system configs so the freshly copied project.cfg takes effect.
    config.load_system_configs(force=True)
    if no_init_db:
        click.echo("Don't forget to run `dbnd db init` ")
        return

    if is_web_enabled():
        from dbnd_web.cmd_db import init as db_init

        ctx.invoke(db_init)
Example #4
0
    def get_instance(cls):
        """Return the already-set singleton instance, failing loudly otherwise."""
        instance = cls._instance
        if not instance:
            # the singleton must be created via .global_instance/.try_instance
            raise DatabandSystemError(
                "%s is not set, call .global_instance or .try_instance first" %
                cls.__name__)

        return instance
Example #5
0
def as_task(task_or_result):
    """Resolve `task_or_result` to a Task: a Target maps to its producing task."""
    from dbnd import Task

    if isinstance(task_or_result, Target):
        # a target keeps a reference to the task that produced it
        return task_or_result.source_task
    if not isinstance(task_or_result, Task):
        raise DatabandSystemError("Can not extract task from %s" % task_or_result)
    return task_or_result
Example #6
0
    def new_context(cls, *args, **kwargs):  # type: ('T', *Any, **Any)-> 'T'
        """
        Meant to be used as a context manager.

        Installs a (new or supplied) instance as ``cls._instance`` for the
        duration of the ``with`` block, then restores the previous instance
        on exit.

        Keyword-only options (popped from **kwargs before construction):
          allow_override: allow replacing an already-set ``cls._instance``
          _context: a pre-built instance to install instead of constructing
            one via ``cls(*args, **kwargs)``

        Raises DatabandSystemError if an instance already exists and
        allow_override is False, or if the active instance was swapped out
        while the block was running.
        """
        allow_override = kwargs.pop("allow_override", False)
        context = kwargs.pop("_context", None)

        # remember whatever was installed before so it can be restored on exit
        orig_value = cls._instance
        if orig_value is not None:
            if not allow_override:
                raise DatabandSystemError(
                    "You are trying to create new %s out of existing context '%s', "
                    "are you sure you are allowed to do that? " %
                    (cls.__name__, cls._instance))
        if context is None:
            try:
                context = cls(*args, **kwargs)
            except DatabandError:
                # known error type: message is already descriptive, no traceback
                logger.error("Failed to create new context for %s", cls)
                raise
            except Exception:
                # unexpected failure: log the full traceback before re-raising
                logger.exception("Failed to create new context for %s", cls)
                raise

        cls._instance = context
        _track_context(context, "enter")
        try:
            cls._instance._on_enter()
            yield cls._instance
        finally:
            _track_context(context, "exit")
            if cls._instance is not context:
                # somebody replaced the active context while the `with` body
                # was running — refuse to silently restore over it
                msg = ("Something wrong with %s context manager, "
                       "somebody has change context while we were running: "
                       "actual=%s(%s), expected=%s(%s)" % (
                           cls.__name__,
                           cls._instance,
                           id(cls._instance),
                           context,
                           id(context),
                       ))
                logger.warning(msg)
                raise DatabandSystemError(msg)

            cls._instance._on_exit()
            cls._instance = orig_value
Example #7
0
def __set_target(target, target_source):
    """Attach `target_source` to `target` unless a source is already set.

    Falsy targets pass through untouched; non-Target values are rejected.
    """
    if not target:
        return target

    if isinstance(target, Target):
        if not target.source:
            target.source = target_source
        return target

    raise DatabandSystemError("Expected target object, got '%s' : %s" %
                              (type(target), target))
Example #8
0
 def run(self):
     """Dispatch to the proper executor entry point for the current run.

     :return: result of run_driver() or run_submitter()
     :raises DatabandSystemError: for an unknown run executor type
     """
     run_executor = current_task_run().run.run_executor
     run_executor_type = run_executor.run_executor_type
     if run_executor_type == SystemTaskName.driver:
         return run_executor.run_driver()
     elif run_executor_type == SystemTaskName.driver_submit:
         return run_executor.run_submitter()
     else:
         # BUG FIX: args were passed logging-style ("%s", value), so the
         # placeholder was never filled in; format the message explicitly
         raise DatabandSystemError("Unsupported run executor type: %s" %
                                   run_executor_type)
Example #9
0
def get_task_name_safe(task_or_task_name):
    # type: (Union[Task, str]) -> str
    """Return the task name for `task_or_task_name`.

    Strings and None are returned as-is; Task instances yield ``task_name``.

    :raises DatabandSystemError: for any other type
    """
    if task_or_task_name is None or isinstance(task_or_task_name,
                                               six.string_types):
        return task_or_task_name

    if isinstance(task_or_task_name, Task):
        return task_or_task_name.task_name
    # BUG FIX: args were passed logging-style, leaving the %s placeholders
    # unformatted in the raised message
    raise DatabandSystemError(
        "Can't calculate task name from %s - %s"
        % (task_or_task_name, type(task_or_task_name))
    )
Example #10
0
def get_task_name_safe(task_or_task_name):
    """Return the task name for `task_or_task_name`.

    Strings and None are returned as-is; Task instances yield ``task_name``.
    Task is imported lazily to avoid a circular import.

    :raises DatabandSystemError: for any other type
    """
    if task_or_task_name is None or isinstance(task_or_task_name,
                                               six.string_types):
        return task_or_task_name

    from dbnd._core.task.task import Task

    if isinstance(task_or_task_name, Task):
        return task_or_task_name.task_name
    # BUG FIX: args were passed logging-style, leaving the %s placeholders
    # unformatted in the raised message
    raise DatabandSystemError(
        "Can't calculate task name from %s - %s"
        % (task_or_task_name, type(task_or_task_name))
    )
Example #11
0
    def run(self):
        """Dispatch the current run to the proper executor entry point.

        Delegates to run_driver() or run_submitter() depending on the run
        executor type. On any failure, logs a summary banner of all failed
        and cancelled task runs before re-raising the original exception.

        :raises DatabandSystemError: for an unknown run executor type
        """
        executor_task_run = current_task_run()
        run_executor = executor_task_run.run.run_executor
        run_executor_type = run_executor.run_executor_type
        try:
            if run_executor_type == SystemTaskName.driver:
                return run_executor.run_driver()
            elif run_executor_type == SystemTaskName.driver_submit:
                return run_executor.run_submitter()
            else:
                raise DatabandSystemError("Unsupported run executor type: %s" %
                                          run_executor_type)
        except BaseException as ex:
            # we print it on any exception
            logger.warning("Run failure: %s" % ex)
            logger.warning(
                "\n\n\n\n{sep}\n\n   -= Your run has failed, please review errors below =-\n\n{sep}\n"
                .format(sep=console_utils.error_separator()))

            # collect per-task banners grouped by final state
            failed_msgs = []
            canceled_msgs = []
            for task_run in executor_task_run.run.get_task_runs():
                if task_run.task_run_state == TaskRunState.FAILED:
                    failed_msgs.append(
                        task_run.task.ctrl.banner(msg="Task has failed!",
                                                  color="red",
                                                  task_run=task_run))
                elif task_run.task_run_state == TaskRunState.CANCELLED:
                    canceled_msgs.append(
                        task_run.task.ctrl.banner(
                            msg="Task has been canceled!",
                            color="yellow",
                            task_run=task_run,
                        ))

            if canceled_msgs:
                logger.warning(
                    "\nNumber of canceled tasks={count}:\n{banner}\n".format(
                        banner="\n".join(canceled_msgs),
                        count=len(canceled_msgs)))

            if failed_msgs:
                logger.warning(
                    "\nNumber of failed tasks={count}:\n{banner}\n".format(
                        banner="\n".join(failed_msgs), count=len(failed_msgs)))
            # re-raise the original exception after reporting
            raise
Example #12
0
    def _execute(self, session=None):
        """
        Initializes all components required to run a dag for a specified date range and
        calls helper method to execute the tasks.
        """
        # Trigger cleaning
        if self.airflow_config.clean_zombies_during_backfill:
            ClearZombieJob().run()

        ti_status = BackfillJob._DagRunTaskStatus()

        # picklin'
        pickle_id = self.dag.pickle_id
        # We don't need to pickle our dag again as it was already pickled on job creation
        # also this will save it into databand table, that have no use for the airflow
        # if not self.donot_pickle and self.executor.__class__ not in (
        #     executors.LocalExecutor,
        #     executors.SequentialExecutor,
        # ):
        #     pickle_id = airflow_pickle(self.dag, session=session)

        executor = self.executor
        executor.start()

        ti_status.total_runs = 1  # total dag runs in backfill

        dag_run = None
        try:
            dag_run = self._get_dag_run(session=session)
            # BUG FIX: the None check previously ran only after
            # dag_run.execution_date was dereferenced, so a missing dagrun
            # raised AttributeError instead of the intended DatabandSystemError
            if dag_run is None:
                raise DatabandSystemError("Can't build dagrun")

            # Create relation DagRun <> Job
            dag_run.conf = {"job_id": self.id}
            session.merge(dag_run)
            session.commit()

            run_date = dag_run.execution_date

            tis_map = self._task_instances_for_dag_run(dag_run, session=session)

            if not tis_map:
                raise DatabandSystemError("There are no task instances to run!")
            ti_status.active_runs.append(dag_run)
            ti_status.to_run.update(tis_map or {})

            processed_dag_run_dates = self._process_dag_task_instances(
                ti_status=ti_status,
                executor=executor,
                pickle_id=pickle_id,
                session=session,
            )
            ti_status.executed_dag_run_dates.update(processed_dag_run_dates)

            err = self._collect_errors(ti_status=ti_status, session=session)
            if err:
                raise DatabandRunError("Airflow executor has failed to run the run")

            if run_date not in ti_status.executed_dag_run_dates:
                self.log.warning(
                    "Dag %s is not marked as completed!  %s not found in %s",
                    self.dag_id,
                    run_date,
                    ti_status.executed_dag_run_dates,
                )
        finally:
            # in sequential executor a keyboard interrupt would reach here and
            # then executor.end() -> heartbeat() -> sync() will cause the queued commands
            # to be run again before exiting
            if hasattr(executor, "commands_to_run"):
                executor.commands_to_run = []
            try:
                executor.end()
            except Exception:
                logger.exception("Failed to terminate executor")
            if dag_run and dag_run.state == State.RUNNING:
                _kill_dag_run_zombi(dag_run, session)
            session.commit()

        self.log.info("Run is completed. Exiting.")
Example #13
0
    def __init__(self,
                 run,
                 root_task_or_task_name,
                 send_heartbeat,
                 force_task_name=None):
        """Build the executor state for `run`.

        :param run: DatabandRun this executor drives
        :param root_task_or_task_name: a ready Task instance, or the name of
            the root task to build later inside the driver
        :param send_heartbeat: whether heartbeat reporting is enabled
        :param force_task_name: optional override for the built task's name
        :raises DatabandSystemError: when root_task_or_task_name is None or
            of an unsupported type
        """
        self.run = run  # type: DatabandRun
        self.send_heartbeat = send_heartbeat

        if root_task_or_task_name is None:
            raise DatabandSystemError(
                "Run executor requires task name or task, got None")

        # we are building it only in driver,
        # root task sometimes can be executed only inside the docker
        # it can be affected by docker env (k8s secrets/external source)
        self.root_task_name_to_build = None
        self.force_task_name = force_task_name
        if isinstance(root_task_or_task_name, six.string_types):
            self.root_task_name_to_build = root_task_or_task_name
        elif isinstance(root_task_or_task_name, Task):
            # we have a ready task, we will not build it, just run
            self.run.root_task = root_task_or_task_name
        else:
            # BUG FIX: args were passed logging-style, leaving the %s
            # placeholders unformatted in the raised message
            raise DatabandSystemError(
                "Run executor requires task name or task, got %s - %s"
                % (root_task_or_task_name, type(root_task_or_task_name))
            )

        env = self.run.env
        self.run_config = self.run.context.settings.run  # type: RunConfig

        self.driver_dump = run.run_root.file("run.pickle")

        self.local_engine = build_engine_config(env.local_engine)
        self.remote_engine = build_engine_config(env.remote_engine
                                                 or env.local_engine)

        # we take values from run config (if defined) - otherwise from specific env definition
        # usually run_config will contain "negative" override
        # values at env_config are based on env.remote_config ( try to submit_driver/submit_tasks if defined)
        self.submit_driver = (self.run_config.submit_driver
                              if self.run_config.submit_driver is not None else
                              env.submit_driver)
        self.submit_tasks = (self.run_config.submit_tasks
                             if self.run_config.submit_tasks is not None else
                             env.submit_tasks)
        self.task_executor_type, self.parallel = calculate_task_executor_type(
            self.submit_tasks, self.remote_engine, run.context.settings)

        run = self.run
        if self.submit_driver and not run.existing_run:
            # we are running submitter, that will send driver to remote
            self.run_executor_type = SystemTaskName.driver_submit
            self.host_engine = self.local_engine
        else:
            # We are running Driver ( submitter already sent this , or no submitter at all)
            self.run_executor_type = SystemTaskName.driver
            if self.submit_driver:
                # submit drive is true, but we are in existing run:
                # we are after the jump from submit to driver execution (to remote engine)
                self.host_engine = self.remote_engine
            else:
                self.host_engine = self.local_engine
            if not self.submit_tasks or self.task_executor_type == "airflow_kubernetes":
                # if we are not in submit tasks, we disable engine "resubmit"
                # airflow kubernetes - we don't want task resubmition, even if engine is k8s
                self.remote_engine = self.remote_engine.clone(
                    require_submit=False)
        # we are running at this engine already
        self.host_engine = self.host_engine.clone(require_submit=False)

        # dag_id , execution_date are used by Airflow,
        # should be moved to this class (still used by DB tracking)
        # run.dag_id = AD_HOC_DAG_PREFIX + run.job_name

        run_executor__task = _RunExecutor_Task(
            task_name=self.run_executor_type, task_version=run.run_uid)
        if self.run.root_task:
            # if root_task == None, we will create it in the context of driver task
            # otherwise, we need it to add manually
            run_executor__task.descendants.add_child(run.root_task.task_id)

        run.build_and_set_driver_task_run(run_executor__task,
                                          driver_engine=self.host_engine)

        self._result_location = None
        self.runtime_errors = []