Example #1
    def _calculate_task_defaults(self, defaults):
        # type: (...) -> Dict[str, Any]
        base_defaults = dict()
        for base_schema in self.base_task_definitions:
            base_defaults = combine_mappings(base_defaults,
                                             base_schema.defaults)

        return combine_mappings(base_defaults, defaults)
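
For context, a minimal sketch of the combine_mappings helper all of these examples lean on (an assumption based on how it is called here, not the library's actual implementation): either side may be None or empty, and values from the right-hand mapping win on key conflicts, which is why base defaults are always passed on the left.

    def combine_mappings(left, right):
        # hypothetical stand-in for dbnd's helper: right-hand values win
        combined = dict(left or {})
        combined.update(right or {})
        return combined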
Example #2
    def _calculate_task_class_values(self, classdict):
        # reflect inherited attributes
        params, base_defaults = self._discover_base_attributes()

        # update params with new class attributes
        self._update_params_from_attributes(classdict, params)

        # this is where we add parameters from the function definition
        if self.task_class._conf__decorator_spec is not None:
            func_params_builder = FuncParamsBuilder(
                base_params=params,
                decorator_spec=self.task_class._conf__decorator_spec)
            func_classdict = func_params_builder.get_task_parameters()

            self._update_params_from_attributes(func_classdict, params)

        defaults = combine_mappings(base_defaults,
                                    classdict.get("defaults", None))

        # add parameters config
        params = {
            name: param.evolve_with_owner(task_cls=self.task_class, name=name)
            for name, param in six.iteritems(params)
        }

        params = _ordered_params(params)
        return params, defaults
Example #3
    def _calculate_task_class_values(self, classdict):
        # reflect inherited attributes
        params, base_defaults = self._discover_base_attributes()

        defaults = combine_mappings(base_defaults,
                                    classdict.get("defaults", None))

        self._update_and_evolve_owner(classdict, params)

        return params, defaults
Example #4
    def _discover_base_attributes(self):
        # type: () -> Tuple[Dict[str, ParameterDefinition], Dict[str, Any]]
        params = dict()
        defaults = dict()
        # we process only "direct" inheritance
        # params will contain the definition of a param, even if it was overridden by the parent task
        for c in reversed(self.task_class.__bases__):  # type: TaskMetaclass
            if not hasattr(c, "task_definition"):
                logger.debug(
                    "you should inherit from Task objects only: %s -> %s ",
                    self.task_class,
                    c,
                )
                continue
            base_schema = c.task_definition  # type: TaskDefinition
            defaults = combine_mappings(defaults, base_schema.defaults)
            params = combine_mappings(params, c.task_definition.task_params)

        return params, defaults
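
A short illustration of why the loop above walks __bases__ in reverse (hypothetical classes; a plain dict.update stands in for the assumed right-wins combine_mappings): values combined later override earlier ones, so reversing the bases lets the leftmost base win, matching Python's left-to-right attribute resolution.

    class A(object):
        defaults = {"retries": 1}

    class B(object):
        defaults = {"retries": 5, "timeout": 30}

    class C(A, B):
        pass

    merged = {}
    for base in reversed(C.__bases__):  # yields B, then A
        merged.update(base.defaults)
    print(merged)  # {'retries': 1, 'timeout': 30} - the leftmost base wins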
Example #5
    def _pd_to(self, value, *args, **kwargs):
        kwargs = combine_mappings(
            {
                "key": "data",
                "format": _get_supported_hd5_storage_format(value),
                "mode": "w",
                "complib": "zlib",
            },
            kwargs,
        )
        value.to_hdf(*args, **kwargs)
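
A hypothetical call showing the effect of the merge above: caller kwargs take precedence over the built-in defaults, while untouched defaults survive.

    marshaller._pd_to(df, "/tmp/data.h5", complib="bzip2")
    # equivalent to:
    # df.to_hdf("/tmp/data.h5", key="data", format=..., mode="w", complib="bzip2")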
Example #6
    def _calculate_task_class_values(self, classdict, external_parameters):
        # type: (Optional[Dict],  Optional[Parameters]) -> Dict[str, ParameterDefinition]
        # reflect inherited attributes
        params = dict()
        # params will contain the definition of a param, even if it was overridden by the parent task
        for base_schema in self.base_task_definitions:
            params = combine_mappings(params, base_schema.task_param_defs)

        # update params with new class attributes
        self._update_params_from_attributes(classdict, params)

        # this is where we add parameters from the function definition
        if self.task_decorator is not None:
            func_params_builder = DecoratedCallableParamBuilder(
                base_params=params, task_decorator=self.task_decorator)

            func_params_builder.build_func_params()
            params_dict = dict(func_params_builder.decorator_kwargs_params)
            params_dict.update(func_params_builder.func_spec_params)
            params_dict.update(func_params_builder.result_params)

            self._update_params_from_attributes(params_dict, params)

        if external_parameters:
            params.update({
                param.name: param
                for param in external_parameters.get_params()
            })

        updated_params = {}
        for name, param in six.iteritems(params):
            # add parameters config
            param_with_owner = param.evolve_with_owner(task_definition=self,
                                                       name=name)

            # update the owner in the external parameters
            param_value = external_parameters and external_parameters.get_param_value(
                name)
            if param_value:
                param_value.parameter = param_with_owner

            updated_params[name] = param_with_owner

        params = _ordered_params(updated_params)
        return params
Example #7
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks use `dbnd show-tasks` (tab completion is available).
    """

    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # we filter out falsy flags, since otherwise they would always override the config with their false values
    main_switches = dict(
        databand=filter_dict_remove_false_values(
            dict(
                verbose=verbose > 0,
                describe=describe,
                env=env,
                conf_file=conf_file,
                project_name=project_name,
            )
        ),
        run=filter_dict_remove_false_values(
            dict(
                name=name,
                parallel=parallel,
                description=description,
                is_archived=describe,
            )
        ),
    )

    if submit_driver is not None:
        main_switches["run"]["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        main_switches["run"]["submit_tasks"] = bool(submit_tasks)
    if disable_web_tracker:
        main_switches.setdefault("core", {})["tracker_api"] = "disabled"

    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values and assume they are for the main task;
    # treat them as root task config, as if they were passed via --set-root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]
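    # illustrative (hypothetical values): "--set retries=3" has no dot, so it
    # moves into root_task_config for the main task, while a scoped value such
    # as "--set mytask.retries=3" stays in _sets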

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # double checking on bootstrap, as we can run from all kinds of locations
    # usually we should be bootstrapped already, as we run from the cli.
    dbnd_bootstrap()
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()

        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

        # modules are loaded, we can load the task
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # now we can get its config; since it's not the main task, we can load its config after the configuration is loaded
        if task_cls is not None:
            if root_task_config:
                # adding root task to configuration
                config.set_values(
                    {task_cls.task_definition.task_config_section: root_task_config},
                    source="--set-root",
                )

        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )
Example #8
def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """

    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # we need to do this before we look up the task cls
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # we filter out falsy flags, since otherwise they would always override the config with their false values
    main_switches = dict(
        databand=dict(
            verbose=verbose > 0,
            print_task_band=print_task_band,
            describe=describe,
            env=env,
            conf_file=conf_file,
            project=project,
        ),
        run=dict(
            name=name,
            parallel=parallel,
            interactive=interactive,
            description=description,
            is_archived=describe,
            open_web_tracker_in_browser=open_web_tab,
            submit_driver=_nullable_flag(submit_driver),
            submit_tasks=_nullable_flag(submit_tasks),
        ),
        kubernetes=dict(docker_build_tag=docker_build_tag),
        task=dict(task_version=task_version),
        task_build=dict(verbose=True if verbose > 1 else None),
        core=dict(tracker_api="disabled" if disable_web_tracker else None),
    )

    main_switches = cleanup_empty_switches(main_switches)

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values and assume they are for the main task;
    # treat them as root task config, as if they were passed via --set-root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    cmd_line_config = parse_and_build_config_store(source="cli",
                                                   config_values=main_switches)
    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(
            _parse_cli(_extend, source="--extend-config", extend=True))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            ))

    # --set-root
    if root_task_config:
        task_cls = task_registry.get_task_cls(task_name)
        task_section = task_cls.task_definition.task_config_section
        # adding root task to configuration
        cmd_line_config.update(
            parse_and_build_config_store(
                config_values={task_section: root_task_config},
                source="--set-root"))

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info("CLI config: \n%s",
                        pformat_config_store_as_table(cmd_line_config))
        config.set_values(cmd_line_config, source="cmdline")

    # double checking on bootstrap, as we can run from all kinds of locations
    # usually we should be bootstrapped already, as we run from the cli.
    dbnd_bootstrap()

    # initialize basic logging (until we get to the context logging)
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name,
            scheduled_date=scheduled_date)

    # update completer
    if config.getboolean("databand", "completer"):
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

    # bootstrap and modules are loaded, we can load the task
    task_cls = None
    if task_name:
        task_cls = task_registry.get_task_cls(task_name)

    if not task_name:
        print_help(ctx, None)
        return

    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
            name="run") as context:  # type: DatabandContext
        if context.settings.system.describe:
            # we want to print the describe output without triggering a real run
            logger.info("Building main task '%s'", task_name)
            root_task = get_task_registry().build_dbnd_task(task_name)
            root_task.ctrl.describe_dag.describe_dag()
            # currently there is a bug with the click version we have when using python 2,
            # so we don't use the click.echo function
            # https://github.com/pallets/click/issues/564
            print("Task %s has been described!" % task_name)
            return root_task
        return context.dbnd_run_task(
            task_or_task_name=task_name,
            force_task_name=alternative_task_name,
            job_name=job_name or alternative_task_name or task_name,
            run_uid=run_driver or override_run_uid,
            existing_run=run_driver is not None,
            scheduled_run_info=scheduled_run_info,
            project=project,
        )
Example #9
    def build_pod(
        self,
        task_run,
        cmds,
        args=None,
        labels=None,
        try_number=None,
        include_system_secrets=False,
    ):
        # type: (TaskRun, List[str], Optional[List[str]], Optional[Dict[str, str]], Optional[int], bool) -> Pod
        pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)

        image = self.full_image
        labels = combine_mappings(labels, self.labels)
        labels["dbnd_run_uid"] = clean_job_name_dns1123(
            str(task_run.run.run_uid))
        labels["dbnd_task_run_uid"] = clean_job_name_dns1123(
            str(task_run.task_run_uid))
        # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)
        labels["dbnd"] = "task_run"

        annotations = self.annotations.copy()
        if self.gcp_service_account_keys:
            annotations[
                "iam.cloud.google.com/service-account"] = self.gcp_service_account_keys
        annotations["dbnd_tracker"] = task_run.task_tracker_url

        from dbnd_docker.kubernetes.dbnd_extended_resources import DbndExtendedResources

        resources = DbndExtendedResources(
            requests=self.requests,
            limits=self.limits,
            request_memory=self.request_memory,
            request_cpu=self.request_cpu,
            limit_memory=self.limit_memory,
            limit_cpu=self.limit_cpu,
        )
        env_vars = {
            ENV_DBND_POD_NAME: pod_name,
            ENV_DBND_POD_NAMESPACE: self.namespace,
            ENV_DBND_USER: task_run.task_run_env.user,
            ENV_DBND__ENV_IMAGE: image,
            ENV_DBND_ENV: task_run.run.env.task_name,
            ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
        }
        if self.auto_remove:
            env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
        env_vars[self._params.get_param_env_key("in_cluster")] = "True"
        env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
        env_vars[
            "DBND__RUN_INFO__SOURCE_VERSION"] = task_run.run.context.task_run_env.user_code_version

        # we want all subsequent runs to be able to use the image from our configuration

        env_vars.update(
            self._params.to_env_map("container_repository", "container_tag"))

        env_vars.update(self.env_vars)
        env_vars.update(task_run.run.get_context_spawn_env())

        secrets = self.get_secrets(
            include_system_secrets=include_system_secrets)

        from airflow.contrib.kubernetes.pod import Pod

        if self.trap_exit_file_flag:
            args = [
                textwrap.dedent("""
                trap "touch {trap_file}" EXIT
                {command}
                """.format(
                    trap_file=self.trap_exit_file_flag,
                    command=subprocess.list2cmdline(cmds),
                ))
            ]
            # we update cmd now
            cmds = ["/bin/bash", "-c"]

        if not self.container_tag:
            raise DatabandConfigError(
                "Your container tag is None, please check your configuration",
                help_msg="Container tag should be assigned",
            )

        pod = Pod(
            namespace=self.namespace,
            name=pod_name,
            envs=env_vars,
            image=image,
            cmds=cmds,
            args=args,
            labels=labels,
            image_pull_policy=self.image_pull_policy,
            image_pull_secrets=self.image_pull_secrets,
            secrets=secrets,
            service_account_name=self.service_account_name,
            volumes=self.volumes,
            volume_mounts=self.volume_mounts,
            annotations=annotations,
            node_selectors=self.node_selectors,
            affinity=self.affinity,
            tolerations=self.tolerations,
            security_context=self.security_context,
            configmaps=self.configmaps,
            hostnetwork=self.hostnetwork,
            resources=resources,
        )

        if self.pod_yaml:
            pod.pod_yaml = target(self.pod_yaml).read()

        return pod
Example #10
    def _pd_to(self, data, file_or_path, *args, **kwargs):
        kwargs = combine_mappings({"format": "fixed"}, kwargs)
        with pd.HDFStore(file_or_path, "w") as store:
            kwargs.pop("mode", None)
            store.put("features", data.features, data_columns=True, **kwargs)
            store.put("targets", data.targets, data_columns=True, **kwargs)
    def build_pod(
        self,
        task_run: TaskRun,
        cmds: List[str],
        args: Optional[List[str]] = None,
        labels: Optional[Dict[str, str]] = None,
        try_number: Optional[int] = None,
        include_system_secrets: bool = False,
    ) -> k8s.V1Pod:
        if not self.container_tag:
            raise DatabandConfigError(
                "Your container tag is None, please check your configuration",
                help_msg="Container tag should be assigned",
            )

        pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)

        image = self.full_image
        labels = combine_mappings(labels, self.labels)
        labels["pod_name"] = pod_name

        labels["dbnd_run_uid"] = task_run.run.run_uid
        labels["dbnd_task_run_uid"] = task_run.task_run_uid
        labels["dbnd_task_run_attempt_uid"] = task_run.task_run_attempt_uid
        labels[
            "dbnd_task_family"] = task_run.task.task_definition.full_task_family_short
        labels["dbnd_task_name"] = task_run.task.task_name
        labels["dbnd_task_af_id"] = task_run.task_af_id

        # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)
        if task_run.task.task_is_system:
            labels["dbnd"] = "dbnd_system_task_run"
        else:
            labels["dbnd"] = "task_run"

        # we need to be sure that the values meet the dns label names RFC
        # https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-label-names
        labels = {
            label_name: clean_label_name_dns1123(str(label_value))
            for label_name, label_value in six.iteritems(labels)
        }
        if is_verbose():
            logger.info("Build pod with kubernetes labels {}".format(labels))

        annotations = self.annotations.copy()
        if self.gcp_service_account_keys:
            annotations[
                "iam.cloud.google.com/service-account"] = self.gcp_service_account_keys
        annotations["dbnd_tracker"] = task_run.task_tracker_url

        from dbnd_docker.kubernetes.vendorized_airflow.dbnd_extended_resources import (
            DbndExtendedResources, )

        resources = DbndExtendedResources(
            requests=self.requests,
            limits=self.limits,
            request_memory=self.request_memory,
            request_cpu=self.request_cpu,
            limit_memory=self.limit_memory,
            limit_cpu=self.limit_cpu,
        )
        env_vars = {
            ENV_DBND_POD_NAME: pod_name,
            ENV_DBND_POD_NAMESPACE: self.namespace,
            ENV_DBND_USER: task_run.task_run_env.user,
            ENV_DBND__ENV_IMAGE: image,
            ENV_DBND_ENV: task_run.run.env.task_name,
            ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
        }

        if AIRFLOW_VERSION_2:
            env_vars[
                "AIRFLOW__CORE__TASK_RUNNER"] = "dbnd_airflow.compat.dbnd_task_runner.DbndStandardTaskRunner"

        if self.auto_remove:
            env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
        env_vars[self._params.get_param_env_key(self, "in_cluster")] = "True"
        env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
        env_vars[
            "DBND__RUN_INFO__SOURCE_VERSION"] = task_run.run.context.task_run_env.user_code_version
        env_vars["AIRFLOW__KUBERNETES__DAGS_IN_IMAGE"] = "True"
        if not get_dbnd_project_config().is_tracking_mode():
            env_vars[ENV_DBND__TRACKING] = "False"
        # we want all subsequent runs to be able to use the image from our configuration

        env_vars.update(
            self._params.to_env_map(self, "container_repository",
                                    "container_tag"))

        env_vars.update(self.env_vars)
        env_vars.update(task_run.run.get_context_spawn_env())

        secrets = self.get_secrets(
            include_system_secrets=include_system_secrets)

        if self.trap_exit_file_flag:
            args = [
                textwrap.dedent("""
                trap "touch {trap_file}" EXIT
                {command}
                """.format(
                    trap_file=self.trap_exit_file_flag,
                    command=subprocess.list2cmdline(cmds),
                ))
            ]
            # we update cmd now
            cmds = ["/bin/bash", "-c"]

        if self.debug_with_command:
            logger.warning(
                "%s replacing pod %s command with '%s', original command=`%s`",
                task_run,
                pod_name,
                self.debug_with_command,
                subprocess.list2cmdline(cmds),
            )
            cmds = shlex.split(self.debug_with_command)

        base_pod = self._build_base_pod()

        pod = self._to_real_pod(
            cmds=cmds,
            args=args,
            namespace=self.namespace,
            name=pod_name,
            envs=env_vars,
            image=image,
            labels=labels,
            secrets=secrets,
            resources=resources,
            annotations=annotations,
        )

        final_pod = reconcile_pods(base_pod, pod)

        return final_pod
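
Both build_pod variants pass a possibly-None labels argument straight into the merge, which suggests (again an assumption, consistent with the sketch after Example #1) that combine_mappings tolerates a None side:

    labels = combine_mappings(None, {"dbnd": "task_run"})
    # -> {"dbnd": "task_run"}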