コード例 #1
0
def read_from_config_file(config_file_target):
    """
    Read config from config file (.ini, .cfg)

    When the target lives on the local file system and its path goes through
    a ``.zip`` archive (can happen if we run in fat wheel), the file is read
    straight out of that archive.

    :param config_file_target: target object of the config file; its ``fs``,
        ``path``, ``exists()`` and ``open()`` members are used here.
    :return: the result of ``read_from_config_stream`` for the file content.
    :raises DatabandConfigError: if the file (or the archive member) is missing.
    """

    # Check if config files are inside zip first - can happen if we run in fat wheel
    if (isinstance(config_file_target.fs, LocalFileSystem)
            and ".zip/" in config_file_target.path):
        # Split on the first ".zip/" only: a member path that itself contains
        # ".zip/" must not break the two-way unpacking.
        zip_file_path, config_path_inside_zip = config_file_target.path.split(
            ".zip/", 1)
        zip_file_path += ".zip"
        if os.path.exists(zip_file_path):
            import zipfile

            # Open the archive once and make sure the handle is released,
            # whether we return or raise.
            with zipfile.ZipFile(zip_file_path, "r") as archive:
                if config_path_inside_zip not in archive.namelist():
                    raise DatabandConfigError(
                        "Failed to read configuration file at %s, file not found!"
                        % config_file_target)
                with archive.open(config_path_inside_zip) as file_io:
                    return read_from_config_stream(
                        Text.pipe_reader(file_io), str(config_file_target))

    if not config_file_target.exists():
        raise DatabandConfigError(
            "Failed to read configuration file at %s, file not found!" %
            config_file_target)
    with config_file_target.open("r") as fp:
        return read_from_config_stream(fp, str(config_file_target))
コード例 #2
0
ファイル: spark.py プロジェクト: kalebinn/dbnd
    def _task_submit(self):
        """
        Submit this task through the spark controller.

        If ``self._use_spark_context_inplace()`` is true the task is run
        directly via ``self._task_run()``. Otherwise the driver dump of the
        current run is synced through the spark controller and passed as
        ``--dbnd-run`` to the databand CLI script executed by ``run_pyspark``.

        :raises DatabandConfigError: when the run executor has no driver dump.
        """
        spark_ctrl = self._get_spark_ctrl()

        if self._use_spark_context_inplace():
            logger.info("Reusing existing spark session in inline task "
                        "due to spark_local.enable_spark_context_inplace")
            return self._task_run()

        driver_dump = self.current_task_run.run.run_executor.driver_dump
        if not driver_dump:
            raise DatabandConfigError(
                "Please configure your cloud to always_save_pipeline=True, we need to pickle pipeline first"
            )

        # options of the "execute" command, followed by the "task" sub command
        execute_options = [
            "--expected-dbnd-version",
            get_dbnd_version(),
            "--expected-python-version",
            get_python_version(),
            "--dbnd-run",
            spark_ctrl.sync(driver_dump),
        ]
        task_command = ["task", "--task-id", self.task_id]
        self._application_args = ["execute"] + execute_options + task_command

        if spark_ctrl.config.disable_tracking_api:
            # the flag goes right after "execute"
            self._application_args.insert(1, "--disable-tracking-api")

        cli_script = databand_lib_path("_core", "cli", "main.py")
        return spark_ctrl.run_pyspark(pyspark_script=cli_script)
コード例 #3
0
ファイル: _plugin.py プロジェクト: turbaszek/dbnd
def dbnd_on_pre_init_context(ctx):
    """
    Point MLflow tracking at a composite (databand + secondary) URI.

    Does nothing unless ``mlflow_tracking.databand_tracking`` is enabled and
    ``core.databand_url`` is set. The secondary URI comes from
    ``mlflow_tracking.duplicate_tracking_to`` or, when that is empty, from the
    current MLflow tracking URI. The tracking URI that was active before the
    switch is stored in the module global ``_original_mlflow_tracking_uri``.

    :param ctx: not used by this function.
    :raises DatabandConfigError: when the current MLflow tracking URI is
        already a composite URI.
    """
    from mlflow import get_tracking_uri, set_tracking_uri

    global _original_mlflow_tracking_uri

    tracking_enabled = config.getboolean("mlflow_tracking", "databand_tracking")
    if not tracking_enabled:
        return

    databand_url = config.get("core", "databand_url")
    if not databand_url:
        logger.info(
            "Although 'databand_tracking' was set in 'mlflow_tracking', "
            "dbnd will not use it since 'core.databand_url' was not set."
        )
        return

    secondary_uri = config.get("mlflow_tracking", "duplicate_tracking_to")
    if not secondary_uri:
        secondary_uri = get_tracking_uri()
        # check if dbnd store uri was already defined with MLFlow config
        if is_composite_uri(secondary_uri):
            raise DatabandConfigError(
                "Config conflict: MLFlow and DBND configs both define dbnd store uri"
            )

    composite_uri = build_composite_uri(databand_url, secondary_uri)

    # remember the current uri before replacing it
    _original_mlflow_tracking_uri = get_tracking_uri()
    set_tracking_uri(composite_uri)
コード例 #4
0
    def _task_submit(self):
        """
        Run the configured jar with ``self.main_class`` through the job controller.

        :return: the result of ``run_cmd_java``.
        :raises DatabandConfigError: when ``self.beam.main_jar`` is not set.
        """
        if self.beam.main_jar:
            job_ctrl = self._get_job_ctrl()
            # NOTE(review): the guard checks ``beam.main_jar`` while ``beam.jar``
            # is what gets submitted - confirm this is intended.
            return job_ctrl.run_cmd_java(jar=self.beam.jar,
                                         main_class=self.main_class)

        raise DatabandConfigError("main_jar is not configured for %s" % self)
コード例 #5
0
 def find_and_raise_missing_inputs(self):
     """
     Report the non-completed entries of ``self.relations.task_outputs_user``.

     This method never returns.

     :raises DatabandConfigError: always, with the non-completed outputs and
         the task id in the message.
     """
     not_completed = find_non_completed(self.relations.task_outputs_user)
     details = (non_completed_outputs_to_str(not_completed),
                self.task.task_id)
     message = ("You are missing some input tasks in your pipeline! \n\t%s\n"
                "The task execution was disabled for '%s'.")
     raise DatabandConfigError(message % details)
コード例 #6
0
    def __init__(self,
                 endpoint,
                 headers,
                 retry_policy,
                 ignore_ssl_errors=False):
        """
        Store the connection settings and prepare the auth object.

        :param endpoint: endpoint description; ``auth``, ``username`` and
            ``password`` are read from it.
        :param headers: stored as ``self._headers``.
        :param retry_policy: stored as ``self._retry_policy``.
        :param ignore_ssl_errors: when true, ``self.verify_ssl`` is False and
            urllib3 warnings are disabled.
        :raises DatabandConfigError: for an auth kind other than Kerberos,
            basic or no auth.
        """
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy

        # Always define the attribute: previously a NO_AUTH endpoint left the
        # instance without ``_auth`` at all.
        self._auth = None
        if self._endpoint.auth == constants.AUTH_KERBEROS:
            # imported lazily so the kerberos package is only needed when used
            from requests_kerberos import HTTPKerberosAuth, REQUIRED

            self._auth = HTTPKerberosAuth(mutual_authentication=REQUIRED)
        elif self._endpoint.auth == constants.AUTH_BASIC:
            self._auth = (self._endpoint.username, self._endpoint.password)
        elif self._endpoint.auth != constants.NO_AUTH:
            raise DatabandConfigError(u"Unsupported auth %s" %
                                      self._endpoint.auth)

        self.logger = logger

        self.verify_ssl = not ignore_ssl_errors
        if not self.verify_ssl:
            self.logger.debug(
                u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
            )
            requests.packages.urllib3.disable_warnings()
コード例 #7
0
 def get_monitor_configuration(self) -> BaseServerConfig:
     """
     Build the monitor configuration from the first fetched source config.

     :return: ``self.server_monitor_config.create(...)`` applied to the first
         entry returned by ``self._fetch_source_monitor_config()``.
     :raises DatabandConfigError: when nothing was fetched.
     """
     source_configs = self._fetch_source_monitor_config()
     if source_configs:
         create_config = self.server_monitor_config.create
         return create_config(source_configs[0])

     raise DatabandConfigError(
         f"Missing configuration for tracking source: {self.tracking_source_uid}"
     )
コード例 #8
0
def task_not_found_in_pipeline(task, tasks, task_regex):
    """
    Build (not raise) the error for a task selection that matched nothing.

    :param task: not used by this function.
    :param tasks: objects with a ``task_id``; listed in the help message.
    :param task_regex: the selection that matched nothing.
    :return: a ``DatabandConfigError`` instance.
    """
    task_ids = [candidate.task_id for candidate in tasks]
    available = ",".join(task_ids)
    message = ("None of '%s' tasks have been found at current pipeline!" %
               task_regex)
    hint = ("check your --run-task switch, select one of following tasks: %s"
            % available)
    return DatabandConfigError(message, help_msg=hint, show_exc_info=False)
コード例 #9
0
def kubernetes_image_not_found(image_name, message):
    """
    Build (not raise) the error for a pod whose image could not be pulled.

    :param image_name: the configured image name, shown in the message.
    :param message: the failure details, appended to the message.
    :return: a ``DatabandConfigError`` instance with a help message.
    """
    error_text = (
        "Failed to start Kubernetes pod because the configured image (%s) could not be pulled by Kubernetes: %s"
        % (image_name, message))
    hint = (
        "Make sure you built and pushed your image. If the image is in a private repository make sure you "
        "configured image pull secrets for it in the Kubernetes cluster and configured image_pull_secrets in the Kubernetes engine config."
    )
    return DatabandConfigError(error_text, help_msg=hint)
コード例 #10
0
def no_matching_tasks_in_pipeline(tasks, tasks_regexes):
    """
    Build (not raise) the error for task regexes that matched nothing.

    :param tasks: objects with a ``task_id``; listed in the help message.
    :param tasks_regexes: the selection that matched nothing.
    :return: a ``DatabandConfigError`` instance.
    """
    all_tasks_names = ",".join(t.task_id for t in tasks)
    return DatabandConfigError(
        # Wrap in a 1-tuple: with a bare ``%`` a tuple of regexes would be
        # unpacked as format arguments (TypeError for more than one item).
        "None of '%s' tasks have been found at current pipeline!" %
        (tasks_regexes, ),
        help_msg="check your run.selected_tasks_regex switch, "
        "select one of following tasks: %s" % all_tasks_names,
        show_exc_info=False,
    )
コード例 #11
0
ファイル: __init__.py プロジェクト: databand-ai/dbnd
def get_data_fetcher(server_config: AirflowServerConfig) -> AirflowDataFetcher:
    """
    Create the decorated data fetcher registered for ``server_config.fetcher_type``.

    :param server_config: its ``fetcher_type`` selects the entry of
        ``FETCHERS``; ``base_url`` is passed to ``decorate_fetcher``.
    :raises DatabandConfigError: when no (truthy) fetcher is registered for
        the type.
    """
    fetcher_type = server_config.fetcher_type
    fetcher_factory = FETCHERS.get(fetcher_type)

    if not fetcher_factory:
        supported_types = "/".join(FETCHERS.keys())
        raise DatabandConfigError(
            "Unsupported fetcher_type: {}, use one of the following: {}".
            format(fetcher_type, supported_types),
            help_msg="Please specify correct fetcher type",
        )

    return decorate_fetcher(fetcher_factory(server_config),
                            server_config.base_url)
コード例 #12
0
def airflow_bad_user_configuration(ex, file_path):
    """
    Build (not raise) the error for a failed load of extra airflow configuration.

    :param ex: the original failure, passed on as ``nested_exceptions``.
    :param file_path: path of the configuration file, shown in both messages.
    :return: a ``DatabandConfigError`` instance.
    """
    message = (
        "Error while trying to load additional airflow configuration from %s"
        % file_path)
    hint = ("Please make sure that the configuration file %s does exist." %
            file_path)
    return DatabandConfigError(message,
                               help_msg=hint,
                               nested_exceptions=ex,
                               show_exc_info=False)
コード例 #13
0
ファイル: task_registry.py プロジェクト: turbaszek/dbnd
def wrong_type_for_task(section, task_cls, expected_type):
    """
    Build (not raise) the error for a config section of an unexpected type.

    :param section: name of the configuration section.
    :param task_cls: the class that was actually found.
    :param expected_type: the type the section should derive from.
    :return: a ``DatabandConfigError`` instance with a help message.
    """
    # Message fixed: "You config" -> "Your config", and the quote around
    # the expected type is now closed.
    return DatabandConfigError(
        "Your config '{section}' should be derived from '{expected_type}'. Got {task_cls}"
        .format(section=section,
                expected_type=expected_type,
                task_cls=task_cls),
        help_msg="Please check your [{section}] _type = value. ".format(
            section=section),
    )
コード例 #14
0
    def get_env_config(self, name_or_env):
        # type: ( Union[str, EnvConfig]) -> EnvConfig
        """
        Resolve an environment name (or a ready ``EnvConfig``) to an ``EnvConfig``.

        An ``EnvConfig`` instance is returned as is. A name must be listed in
        ``self.core.environments`` and is built with ``build_task_from_config``.

        :raises DatabandConfigError: when the name is not in
            ``self.core.environments``.
        """
        if isinstance(name_or_env, EnvConfig):
            return name_or_env

        enabled_envs = self.core.environments
        if name_or_env in enabled_envs:
            return build_task_from_config(name_or_env, EnvConfig)

        raise DatabandConfigError(
            "Unknown env name '%s', available environments are %s,  please enable it at '[core]environments' "
            % (name_or_env, enabled_envs))
コード例 #15
0
    def __init__(self, url, auth=NO_AUTH, username="", password="",
                 implicitly_added=False):
        """
        Validate and store the endpoint settings.

        :param url: endpoint url; trailing "/" characters are stripped.
        :param auth: must be one of ``AUTHS_SUPPORTED``.
        :param username: stored as is.
        :param password: stored as is.
        :param implicitly_added: True only if the endpoint wasn't configured
            manually by the user through a widget, but was instead implicitly
            defined as an endpoint to a wrapper kernel in the configuration
            JSON file.
        :raises DatabandConfigError: for an empty url or an unsupported auth.
        """
        if not url:
            raise DatabandConfigError(u"URL must not be empty")
        if auth not in AUTHS_SUPPORTED:
            raise DatabandConfigError(u"Auth '{}' not supported".format(auth))

        self.url = url.rstrip(u"/")
        self.auth = auth
        self.username = username
        self.password = password
        self.implicitly_added = implicitly_added
コード例 #16
0
def parse_kub_memory_string(memory_string):
    """
    Convert a Kubernetes memory quantity string into a number of bytes.

    https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory

    :param memory_string: a number followed by one of the suffixes listed in
        ``MEMORY_SUFFIXES_WITH_BASE_AND_POWER`` (matched as written or
        lower-cased), or a plain number of bytes.
    :return: float value of requested bytes or None (for an empty value)
    :raises DatabandConfigError: when the numeric part can't be parsed or the
        value is neither a suffixed quantity nor a plain number.
    """
    if not memory_string:
        return None

    try:
        for suffixes, base, power in MEMORY_SUFFIXES_WITH_BASE_AND_POWER:
            # the i-th suffix (1-based) of a family scales by base ** (power * i)
            for i, s in enumerate(suffixes, start=1):
                if memory_string.endswith((s, s.lower())):
                    return float(memory_string[:-len(s)]) * math.pow(
                        base, power * i)
    except ValueError as e:
        raise DatabandConfigError("memory parse failed for %s: %s" %
                                  (memory_string, e))

    # No suffix matched. The Kubernetes documentation linked above also allows
    # a plain number (bytes), which used to be rejected here.
    try:
        return float(memory_string)
    except ValueError:
        raise DatabandConfigError(
            "memory parse failed for %s: suffix not recognized" %
            memory_string)
コード例 #17
0
ファイル: config_readers.py プロジェクト: ipattarapong/dbnd
def read_from_config_files(config_files):
    """
    Read every config file and fold the results into one configuration.

    :param config_files: file locations; each one is wrapped with ``target``.
    :return: the configs combined left to right with their ``update`` method.
    :raises DatabandConfigError: when a file does not exist or fails to load.
    """
    config_targets = [target(location) for location in config_files]

    if not in_quiet_mode():
        listing = "\n\t".join(map(str, config_targets))
        logger.info("Reading configuration from: \n\t%s\n", listing)

    loaded_configs = []
    for config_target in config_targets:
        if not config_target.exists():
            raise DatabandConfigError(
                "Failed to read configuration file at %s, file not found!" %
                config_target)
        try:
            file_config = read_from_config_file(config_target)
        except Exception as ex:
            raise DatabandConfigError(
                "Failed to read configuration file at %s: %s" %
                (config_target, ex),
                nested_exceptions=ex,
            )
        loaded_configs.append(file_config)

    # NOTE(review): this relies on ``update`` returning the merged config, and
    # ``reduce`` without an initial value fails for an empty file list -
    # confirm both against the config store implementation and the callers.
    def _merge(accumulated, following):
        return accumulated.update(following)

    return functools.reduce(_merge, loaded_configs)
コード例 #18
0
def kubernetes_pod_unschedulable(kub_message, extra_help=None):
    """
    Build (not raise) the error for a pod that could not be scheduled.

    :param kub_message: the scheduling failure reason; when it mentions
        "taints" a taints/tolerations hint is added to the help message.
    :param extra_help: optional additional help text.
    :return: a ``DatabandConfigError`` instance.
    """
    help_parts = []
    if "taints" in kub_message:
        help_parts.append(
            "Either remove taints from at least one of the Kubernetes nodes or add tolerations to the Kubernetes engine config."
        )
    if extra_help:
        help_parts.append(extra_help)

    # Joining only the parts that exist avoids the leading newline that was
    # produced when ``extra_help`` was given without the taints hint.
    help_msg = "\n".join(help_parts)

    return DatabandConfigError(
        "Failed to start Kubernetes pod because it couldn't be scheduled. Reason: %s"
        % kub_message,
        help_msg=help_msg,
    )
コード例 #19
0
ファイル: task_registry.py プロジェクト: lbtanh/dbnd
def wrong_type_for_task(section, task_cls, expected_type):
    """
    Build (not raise) the error for a config section of an unexpected type.

    :param section: name of the configuration section.
    :param task_cls: the class that was actually found.
    :param expected_type: the type the section should derive from.
    :return: a ``DatabandConfigError`` instance with a help message.
    """
    # Messages fixed: the quote around the expected type and the parenthesis
    # of the hint are now closed; the unused ``task_cls`` format argument of
    # the hint was dropped.
    return DatabandConfigError(
        "Your config '{section}' should be derived from '{expected_type}'. Got {task_cls}"
        .format(section=section,
                expected_type=expected_type,
                task_cls=task_cls),
        help_msg=
        "Make sure '{section}' section in your configuration is of the correct type! "
        "(check [{section}]._type and [{section}]._from)".format(
            section=section),
    )
コード例 #20
0
def conn_str_to_conn_params(conn_str):
    # type: (str) -> dict
    """
    Extract account, user and password from a ``snowflake://`` connection string.

    The account is the host name of the parsed url.

    :raises DatabandConfigError: when the scheme is not "snowflake".
    """
    # TODO: Drop in favor of sqlalchemy.engine.url
    parsed = urlparse(conn_str)
    scheme = parsed.scheme

    if scheme == "snowflake":
        return dict(
            account=parsed.hostname,
            user=parsed.username,
            password=parsed.password,
        )

    raise DatabandConfigError(
        "Unsupported connection string scheme '{}'. snowflake is required".
        format(scheme))
コード例 #21
0
ファイル: retry_policy.py プロジェクト: turbaszek/dbnd
def get_retry_policy(name, policy=None, seconds_to_sleep=5, max_retries=5):
    """
    Create a retry policy object.

    The policy kind used to be hard-coded to ``LINEAR_RETRY``, which made the
    other branches unreachable. It can now be chosen by the caller; the
    defaults keep the previous behavior.

    :param name: not used by this function.
    :param policy: policy kind; ``LINEAR_RETRY`` when empty.
    :param seconds_to_sleep: sleep time for the linear policy.
    :param max_retries: retry limit for the linear policy.
    :raises DatabandConfigError: for an unknown policy kind.
    """
    # provide policy per name
    policy = policy or LINEAR_RETRY

    if policy == LINEAR_RETRY:
        return LinearRetryPolicy(seconds_to_sleep=seconds_to_sleep,
                                 max_retries=max_retries)
    elif policy == CONFIGURABLE_RETRY:
        # the configurable policy takes its settings from the conf_* values
        return ConfigurableRetryPolicy(
            retry_seconds_to_sleep_list=conf_retry_seconds_to_sleep_list,
            max_retries=conf_retry_policy_max_retries,
        )
    else:
        raise DatabandConfigError(
            u"Retry policy '{}' not supported".format(policy))
コード例 #22
0
ファイル: retry_policy.py プロジェクト: turbaszek/dbnd
    def __init__(self, retry_seconds_to_sleep_list, max_retries):
        """
        Set up the policy from a list of sleep times.

        :param retry_seconds_to_sleep_list: sleep times; every item must be
            positive. An empty list is replaced with ``[5]``.
        :param max_retries: passed on to the parent constructor.
        :raises DatabandConfigError: when an item of the list is not positive.
        """
        super(ConfigurableRetryPolicy, self).__init__(-1, max_retries)

        sleep_schedule = retry_seconds_to_sleep_list
        if len(sleep_schedule) == 0:
            # If user configured to an empty list, fall back to a list of
            # 1 element so this behaves like a linear retry policy.
            sleep_schedule = [5]
        elif any(not n > 0 for n in sleep_schedule):
            raise DatabandConfigError(
                u"All items in the list in your config need to be positive for configurable retry policy"
            )

        self.retry_seconds_to_sleep_list = sleep_schedule
        self._max_index = len(sleep_schedule) - 1
コード例 #23
0
ファイル: structure.py プロジェクト: turbaszek/dbnd
    def parse_from_str(self, x):
        """
        Parse an individual value from its string representation.

        A string whose first character is in ``_PARSABLE_PARAM_PREFIX`` is
        loaded with ``json_utils.loads``. Any other string goes through
        ``self._parse_from_str_simple`` and the result must satisfy
        ``self.is_type_of``. When ``self.sub_value_type`` is set, its
        ``parse_value`` is applied over the result with ``traverse``.

        :param x: the string to parse; an empty or blank value yields
            ``self._generate_empty_default()``.
        :return: the parsed value.
        :raises DatabandConfigError: when ``x`` is a non-empty non-string, or
            when the simple parser produced a value of the wrong type.
        """
        if not x:
            return self._generate_empty_default()

        # this is string and we need to parse it
        if not isinstance(x, six.string_types):
            # '%s' (was '%x'): the hex conversion raised TypeError for any
            # non-integer value instead of reporting the configuration error.
            raise DatabandConfigError(
                "Can't parse '%s' into parameter. Value should be string" %
                (x, )
            )

        x = x.strip()
        if not x:
            return self._generate_empty_default()

        if x[0] in _PARSABLE_PARAM_PREFIX:
            value = json_utils.loads(x)
        else:
            value = self._parse_from_str_simple(x)

            if not self.is_type_of(value):
                raise DatabandConfigError(
                    "Can't parse '%s' into %s" % (value, self.type)
                )
        if self.sub_value_type:
            value = traverse(value, self.sub_value_type.parse_value)

        return value
コード例 #24
0
    def validate_task_inputs(self):
        """
        Check that the task can run and that all of its user inputs exist.

        :raises DatabandConfigError: when ``self.task.ctrl.should_run()`` is
            false; the message lists the non-completed entries of
            ``self.relations.task_outputs_user``.
        :raises: the error built by ``friendly_error.task_data_source_not_exists``
            when at least one flattened user input does not exist.
        """
        if not self.task.ctrl.should_run():
            non_completed = find_non_completed(
                self.relations.task_outputs_user)
            summary = non_completed_outputs_to_str(non_completed)
            raise DatabandConfigError(
                "You are missing some input tasks in your pipeline! \n\t%s\n"
                "The task execution was disabled for '%s'." %
                (summary, self.task.task_id))

        absent_inputs = [
            candidate
            for candidate in flatten(self.relations.task_inputs_user)
            if not candidate.exists()
        ]
        if absent_inputs:
            raise friendly_error.task_data_source_not_exists(
                self, absent_inputs, downstream=[self.task])
コード例 #25
0
    def get_cpu_request(self):
        """
        Convert ``self.kube_config.request_cpu`` into a number of cpus.

        A value ending with "m" (any case) is divided by 1000; any other
        value is converted with ``float``.

        https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
        :return: float value of requested cpu fractions or None
        :raises DatabandConfigError: when the value can't be converted.
        """
        try:
            requested = self.kube_config.request_cpu
            if not requested:
                return None

            if requested[-1].lower() != "m":
                return float(requested)
            return int(requested[:-1]) / 1000.0
        except ValueError as e:
            raise DatabandConfigError("failed to parse request_cpu %s: %s" %
                                      (self.kube_config.request_cpu, e))
コード例 #26
0
def kubernetes_pod_config_error(kub_message):
    """
    Build (not raise) the error for a pod that failed on a configuration error.

    :param kub_message: the failure message; a mention of "databand-secrets"
        or a message starting with "secret" selects a matching help message.
    :return: a ``DatabandConfigError`` instance.
    """
    help_msg = ""
    if "databand-secrets" in kub_message:
        help_msg = (
            "by default Databand assumes a secret exists in Kubernetes with the name databand-secrets "
            "containing the db connection string and airflow fernet key. "
            "Either create the secret (see the kubernetes deployment script) or override the secrets "
            "property in your Kubernetes engine config section")
    elif kub_message.startswith("secret"):
        # user-facing typo fixed: "exsits" -> "exists"
        help_msg = "Check that the secret exists in the cluster or remove it from the engine config"

    return DatabandConfigError(
        "Failed to start Kubernetes pod because of a configuration error: %s" %
        kub_message,
        help_msg=help_msg,
    )
コード例 #27
0
ファイル: credentials.py プロジェクト: turbaszek/dbnd
def get_boto_session():
    """
    Get the AWS session to work with.

    When ``use_airflow_connections()`` is true, the first item of
    ``AwsCredentials().get_credentials()`` is returned (presumably a boto
    session - verify against ``credentials_helper_aws``). Otherwise a default
    ``boto3`` session is created and checked for credentials.

    :raises DatabandConfigError: when the default boto3 session has no
        credentials.
    """
    if use_airflow_connections():
        # imported lazily: only needed when airflow connections are in use
        from dbnd_airflow_contrib.credentials_helper_aws import AwsCredentials

        aws_credentials = AwsCredentials()
        # logger formats the message itself, only when debug is enabled
        logger.debug("getting aws credentials from airflow connection '%s'",
                     aws_credentials.aws_conn_id)
        return aws_credentials.get_credentials()[0]

    # log message fixed: duplicated word "from from"
    logger.debug(
        "getting aws credentials from environment using boto3 default strategy"
    )
    session = boto3.session.Session()
    if not session.get_credentials():
        raise DatabandConfigError("AWS credentials not found")
    return session
コード例 #28
0
ファイル: retry_policy.py プロジェクト: databand-ai/dbnd
def get_retry_policy(name, policy=None, seconds_to_sleep=5, max_retries=5):
    """
    Create the retry policy object for the requested policy kind.

    :param name: not used by this function.
    :param policy: policy kind; ``LINEAR_RETRY`` when empty.
    :param seconds_to_sleep: sleep time for the linear policies.
    :param max_retries: retry limit for the linear policies.
    :raises DatabandConfigError: for an unknown policy kind.
    """
    # provide policy per name
    selected = policy if policy else LINEAR_RETRY

    if selected == LINEAR_RETRY:
        return LinearRetryPolicy(seconds_to_sleep=seconds_to_sleep,
                                 max_retries=max_retries)

    if selected == CONFIGURABLE_RETRY:
        # the configurable policy takes its settings from the conf_* values
        return ConfigurableRetryPolicy(
            retry_seconds_to_sleep_list=conf_retry_seconds_to_sleep_list,
            max_retries=conf_retry_policy_max_retries,
        )

    if selected == LINEAR_RETRY_ANY_ERROR:
        return LinearRetryOnAnyError(seconds_to_sleep=seconds_to_sleep,
                                     max_retries=max_retries)

    raise DatabandConfigError(
        "Retry policy '{}' not supported".format(selected))
コード例 #29
0
def parse_and_build_config_store(
    source,
    config_values,
    override=False,
    auto_section_parse=False,
    set_if_not_exists_only=False,
):
    # type:(str, Mapping[str, Mapping[str, Any]], bool, bool , bool)->_ConfigStore
    """
    Read user defined values. Following format are supported:
        1. SomeTask.some_param [ParameterDefinition] : value
        2. { "section" : { "key" : "value" }}
        3 ? "SomeTask.some_param" [str]  : value

    :param source: recorded on the new store and on every value without a source.
    :param config_values: the values to convert; a ``_ConfigStore`` is
        returned unchanged.
    :param override: passed to every ``ConfigValue`` created here.
    :param auto_section_parse: when true, a string section matching
        ``_SECTION_NAME_RE`` is split into section and key (format 3).
    :param set_if_not_exists_only: passed to every ``ConfigValue`` created here.
    :return: a ``_ConfigStore`` holding the values.
    :raises DatabandConfigError: when the values of a string section are not a
        mapping.
    :raises Exception: for an unsupported section type or a key defined twice.
    """
    if isinstance(config_values, _ConfigStore):
        return config_values

    new_config = _ConfigStore()
    new_config.source = source
    for section, section_values in six.iteritems(config_values):
        if isinstance(section, six.string_types):
            if auto_section_parse:
                m = _SECTION_NAME_RE.match(section)
                if m:  # section contains key!
                    section, key = m.group(1), m.group(2)
                    section_values = {key: section_values}

            if not isinstance(section_values, Mapping):
                raise DatabandConfigError(
                    "can't convert '%s' to configuration " % config_values)
        elif isinstance(section, ParameterDefinition):
            # this is parameter ->  Spark.jars = ["jars"]
            section_values = {section.name: section_values}
            section = section.task_config_section

        else:
            raise Exception("section='%s' not supported" % section)

        new_section = new_config[section]
        for key, value in six.iteritems(section_values):
            # Normalize the key BEFORE the duplicate check: values are stored
            # under the parameter name, so checking the ParameterDefinition
            # object itself could never detect a duplicate.
            if isinstance(key, ParameterDefinition):
                key = key.name
            if key in new_section:
                raise Exception(
                    "multiple definition of {section}.{key} at {config}".
                    format(section=section, key=key, config=config_values))
            if not isinstance(value, ConfigValue):
                value = ConfigValue(
                    value=value,
                    source=source,
                    require_parse=False,
                    override=override,
                    set_if_not_exists_only=set_if_not_exists_only,
                )
            else:
                # we can have override values without source
                if value.source is None:
                    value = attr.evolve(value, source=source)
            new_config.set_config_value(section, key, value)

    return new_config
コード例 #30
0
    def build_pod(
        self,
        task_run,
        cmds,
        args=None,
        labels=None,
        try_number=None,
        include_system_secrets=False,
    ):
        # type: (TaskRun, List[str], Optional[List[str]], Optional[Dict[str,str]], Optional[int], bool) ->Pod
        """
        Build the ``Pod`` object that runs ``cmds`` for the given task run.

        Labels, annotations, resources, environment variables and secrets are
        assembled from this configuration object and from ``task_run``.

        :param task_run: the task run the pod is built for.
        :param cmds: the command of the pod.
        :param args: the arguments of the pod. Replaced by a generated bash
            script when ``self.trap_exit_file_flag`` is set.
        :param labels: extra labels, combined with ``self.labels``.
        :param try_number: passed to ``self.get_pod_name``.
        :param include_system_secrets: passed to ``self.get_secrets``.
        :return: the ``Pod`` object.
        :raises DatabandConfigError: when ``self.container_tag`` is empty.
        """
        pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)

        image = self.full_image
        labels = combine_mappings(labels, self.labels)
        # uids are cleaned with clean_job_name_dns1123 before being used as label values
        labels["dbnd_run_uid"] = clean_job_name_dns1123(
            str(task_run.run.run_uid))
        labels["dbnd_task_run_uid"] = clean_job_name_dns1123(
            str(task_run.task_run_uid))
        labels[
            "dbnd"] = "task_run"  # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)

        # work on a copy so the configured annotations are not modified
        annotations = self.annotations.copy()
        if self.gcp_service_account_keys:
            annotations[
                "iam.cloud.google.com/service-account"] = self.gcp_service_account_keys
        annotations["dbnd_tracker"] = task_run.task_tracker_url

        from dbnd_docker.kubernetes.dbnd_extended_resources import DbndExtendedResources

        resources = DbndExtendedResources(
            requests=self.requests,
            limits=self.limits,
            request_memory=self.request_memory,
            request_cpu=self.request_cpu,
            limit_memory=self.limit_memory,
            limit_cpu=self.limit_cpu,
        )
        # base environment of the pod; the later update() calls may override these keys
        env_vars = {
            ENV_DBND_POD_NAME: pod_name,
            ENV_DBND_POD_NAMESPACE: self.namespace,
            ENV_DBND_USER: task_run.task_run_env.user,
            ENV_DBND__ENV_IMAGE: image,
            ENV_DBND_ENV: task_run.run.env.task_name,
            ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
        }
        if self.auto_remove:
            env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
        env_vars[self._params.get_param_env_key("in_cluster")] = "True"
        env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
        env_vars[
            "DBND__RUN_INFO__SOURCE_VERSION"] = task_run.run.context.task_run_env.user_code_version

        # we want that all next runs will be able to use the image that we have in our configuration

        env_vars.update(
            self._params.to_env_map("container_repository", "container_tag"))

        # applied last, so the configured env vars and then the context spawn
        # env win over everything set above
        env_vars.update(self.env_vars)
        env_vars.update(task_run.run.get_context_spawn_env())

        secrets = self.get_secrets(
            include_system_secrets=include_system_secrets)

        from airflow.contrib.kubernetes.pod import Pod

        if self.trap_exit_file_flag:
            # Wrap the command into a bash script that touches the trap file on exit.
            # NOTE(review): the incoming ``args`` value is discarded in this
            # branch (the script is built from ``cmds`` only) - confirm intended.
            args = [
                textwrap.dedent("""
                trap "touch {trap_file}" EXIT
                {command}
                """.format(
                    trap_file=self.trap_exit_file_flag,
                    command=subprocess.list2cmdline(cmds),
                ))
            ]
            # we update cmd now
            cmds = ["/bin/bash", "-c"]

        if not self.container_tag:
            raise DatabandConfigError(
                "Your container tag is None, please check your configuration",
                help_msg="Container tag should be assigned",
            )

        pod = Pod(
            namespace=self.namespace,
            name=pod_name,
            envs=env_vars,
            image=image,
            cmds=cmds,
            args=args,
            labels=labels,
            image_pull_policy=self.image_pull_policy,
            image_pull_secrets=self.image_pull_secrets,
            secrets=secrets,
            service_account_name=self.service_account_name,
            volumes=self.volumes,
            volume_mounts=self.volume_mounts,
            annotations=annotations,
            node_selectors=self.node_selectors,
            affinity=self.affinity,
            tolerations=self.tolerations,
            security_context=self.security_context,
            configmaps=self.configmaps,
            hostnetwork=self.hostnetwork,
            resources=resources,
        )

        if self.pod_yaml:
            # attach the content read from the configured pod yaml location
            pod.pod_yaml = target(self.pod_yaml).read()

        return pod