Example #1
    def wait_and_report_training_status(self):
        timeout = self.task.max_ingestion_time
        sec = 0
        client = self.estimator.sagemaker_session.sagemaker_client
        job = self.estimator.latest_training_job

        while True:
            description = client.describe_training_job(
                TrainingJobName=job.name)
            logger.info(self._get_job_status_banner(description))

            if description["TrainingJobStatus"] == "Completed":
                for metric in description["FinalMetricDataList"]:
                    self.task.log_metric(metric["MetricName"], metric["Value"])
                break

            if timeout and sec > timeout:
                # ensure that the job gets killed if the max ingestion time is exceeded
                raise DatabandRuntimeError(
                    "SageMaker job took more than %s seconds" % timeout)

            time.sleep(self.task.check_interval)
            sec += self.task.check_interval

        status = description["TrainingJobStatus"]
        if status in "Failed":
            reason = description.get("FailureReason", "(No reason provided)")
            raise DatabandRuntimeError("Error training %s: %s Reason: %s" %
                                       (job.name, status, reason))

        billable_time = (description["TrainingEndTime"] -
                         description["TrainingStartTime"]
                         ) * description["ResourceConfig"]["InstanceCount"]

        self.task.log_metric("billable time", str(billable_time))
Example #2
    def get_metrics(self, key, source=None):
        # type: (str, MetricSource) -> Iterable[Metric]
        if source == MetricSource.histograms:
            return self.get_histogram_metrics(key)

        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandRuntimeError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        if len(metric_data) == 0:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. No data found." % key)
        first_line = metric_data[0]

        metric_parsed = _METRICS_RE.match(first_line)
        if not metric_parsed:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. Expected format: 'TS VALUE', got='%s'"
                % (key, first_line))

        timestamp, val = metric_parsed.groups()

        metric = Metric(
            key=key,
            value=_parse_metric(val),
            timestamp=datetime.fromtimestamp(int(timestamp)),
        )
        return [metric]
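This example relies on two helpers that are not shown. A minimal sketch of what they might look like, assuming the 'TS VALUE' line format the error message describes (_METRICS_RE and _parse_metric below are illustrative, not the library's actual definitions):

import re

# Assumed line format: "<unix_timestamp> <value>", e.g. "1617181723 0.93"
_METRICS_RE = re.compile(r"^(\d+)\s+(.+)$")

def _parse_metric(value):
    # Hypothetical parser: try int, then float, fall back to the raw string
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            continue
    return value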
Example #3
    def wait(self):
        """
        Waits for pod completion.
        :return: self
        """
        self._wait_for_pod_started()
        self.log.info("Pod is running, reading logs..")
        self.stream_pod_logs(follow=True)
        self.log.info("Successfully read pod logs")

        pod_phase = self.get_pod_phase()
        wait_start = utcnow()
        while pod_phase not in {PodPhase.SUCCEEDED, PodPhase.FAILED}:
            logger.debug(
                "Pod '%s' is not completed with state %s, waiting..",
                self.name,
                pod_phase,
            )
            if (utcnow() - wait_start
                ) > self.kube_config.submit_termination_grace_period:
                raise DatabandRuntimeError(
                    "Pod is not in a final state after {grace_period}: {state}"
                    .format(
                        grace_period=self.kube_config.
                        submit_termination_grace_period,
                        state=pod_phase,
                    ))
            time.sleep(5)
            pod_phase = self.get_pod_phase()

        if pod_phase != PodPhase.SUCCEEDED:
            raise DatabandRuntimeError(
                "Pod returned a failure: {pod_phase}".format(
                    pod_phase=pod_phase))
        return self
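The loop above assumes a PodPhase holder with the standard Kubernetes phase strings. A minimal stand-in could look like this (illustrative, not the library's actual class):

class PodPhase(object):
    # Standard Kubernetes pod phase names
    PENDING = "Pending"
    RUNNING = "Running"
    SUCCEEDED = "Succeeded"
    FAILED = "Failed"
    UNKNOWN = "Unknown"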
Example #4
    def get_cluster(self, cluster_name):
        # type: (str) -> EmrCluster
        if cluster_name.startswith("j-"):
            cluster = self.emr_conn.describe_cluster(ClusterId=cluster_name)
            if not cluster:
                raise DatabandRuntimeError("Cluster '%s' doesn't exists." %
                                           cluster_name)
            cluster = cluster["Cluster"]
        else:
            all_clusters = self.emr_conn.list_clusters(ClusterStates=[
                "STARTING", "BOOTSTRAPPING", "RUNNING", "WAITING"
            ])["Clusters"]

            cluster = [c for c in all_clusters if c["Name"] == cluster_name]
            if len(cluster) > 1:
                matched = ["%s(%s)" % (c["Id"], c["Name"]) for c in cluster]
                raise DatabandRuntimeError(
                    "Can't select cluster from %s, please use cluster id (j-..)"
                    % matched)
            elif not cluster:
                available_clusters = [
                    "%s(%s)" % (c["Id"], c["Name"]) for c in all_clusters
                ]
                raise DatabandRuntimeError(
                    "Cluster '%s' doesn't exist in '%s' region; please use the cluster id (j-..). Available clusters: %s"
                    % (
                        cluster_name,
                        self.emr_conn.meta.region_name,
                        ", ".join(available_clusters),
                    ))
            cluster = cluster[0]

        return EmrCluster(emr_conn=self.emr_conn, cluster_id=cluster["Id"])
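Hypothetical usage of the lookup above (the client object and cluster names are placeholders):

# Resolve by cluster id...
cluster = emr_client.get_cluster("j-2AXXXXXXGAPLF")
# ...or by name, which must be unambiguous among active clusters
cluster = emr_client.get_cluster("my-etl-cluster")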
Example #5
    def run_using_docker_build(self):
        if self.tag:
            self.image_name_with_tag = "{}:{}".format(self.image_name, self.tag)
        else:
            self.image_name_with_tag = self.full_image_name

        try:
            cmd = "docker build -t {} -f {} .".format(
                self.image_name_with_tag, self.docker_file
            )
            if self.label:
                cmd = cmd + " --label " + self.label
            if self.target:
                cmd = cmd + " --target " + self.target

            if self.build_args:
                build_args_list = [
                    " --build-arg {}".format(arg) for arg in self.build_args
                ]
                cmd = cmd + "".join(build_args_list)

            logger.info("Running docker build command: `%s`\n\n", cmd)
            cwd = self.working_dir or project_path()
            run_cmd(cmd, shell=True, cwd=cwd)

        except Exception as e:
            logger.error(
                "^^^^^^^^^^^^^^^ SEE DOCKER ERROR MESSAGE ABOVE THIS LINE ^^^^^^^^^^^^^^^\n\n"
            )
            raise DatabandRuntimeError(
                "failed building docker image {}".format(self.image_name_with_tag),
                nested_exceptions=[e],
            )

        if self.push:
            try:
                cmd = "docker push {}".format(self.image_name_with_tag)
                logger.info("Running docker push command: '%s'", cmd)
                run_cmd(cmd, shell=True)

            except Exception as e:
                raise DatabandRuntimeError(
                    "failed to push docker image {}".format(self.image_name_with_tag),
                    nested_exceptions=[e],
                )
        else:
            logger.info("skipping docker push")

        return self.image_name_with_tag
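For concreteness, with illustrative values (image "myrepo/app", tag "v1", a Dockerfile, and one build arg), the command assembled above comes out as:

    docker build -t myrepo/app:v1 -f Dockerfile . --build-arg VERSION=1.0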
Example #6
def failed_to_run_emr_step(reason, logs_path, error_snippets):
    if logs_path and error_snippets:
        return DatabandRuntimeError(
            "EMR Spark step failed with reason: %s" % reason,
            show_exc_info=False,
            nested_exceptions=error_snippets,
            help_msg="Check your application logic. Inspect the Spark EMR logs for more info.\n"
            "Logs are available at %s." % logs_path,
        )
    return DatabandRuntimeError(
        "EMR Spark step failed with reason: %s. Additionally, Databand failed to get the EMR logs."
        % reason,
        show_exc_info=False,
        help_msg="Check your application logic. Inspect the EMR console for logs and more info.",
    )
Example #7
def task_data_source_not_exists(task, missing, downstream=None):
    if downstream:
        tasks = ",".join(map(str, downstream))
        dependent = "Tasks that depend on this input are: %s\n" % tasks
    else:
        dependent = ""
    if len(missing) == 1:
        missing_target = missing[0]

        from targets.dir_target import DirTarget

        if (isinstance(missing_target, DirTarget)
                and missing_target.folder_exists()
                and missing_target.flag_target):
            # we are missing flag!
            return DatabandRuntimeError(
                "Data source '%s' success flag is missing! %s" %
                (missing_target.flag_target, dependent),
                help_msg=
                "Check that the SUCCESS flag exists and your configuration is ok. "
                "You can override the flag check by \n"
                "1. adding '[noflag]' to your path: '%s[noflag]' \n"
                "2. --PARAMETER-target '[noflag]' \n"
                "3. Define parameter using 'parameter.folder.with_flag(None)' \n"
                "4. Create the flag if you think that the input is ok" %
                missing_target,
                show_exc_info=False,
            )
        return DatabandRuntimeError(
            "Task input at location '%s' is missing! %s" %
            (missing_target, dependent),
            help_msg="Validate that this data exists. "
            "This string considered as a path, as it defined as 'data' in our system",
            show_exc_info=False,
        )

    if len(missing) > 5:
        missing_msg = "%s... (%s files)" % (missing[:5], len(missing))
    else:
        missing_msg = ",".join(map(str, missing))

    return DatabandRuntimeError(
        "Data source '%s' is missing! %s" % (missing_msg, dependent),
        help_msg="Check that file exists and your configurations is ok. "
        "If it's directory, validate that you have _SUCCESS flag, "
        "or override that via target config ('noflag')",
        show_exc_info=False,
    )
Example #8
def get_file_system_name(path):
    fs_prefix = None
    if ":" in path:
        fs_prefix = path.split(":")[0]
        if fs_prefix in KNOWN_FILE_SYSTEMS:
            return fs_prefix
        if os.name == "nt" and fs_prefix.lower() in get_windows_drives():
            return FileSystems.local

    for cfs in CUSTOM_FILE_SYSTEM_MATCHERS:
        found_fs_name = cfs(path)
        if found_fs_name:
            return found_fs_name

    if path.startswith("/"):
        return FileSystems.local

    if fs_prefix:
        # TODO: it would be nice to provide more useful information
        raise DatabandRuntimeError(
            "Can't find file system '%s'" % fs_prefix,
            help_msg="Please check that you have registered required schema with"
            " `register_file_system` or relevant plugin is installed",
        )
    return FileSystems.local
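For context, a custom matcher consumed by the loop above is just a callable that takes a path and returns a file-system name (or None). A minimal sketch, assuming the CUSTOM_FILE_SYSTEM_MATCHERS registry from the example (the matcher body and the 'hdfs' name are illustrative):

def _match_hdfs(path):
    # Hypothetical matcher: claim paths with the hdfs:// prefix
    if path.startswith("hdfs://"):
        return "hdfs"
    return None

# Matchers are tried in order until one returns a non-empty name
CUSTOM_FILE_SYSTEM_MATCHERS.append(_match_hdfs)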
Example #9
    def log_artifact(self, task_run, name, artifact, artifact_target):
        artifact_target.mkdir_parent()

        if isinstance(artifact, six.string_types):
            from targets.dir_target import DirTarget

            artifact_target_source = target(artifact)
            if isinstance(artifact_target_source, DirTarget):
                artifact_target_source.copy(artifact_target)
            else:
                data = artifact_target_source.read()
                artifact_target.write(data)

            return artifact_target

        if PYPLOT_INSTALLED and isinstance(artifact, Figure):
            temp = BytesIO()
            artifact.savefig(temp)
            temp.seek(0)
            artifact_target.write(temp.read(), mode="wb")
            return artifact_target

        raise DatabandRuntimeError(
            "Could not recognize artifact of type %s; it must be a string or a matplotlib Figure"
            % type(artifact))
Example #10
def wrong_return_value_len(task_def, names, result):
    return DatabandRuntimeError(
        "Returned result from '{task}' doesn't match expected schema. "
        "Expected tuple of '{names}', got tuple of length '{result}'".format(
            task=task_def.run_name(), names=names, result=len(result)
        )
    )
Example #11
def wrong_return_value_type(task_def, names, result):
    return DatabandRuntimeError(
        "Returned value from '{task}' should be tuple/list/dict as task has multiple result."
        "Expected tuple of '{names}', got value of type '{result}'".format(
            task=task_def.run_name(), names=names, result=type(result)
        )
    )
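Both this helper and the previous one guard multi-result tasks. For example, a task declared with two named results but returning three values would be reported via wrong_return_value_len (a sketch; the decorator arguments and names are illustrative):

@task(result=("model", "report"))
def train():
    return 1, 2, 3  # length 3 != expected 2 -> wrong_return_value_len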
Example #12
    def run_using_kaniko(self):
        if self.tag:
            self.image_name_with_tag = "{}:{}".format(self.image_name, self.tag)
        else:
            self.image_name_with_tag = self.full_image_name

        command = "{} -c {} -f {}".format(
            self.kaniko_command, self.context, self.docker_file
        )

        if not self.destinations:
            command = command + " --no-push"
        else:
            destination_list = [
                " -d {}".format(destination) for destination in self.destinations
            ]
            command = command + "".join(destination_list)

        if self.build_args:
            build_args_list = [" --build-arg {}".format(arg) for arg in self.build_args]
            command = command + "".join(build_args_list)

        if self.label:
            command = command + " --label " + self.label

        if self.target:
            command = command + " --target " + self.target

        try:
            logger.info("Running build using Kaniko: %s", command)
            run_cmd(command, shell=True, cwd=project_path())
        except Exception as e:
            raise DatabandRuntimeError(
                "failed building docker image {}".format(self.image_name_with_tag),
                nested_exceptions=[e],
            )
Example #13
def export_db(
    archive,
    include_db=True,
    include_logs=True,
    task_version=utcnow().strftime("%Y%m%d_%H%M%S"),
):
    # type: (Path, bool, bool, str) -> None

    from dbnd._core.current import get_databand_context

    logger.info("Compressing files to %s..." % archive)
    with tarfile.open(str(archive), "w:gz") as tar:

        if include_db:
            dbnd_context = get_databand_context()
            conn_string = dbnd_context.config.get("webserver", "sql_alchemy_conn")
            if conn_string.startswith("postgresql"):
                with tempfile.NamedTemporaryFile(prefix="dbdump.", suffix=".sql") as tf:
                    dump_postgres(conn_string, tf.name)
                    tar.add(tf.name, arcname="postgres-dbnd.sql")
            else:
                raise DatabandRuntimeError(
                    "Can not export db! "
                    "Currently, only a postgres db is supported in automatic export"
                )

        if include_logs:
            context = get_databand_context()
            local_env = context.settings.get_env_config(CloudType.local)
            logs_folder = local_env.dbnd_local_root.folder("logs").path
            if os.path.exists(logs_folder):
                logger.info("Adding run folder from '%s'", logs_folder)
                tar.add(logs_folder, "run")
            else:
                logger.warning("Logs dir '%s' is not found", logs_folder)
Example #14
def failed_to_assign_result(task, result_parameter):
    return DatabandRuntimeError(
        "The result of the band/run call is None, "
        "it can not be assigned to {schema}".format(
            task=task, schema=result_parameter.schema),
        help_msg="Check your %s return value" % (_task_name(task)),
    )
Example #15
def dag_with_different_contexts(task_id):
    return DatabandRuntimeError(
        "The task '%s' isn't part of the current context!" % task_id,
        help_msg="The task '%s' isn't part of the current context! \n"
        "Make sure you did not fiddle with internal APIs" % task_id,
        show_exc_info=False,
    )
Example #16
def check_if_completed_bfs(root_task, number_of_threads):
    completed_status = {}
    tasks_to_check_list = [root_task]

    with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
        while tasks_to_check_list:
            new_task_to_check_list = []

            task_results = {}
            for task in tasks_to_check_list:
                task_results[executor.submit(task._complete)] = task.task_id

            for future in as_completed(task_results):
                task_id = task_results[future]
                try:
                    data = future.result()
                    completed_status[task_id] = data
                except Exception as e:
                    raise DatabandRuntimeError(
                        "Failed to get completeness result of task_id {}".
                        format(task_id),
                        nested_exceptions=[e],
                    )

            for task in tasks_to_check_list:
                if completed_status[task.task_id]:
                    continue

                for upstream_task in task.ctrl.task_dag.upstream:
                    if upstream_task.task_id not in completed_status:
                        new_task_to_check_list.append(upstream_task)

            tasks_to_check_list = new_task_to_check_list

    return completed_status
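A hypothetical invocation of the BFS check above (the root task object and thread count are placeholders):

completed = check_if_completed_bfs(my_pipeline_task, number_of_threads=8)
incomplete = [task_id for task_id, done in completed.items() if not done]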
Example #17
def failed_to_run_databricks_job(status_code, error_message, log_url):
    return DatabandRuntimeError(
        "Databricks run failed with code %s." % status_code,
        show_exc_info=False,
        nested_exceptions=error_message,
        help_msg="Check cluster log for more info: %s." % log_url,
    )
Example #18
def failed_to_run_cmd(name, cmd_str, return_code):
    return DatabandRuntimeError(
        "{name} has failed, returncode='{return_code}'. Failed to run: {cmd}".
        format(name=name, return_code=return_code, cmd=cmd_str),
        show_exc_info=False,
        help_msg="Inspect logs for more info.",
    )
Example #19
    def update_task_run_attempt(self, attempt_number):
        if attempt_number is None:
            raise DatabandRuntimeError("cannot set None as the attempt number")

        if self.attempt_number != attempt_number:
            self.attempt_number = attempt_number
            self.init_new_task_run_attempt()
Example #20
    def _get_task_by_id(self, task_id):
        task = self.context.task_instance_cache.get_task_by_id(task_id)
        if task is None:
            raise DatabandRuntimeError(
                "Failed to find task %s in current context" % task_id)

        return task
Example #21
def system_exit_at_task_run(task, ex):
    return DatabandRuntimeError(
        "Task execution has been aborted with sys.exit() call: %s" % ex,
        nested_exceptions=ex,
        show_exc_info=False,
        help_msg="Check your task run()\n ",
    )
Example #22
    def _build_submit_task(self, run):
        if run.root_task:
            raise DatabandRuntimeError(
                "Can't send a task created via code to remote execution; only the command line is supported"
            )

        # don't describe in local run, do it in remote run
        settings = self.settings
        settings.system.describe = False

        cmd_line_args = (["run"] + _get_dbnd_run_relative_cmd() +
                         ["--run-driver", str(run.run_uid)])

        args = run.remote_engine.dbnd_executable + cmd_line_args

        root_task = run.remote_engine.submit_to_engine_task(
            env=run.env,
            args=args,
            task_name="dbnd_driver_run",
            interactive=settings.run.interactive,
        )
        root_task._conf_confirm_on_kill_msg = (
            "Ctrl-C: Do you want to kill your submitted pipeline? "
            "If the selection is 'no', this process will detach from the run.")
        return root_task
Example #23
def failed_to_run_qubole_job(status_code, log_url, spark_log):
    return DatabandRuntimeError(
        "Qubole run failed with code %s." % status_code,
        show_exc_info=False,
        nested_exceptions=spark_log,
        help_msg="Check spark log for more info: %s." % log_url,
    )
Example #24
def failed_to_read_value_from_target(ex, task, parameter, target):
    return DatabandRuntimeError(
        "Can't read %s from %s': %s" % (_parameter_name(task, parameter), target, ex),
        show_exc_info=True,
        nested_exceptions=[ex],
        help_msg="Check your %s logic. " % task.friendly_task_name,
    )
Example #25
def failed_to_process_non_empty_result(task, result):
    return DatabandRuntimeError(
        "Can' process non empty result of {task} while it's marked as task without outputs: result={result}".format(
            task=_task_name(task), result=result
        ),
        help_msg="Please, use @task(result=YOU RESULT SCHEMA)",
    )
Example #26
def can_run_only_tasks(task):
    return DatabandRuntimeError(
        "Databand can run only Tasks, got {task} instead".format(
            task=type(task)),
        help_msg=
        "Please check that you don't call the function while providing it to Databand. "
        "Use YOUR_TASK_FUNCTION.task()",
    )
Example #27
def target_must_be_local_for_tensorflow_marshalling(target):
    return DatabandRuntimeError(
        "Can not read value of tensorflow model in path {path}! Path must be local!".format(
            path=target.path
        ),
        help_msg="To marshall tensorflow objects you must use 'require_local_access`, e.g.:\n@task("
        "result=output.tfmodel.require_local_access[tf.keras.models.Model])\ndef my_task(p1, p2):...",
    )
Example #28
def failed_to_save_value__wrong_type(value, target, expected_type):
    return DatabandRuntimeError(
        "Can not save value at {target}, "
        "expected type is '{expected_type}', got '{value_type}'".format(
            target=target, expected_type=expected_type, value_type=type(value)
        ),
        help_msg="Review type implementation and set support_parse_from_str flag to True",
    )
Example #29
def dataflow_pipeline_not_set(task):
    return DatabandRuntimeError(
        "dataflow_pipeline at {task} is None. Can't wait on dataflow job completion."
        .format(task=_run_name(task)),
        help_msg=
        "Please set task.pipeline first at '{task}' or change task.dataflow_wait_until_finish flag"
        .format(task=_run_name(task)),
    )
Example #30
def no_marshaller(target, config, value_type, options_message):
    return DatabandRuntimeError(
        "There is no defined way to read/write value of type '{type}' with file format '{format}'. ".format(
            type=value_type.type, format=config.format
        ),
        help_msg="You can provide the expected format of {target}"
        " using --PARAMETER-target switch, for example --my-input--target csv.  "
        "{options_message}".format(target=target, options_message=options_message),
    )