def wait_and_report_training_status(self):
    """Poll the latest training job until it finishes and report its metrics.

    The job description is fetched every ``self.task.check_interval`` seconds
    and its status banner is logged. On "Completed" every entry of
    ``FinalMetricDataList`` (if present) is logged as a task metric, followed
    by a "billable time" metric.

    :raises DatabandRuntimeError: if the job ends as "Failed" or "Stopped",
        or if ``self.task.max_ingestion_time`` is set and exceeded.
    """
    timeout = self.task.max_ingestion_time
    sec = 0
    client = self.estimator.sagemaker_session.sagemaker_client
    job = self.estimator.latest_training_job

    while True:
        description = client.describe_training_job(TrainingJobName=job.name)
        logger.info(self._get_job_status_banner(description))
        status = description["TrainingJobStatus"]

        if status == "Completed":
            # The metric list may be missing when the job reported no metrics.
            for metric in description.get("FinalMetricDataList", []):
                self.task.log_metric(metric["MetricName"], metric["Value"])
            break

        # Terminal non-success states: fail right away instead of polling
        # until the timeout (or forever when no timeout is configured).
        if status in ("Failed", "Stopped"):
            reason = description.get("FailureReason", "(No reason provided)")
            raise DatabandRuntimeError(
                "Error training %s: %s Reason: %s" % (job.name, status, reason)
            )

        if timeout and sec > timeout:
            # stop waiting once the max ingestion time is exceeded
            raise DatabandRuntimeError(
                "SageMaker job took more than %s seconds" % timeout
            )

        time.sleep(self.task.check_interval)
        sec = sec + self.task.check_interval

    # Training duration multiplied by the number of instances used.
    billable_time = (
        description["TrainingEndTime"] - description["TrainingStartTime"]
    ) * description["ResourceConfig"]["InstanceCount"]
    self.task.log_metric("billable time", str(billable_time))
def get_metrics(self, key, source=None):
    # type: (str, MetricSource) -> Iterable[Metric]
    """Read the metric stored under ``key`` and return it as a one-item list.

    Histogram sources are delegated to ``get_histogram_metrics``. Otherwise
    the first line of the metric target is parsed with ``_METRICS_RE`` into a
    timestamp and a value.

    :raises DatabandRuntimeError: if the target does not exist, is empty, or
        its first line does not match the expected 'TS VALUE' format.
    """
    if source == MetricSource.histograms:
        return self.get_histogram_metrics(key)

    metric_target = self.meta.get_metric_target(key, source=source)
    if not metric_target.exists():
        raise DatabandRuntimeError("Metric '%s' not found" % key)

    lines = metric_target.readlines()
    if not lines:
        raise DatabandRuntimeError(
            "Metric '%s' is malformed. No data found." % key
        )

    # Only the first line is used.
    head = lines[0]
    parsed = _METRICS_RE.match(head)
    if not parsed:
        raise DatabandRuntimeError(
            "Metric '%s' is malformed. Expected format: 'TS VALUE', got='%s'"
            % (key, head)
        )

    timestamp, val = parsed.groups()
    return [
        Metric(
            key=key,
            value=_parse_metric(val),
            timestamp=datetime.fromtimestamp(int(timestamp)),
        )
    ]
def wait(self):
    """
    Block until the pod reaches a final phase.

    Waits for the pod to start, streams its logs, then polls the pod phase
    every 5 seconds until it is SUCCEEDED or FAILED.

    :return: self, when the pod succeeded
    :raises DatabandRuntimeError: if the pod is still not final after
        ``kube_config.submit_termination_grace_period``, or if it ends in any
        phase other than SUCCEEDED
    """
    self._wait_for_pod_started()
    self.log.info("Pod is running, reading logs..")
    self.stream_pod_logs(follow=True)
    self.log.info("Successfully read pod logs")

    final_phases = {PodPhase.SUCCEEDED, PodPhase.FAILED}
    pod_phase = self.get_pod_phase()
    wait_start = utcnow()

    while True:
        if pod_phase in final_phases:
            break

        logger.debug(
            "Pod '%s' is not completed with state %s, waiting..",
            self.name,
            pod_phase,
        )

        # Logs are done, so the pod gets only a bounded time to finalize.
        waited = utcnow() - wait_start
        if waited > self.kube_config.submit_termination_grace_period:
            raise DatabandRuntimeError(
                "Pod is not in a final state after {grace_period}: {state}".format(
                    grace_period=self.kube_config.submit_termination_grace_period,
                    state=pod_phase,
                )
            )

        time.sleep(5)
        pod_phase = self.get_pod_phase()

    if pod_phase != PodPhase.SUCCEEDED:
        raise DatabandRuntimeError(
            "Pod returned a failure: {pod_phase}".format(pod_phase=pod_phase)
        )
    return self
def get_cluster(self, cluster_name):
    # type: (str) -> EmrCluster
    """Resolve a cluster id or cluster name to an ``EmrCluster``.

    A value starting with "j-" is treated as a cluster id and looked up with
    ``describe_cluster``. Anything else is matched by name against the
    clusters listed in STARTING, BOOTSTRAPPING, RUNNING or WAITING state.

    :raises DatabandRuntimeError: if the id lookup returns nothing, if no
        listed cluster has that name, or if more than one does.
    """
    if cluster_name.startswith("j-"):
        response = self.emr_conn.describe_cluster(ClusterId=cluster_name)
        if not response:
            raise DatabandRuntimeError(
                "Cluster '%s' doesn't exists." % cluster_name
            )
        return EmrCluster(
            emr_conn=self.emr_conn, cluster_id=response["Cluster"]["Id"]
        )

    active_states = ["STARTING", "BOOTSTRAPPING", "RUNNING", "WAITING"]
    all_clusters = self.emr_conn.list_clusters(ClusterStates=active_states)[
        "Clusters"
    ]
    matches = [c for c in all_clusters if c["Name"] == cluster_name]

    if len(matches) > 1:
        # The name is ambiguous, so the caller has to pass an id.
        raise DatabandRuntimeError(
            "Can't select cluster from %s, please use cluster id (j-..)" % matches
        )

    if not matches:
        available_clusters = [
            "%s(%s)" % (c["Id"], c["Name"]) for c in all_clusters
        ]
        raise DatabandRuntimeError(
            "Cluster '%s' doesn't exist in '%s' region, please use cluster id (j-..), Available clusters: %s"
            % (
                cluster_name,
                self.emr_conn.meta.region_name,
                ",".join(available_clusters),
            )
        )

    return EmrCluster(emr_conn=self.emr_conn, cluster_id=matches[0]["Id"])
def run_using_docker_build(self):
    """Build the image with ``docker build`` and optionally push it.

    The full image reference is stored on ``self.image_name_with_tag`` and
    returned. Optional ``--label``, ``--target`` and ``--build-arg`` flags are
    appended when the corresponding attributes are set. The build runs in
    ``self.working_dir`` or, when that is unset, in ``project_path()``.

    :raises DatabandRuntimeError: if the build or the push fails; the
        original exception is attached as a nested exception.
    """
    self.image_name_with_tag = (
        "{}:{}".format(self.image_name, self.tag)
        if self.tag
        else self.full_image_name
    )

    try:
        cmd = "docker build -t {} -f {} .".format(
            self.image_name_with_tag, self.docker_file
        )
        if self.label:
            cmd += " --label " + self.label
        if self.target:
            cmd += " --target " + self.target
        if self.build_args:
            cmd += "".join(
                [" --build-arg {}".format(arg) for arg in self.build_args]
            )

        logger.info("Running docker build command: `%s`\n\n", cmd)
        run_cmd(cmd, shell=True, cwd=self.working_dir or project_path())
    except Exception as e:
        logger.error(
            "^^^^^^^^^^^^^^^ SEE DOCKER ERROR MESSAGE ABOVE THIS LINE ^^^^^^^^^^^^^^^\n\n"
        )
        raise DatabandRuntimeError(
            "failed building docker image {}".format(self.image_name_with_tag),
            nested_exceptions=[e],
        )

    if not self.push:
        logger.info("skipping docker push")
        return self.image_name_with_tag

    try:
        push_cmd = "docker push {}".format(self.image_name_with_tag)
        logger.info("Running docker push command: '%s'", push_cmd)
        run_cmd(push_cmd, shell=True)
    except Exception as e:
        raise DatabandRuntimeError(
            "failed to push docker image {}".format(self.image_name_with_tag),
            nested_exceptions=[e],
        )
    return self.image_name_with_tag
def failed_to_run_emr_step(reason, logs_path, error_snippets):
    """Build the error returned for a failed EMR Spark step.

    When both ``logs_path`` and ``error_snippets`` are truthy, the snippets
    are attached as nested exceptions and the help message points at
    ``logs_path``. Otherwise the message notes that the EMR logs could not be
    fetched.
    """
    if not (logs_path and error_snippets):
        return DatabandRuntimeError(
            "EMR Spark step failed with reason: %s. Additionally Databand failed to get EMR logs."
            % reason,
            show_exc_info=False,
            help_msg="Check your application logic. Inspect emr console for logs and more info\n ",
        )

    message = "EMR Spark step failed with reason: %s " % reason
    help_text = (
        "Check your application logic. Inspect spark emr logs for more info.\n "
        "Logs are available at %s." % logs_path
    )
    return DatabandRuntimeError(
        message,
        show_exc_info=False,
        nested_exceptions=error_snippets,
        help_msg=help_text,
    )
def task_data_source_not_exists(task, missing, downstream=None):
    """Build the error describing missing task input(s).

    :param task: not used by this function.
    :param missing: sequence of missing targets.
    :param downstream: optional iterable of tasks that depend on the input;
        when given, they are listed in the message.
    :return: a ``DatabandRuntimeError``. A single missing ``DirTarget`` whose
        folder exists but whose flag target is configured gets a dedicated
        "success flag is missing" message.
    """
    # Maximum number of missing targets spelled out in the message. The same
    # limit is used both to decide on truncation and to slice, so "..." only
    # appears when entries are really omitted.
    max_listed = 10

    dependent = ""
    if downstream:
        tasks = ",".join(map(str, downstream))
        dependent = "Tasks that depend on this input are: %s\n" % tasks

    if len(missing) == 1:
        missing_target = missing[0]

        from targets.dir_target import DirTarget

        if (
            isinstance(missing_target, DirTarget)
            and missing_target.folder_exists()
            and missing_target.flag_target
        ):
            # The folder is there, only the flag is missing.
            return DatabandRuntimeError(
                "Data source '%s' success flag is missing! %s"
                % (missing_target.flag_target, dependent),
                help_msg="Check that SUCCESS flag exists and your configurations is ok. "
                "You can override flag check by \n"
                "1. adding '[noflag]' to your path: '%s[noflag]' \n"
                "2. --PARAMETER-target '[noflag]' \n"
                "3. Define parameter using 'parameter.folder.with_flag(None)' \n"
                "4. Create the flag if you think that the input is ok"
                % missing_target,
                show_exc_info=False,
            )

        return DatabandRuntimeError(
            "Task input at location '%s' is missing! %s"
            % (missing_target, dependent),
            help_msg="Validate that this data exists. "
            "This string considered as a path, as it defined as 'data' in our system",
            show_exc_info=False,
        )

    if len(missing) > max_listed:
        missing_msg = "%s... (%s files)" % (missing[:max_listed], len(missing))
    else:
        missing_msg = ",".join(map(str, missing))

    return DatabandRuntimeError(
        "Data source '%s' is missing! %s" % (missing_msg, dependent),
        help_msg="Check that file exists and your configurations is ok. "
        "If it's directory, validate that you have _SUCCESS flag, "
        "or override that via target config ('noflag')",
        show_exc_info=False,
    )
def get_file_system_name(path):
    """Return the file system name that ``path`` belongs to.

    Resolution order: a known "<prefix>:" scheme, a Windows drive letter
    (only when running on Windows), the custom matchers, then the local file
    system for paths starting with "/" or having no prefix at all.

    :raises DatabandRuntimeError: if the path has a "<prefix>:" that nothing
        above recognized.
    """
    head, separator, _ = path.partition(":")
    fs_prefix = head if separator else None

    if separator:
        if fs_prefix in KNOWN_FILE_SYSTEMS:
            return fs_prefix
        if os.name == "nt" and fs_prefix.lower() in get_windows_drives():
            return FileSystems.local

    for matcher in CUSTOM_FILE_SYSTEM_MATCHERS:
        found_fs_name = matcher(path)
        if found_fs_name:
            return found_fs_name

    if path.startswith("/") or not fs_prefix:
        return FileSystems.local

    # TODO: it would be nice to provide more useful information
    raise DatabandRuntimeError(
        "Can't find file system '%s'" % fs_prefix,
        help_msg="Please check that you have registered required schema with"
        " `register_file_system` or relevant plugin is installed",
    )
def log_artifact(self, task_run, name, artifact, artifact_target):
    """Store ``artifact`` at ``artifact_target`` and return the target.

    A string artifact is treated as a path: a directory target is copied,
    anything else is read and written to the destination. A matplotlib
    ``Figure`` (when pyplot is installed) is saved and written in binary mode.
    ``task_run`` and ``name`` are not used here.

    :raises DatabandRuntimeError: for any other artifact type.
    """
    artifact_target.mkdir_parent()

    if isinstance(artifact, six.string_types):
        from targets.dir_target import DirTarget

        source = target(artifact)
        if not isinstance(source, DirTarget):
            artifact_target.write(source.read())
        else:
            source.copy(artifact_target)
        return artifact_target

    if PYPLOT_INSTALLED and isinstance(artifact, Figure):
        # Render into memory first, then hand the bytes to the target.
        buffer = BytesIO()
        artifact.savefig(buffer)
        artifact_target.write(buffer.getvalue(), mode="wb")
        return artifact_target

    raise DatabandRuntimeError(
        "Could not recognize artifact of type %s, must be string or matplotlib Figure"
        % type(artifact)
    )
def wrong_return_value_len(task_def, names, result):
    """Build the error for a result whose length does not match ``names``.

    The message reports the task's run name, the expected names and
    ``len(result)``.
    """
    template = (
        "Returned result from '{task}' doesn't match expected schema. "
        "Expected tuple of '{names}', got tuple of length '{result}'"
    )
    message = template.format(
        task=task_def.run_name(), names=names, result=len(result)
    )
    return DatabandRuntimeError(message)
def wrong_return_value_type(task_def, names, result):
    """Build the error for a multi-result task that returned a wrong type.

    The message reports the task's run name, the expected names and
    ``type(result)``.
    """
    # The first fragment ends with a space so the two sentences do not run
    # together ("...multiple result.Expected...").
    template = (
        "Returned value from '{task}' should be tuple/list/dict as task has multiple result. "
        "Expected tuple of '{names}', got value of type '{result}'"
    )
    return DatabandRuntimeError(
        template.format(task=task_def.run_name(), names=names, result=type(result))
    )
def run_using_kaniko(self):
    """Build the image with Kaniko.

    The full image reference is stored on ``self.image_name_with_tag``. The
    command is ``<kaniko_command> -c <context> -f <docker_file>`` followed by
    either ``--no-push`` (no destinations configured) or one ``-d`` per
    destination, then optional ``--build-arg``, ``--label`` and ``--target``
    flags. It runs in ``project_path()``.

    :raises DatabandRuntimeError: if the build command fails; the original
        exception is attached as a nested exception.
    """
    if self.tag:
        self.image_name_with_tag = "{}:{}".format(self.image_name, self.tag)
    else:
        self.image_name_with_tag = self.full_image_name

    command = "{} -c {} -f {}".format(
        self.kaniko_command, self.context, self.docker_file
    )

    if not self.destinations:
        command = command + " --no-push"
    else:
        destination_list = [
            " -d {}".format(destination) for destination in self.destinations
        ]
        command = command + "".join(destination_list)

    if self.build_args:
        build_args_list = [" --build-arg {}".format(arg) for arg in self.build_args]
        command = command + "".join(build_args_list)

    if self.label:
        command = command + " --label " + self.label

    if self.target:
        command = command + " --target " + self.target

    try:
        logger.info("Running build using Kaniko: %s", command)
        run_cmd(command, shell=True, cwd=project_path())
    except Exception as e:
        # Name the image being built rather than a "?" placeholder.
        raise DatabandRuntimeError(
            "failed building docker image {}".format(self.image_name_with_tag),
            nested_exceptions=[e],
        )
def export_db(
    archive, include_db=True, include_logs=True, task_version=None,
):
    # type: (Path, bool, bool, Optional[str]) -> None
    """Write a gzip-compressed tar archive with a DB dump and/or the logs.

    :param archive: path of the archive to create.
    :param include_db: when true, dump the database configured under
        ``[webserver] sql_alchemy_conn`` into the archive as
        ``postgres-dbnd.sql``. Only connection strings starting with
        "postgresql" are handled.
    :param include_logs: when true, add the local environment's "logs"
        folder to the archive under ``run`` if that folder exists.
    :param task_version: not used by this function; kept for interface
        compatibility. It defaults to ``None`` instead of a timestamp, which
        would be computed once at import time and then reused by every call.
    :raises DatabandRuntimeError: if ``include_db`` is set and the configured
        database is not supported.
    """
    from dbnd._core.current import get_databand_context

    # Validate the DB type before creating the archive, so that an
    # unsupported database does not leave an empty archive file behind.
    conn_string = None
    if include_db:
        dbnd_context = get_databand_context()
        conn_string = dbnd_context.config.get("webserver", "sql_alchemy_conn")
        if not conn_string.startswith("postgresql"):
            raise DatabandRuntimeError(
                "Can not export db! "
                "Currently, we support only sqlite and postgres db in automatic export"
            )

    logger.info("Compressing files to %s...", archive)
    with tarfile.open(str(archive), "w:gz") as tar:
        if include_db:
            # The dump lives in a temp file that is removed once it has been
            # added to the archive.
            with tempfile.NamedTemporaryFile(prefix="dbdump.", suffix=".sql") as tf:
                dump_postgres(conn_string, tf.name)
                tar.add(tf.name, arcname="postgres-dbnd.sql")

        if include_logs:
            context = get_databand_context()
            local_env = context.settings.get_env_config(CloudType.local)
            logs_folder = local_env.dbnd_local_root.folder("logs").path
            if os.path.exists(logs_folder):
                logger.info("Adding run folder from '%s'", logs_folder)
                tar.add(logs_folder, "run")
            else:
                logger.warning("Logs dir '%s' is not found", logs_folder)
def failed_to_assign_result(task, result_parameter):
    """Build the error for a band/run call that returned None.

    The message names ``result_parameter.schema``; the help message names the
    task via ``_task_name``.
    """
    message = (
        "The result of the band/run call is None, "
        "it can not be assigned to {schema}"
    ).format(schema=result_parameter.schema)
    help_text = "Check your %s return value" % (_task_name(task))
    return DatabandRuntimeError(message, help_msg=help_text)
def dag_with_different_contexts(task_id):
    """Build the error for a task that is not part of the current context."""
    message = "The task '%s' isn't part of the current context!" % task_id
    help_text = (
        "The task '%s' isn't part of the current context! \n"
        "Make sure you did not fiddle with internal APIs" % task_id
    )
    return DatabandRuntimeError(message, help_msg=help_text, show_exc_info=False)
def check_if_completed_bfs(root_task, number_of_threads):
    """Check task completeness level by level, starting from ``root_task``.

    Each level's ``_complete()`` calls are submitted to a thread pool. The
    upstream tasks of every task that is *not* complete form the next level;
    upstreams of completed tasks are not visited.

    :param root_task: task to start from.
    :param number_of_threads: ``max_workers`` of the thread pool.
    :return: dict mapping ``task_id`` to the result of ``_complete()`` for
        every task that was checked.
    :raises DatabandRuntimeError: if any ``_complete()`` call raises.
    """
    completed_status = {}
    tasks_to_check_list = [root_task]

    with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
        while tasks_to_check_list:
            task_results = {}
            for task in tasks_to_check_list:
                task_results[executor.submit(task._complete)] = task.task_id

            for future in as_completed(task_results):
                task_id = task_results[future]
                try:
                    completed_status[task_id] = future.result()
                except Exception as e:
                    raise DatabandRuntimeError(
                        "Failed to get completeness result of task_id {}".format(
                            task_id
                        ),
                        nested_exceptions=e,
                    )

            new_task_to_check_list = []
            # Ids already queued for the next level. An upstream shared by
            # several tasks of this level must be checked only once.
            queued_ids = set()
            for task in tasks_to_check_list:
                if completed_status[task.task_id]:
                    continue
                for upstream_task in task.ctrl.task_dag.upstream:
                    upstream_id = upstream_task.task_id
                    if upstream_id in completed_status or upstream_id in queued_ids:
                        continue
                    queued_ids.add(upstream_id)
                    new_task_to_check_list.append(upstream_task)

            tasks_to_check_list = new_task_to_check_list

    return completed_status
def failed_to_run_databricks_job(status_code, error_message, log_url):
    """Build the error for a failed Databricks run.

    ``error_message`` is passed as the nested exceptions and ``log_url`` is
    referenced in the help message.
    """
    message = "Databricks run failed with code %s." % status_code
    help_text = "Check cluster log for more info: %s." % log_url
    return DatabandRuntimeError(
        message,
        show_exc_info=False,
        nested_exceptions=error_message,
        help_msg=help_text,
    )
def failed_to_run_cmd(name, cmd_str, return_code):
    """Build the error for a command that exited with a failure code."""
    template = "{name} has failed, returncode='{return_code}'. Failed to run: {cmd}"
    message = template.format(name=name, return_code=return_code, cmd=cmd_str)
    return DatabandRuntimeError(
        message, show_exc_info=False, help_msg="Inspect logs for more info.",
    )
def update_task_run_attempt(self, attempt_number):
    """Switch this object to ``attempt_number`` if it differs from the current one.

    When the number changes it is stored on ``self.attempt_number`` and
    ``init_new_task_run_attempt()`` is called; an unchanged number is a no-op.

    :raises DatabandRuntimeError: if ``attempt_number`` is None.
    """
    if attempt_number is None:
        raise DatabandRuntimeError("cannot set None as the attempt number")

    if self.attempt_number == attempt_number:
        # Same attempt, nothing to re-initialize.
        return

    self.attempt_number = attempt_number
    self.init_new_task_run_attempt()
def _get_task_by_id(self, task_id):
    """Return the task registered under ``task_id`` in the context's task cache.

    :raises DatabandRuntimeError: if the cache returns None for ``task_id``.
    """
    found = self.context.task_instance_cache.get_task_by_id(task_id)
    if found is not None:
        return found
    raise DatabandRuntimeError(
        "Failed to find task %s in current context" % task_id
    )
def system_exit_at_task_run(task, ex):
    """Build the error for a task run aborted through ``sys.exit()``.

    ``ex`` is included in the message and passed as the nested exceptions;
    ``task`` is not used.
    """
    message = "Task execution has been aborted with sys.exit() call: %s" % ex
    return DatabandRuntimeError(
        message,
        nested_exceptions=ex,
        show_exc_info=False,
        help_msg="Check your task run()\n ",
    )
def _build_submit_task(self, run):
    """Create the task that submits this run's driver to the remote engine.

    The command line is the engine's ``dbnd_executable`` followed by
    ``run``, the relative run command and ``--run-driver <run_uid>``.
    As a side effect, ``self.settings.system.describe`` is set to False.

    :raises DatabandRuntimeError: if ``run.root_task`` is set, i.e. the run
        was created from code rather than from the command line.
    :return: the task returned by ``submit_to_engine_task``.
    """
    if run.root_task:
        raise DatabandRuntimeError(
            "Can't send to remote execution task created via code, only command line is supported"
        )

    # don't describe in local run, do it in remote run
    settings = self.settings
    settings.system.describe = False

    cmd_line_args = (
        ["run"] + _get_dbnd_run_relative_cmd() + ["--run-driver", str(run.run_uid)]
    )
    args = run.remote_engine.dbnd_executable + cmd_line_args

    root_task = run.remote_engine.submit_to_engine_task(
        env=run.env,
        args=args,
        task_name="dbnd_driver_run",
        interactive=settings.run.interactive,
    )
    # The question ends with a space so the prompt does not read
    # "pipeline?If selection...".
    root_task._conf_confirm_on_kill_msg = (
        "Ctrl-C Do you want to kill your submitted pipeline? "
        "If selection is 'no', this process will detach from the run."
    )
    return root_task
def failed_to_run_qubole_job(status_code, log_url, spark_log):
    """Build the error for a failed Qubole run.

    ``spark_log`` is passed as the nested exceptions and ``log_url`` is
    referenced in the help message.
    """
    message = "Qubole run failed with code %s." % status_code
    help_text = "Check spark log for more info: %s." % log_url
    return DatabandRuntimeError(
        message,
        show_exc_info=False,
        nested_exceptions=spark_log,
        help_msg=help_text,
    )
def failed_to_read_value_from_target(ex, task, parameter, target):
    """Build the error for a parameter value that could not be read from its target.

    ``ex`` is included in the message and attached as a nested exception; the
    help message names ``task.friendly_task_name``.
    """
    # The target is wrapped in a matching pair of quotes; the message used to
    # carry only a stray closing quote.
    message = "Can't read %s from '%s': %s" % (
        _parameter_name(task, parameter),
        target,
        ex,
    )
    return DatabandRuntimeError(
        message,
        show_exc_info=True,
        nested_exceptions=[ex],
        help_msg="Check your %s logic. " % task.friendly_task_name,
    )
def failed_to_process_non_empty_result(task, result):
    """Build the error for a task without declared outputs that returned a value.

    The message names the task via ``_task_name`` and includes ``result``.
    """
    # Message typos fixed: "Can'" -> "Can't", "YOU RESULT" -> "YOUR RESULT".
    message = (
        "Can't process non empty result of {task} while it's marked as task "
        "without outputs: result={result}"
    ).format(task=_task_name(task), result=result)
    return DatabandRuntimeError(
        message, help_msg="Please, use @task(result=YOUR RESULT SCHEMA)",
    )
def can_run_only_tasks(task):
    """Build the error for an object passed to run that is not a Task.

    The message reports ``type(task)``.
    """
    message = "Databand can run only Tasks, got {task} instead".format(
        task=type(task)
    )
    help_text = (
        "Please, use check that you don't call function while providing it to databand. "
        "Use YOUR_TASK_FUNCTION.task()"
    )
    return DatabandRuntimeError(message, help_msg=help_text)
def target_must_be_local_for_tensorflow_marshalling(target):
    """Build the error for a tensorflow model target whose path is not local.

    The message reports ``target.path``.
    """
    message = "Can not read value of tensorflow model in path {path}! Path must be local!".format(
        path=target.path
    )
    help_text = (
        "To marshall tensorflow objects you must use 'require_local_access`, e.g.:\n@task("
        "result=output.tfmodel.require_local_access[tf.keras.models.Model])\ndef my_task(p1, p2):..."
    )
    return DatabandRuntimeError(message, help_msg=help_text)
def failed_to_save_value__wrong_type(value, target, expected_type):
    """Build the error for a value whose type does not match the expected one.

    The message reports the target, ``expected_type`` and ``type(value)``.
    """
    template = (
        "Can not save value at {target}, "
        "expected type is '{expected_type}', got '{value_type}'"
    )
    message = template.format(
        target=target, expected_type=expected_type, value_type=type(value)
    )
    return DatabandRuntimeError(
        message,
        help_msg="Review type implementation and set support_parse_from_str flag to True",
    )
def dataflow_pipeline_not_set(task):
    """Build the error for a task whose ``dataflow_pipeline`` is None.

    Both the message and the help message name the task via ``_run_name``.
    """
    message = "dataflow_pipeline at {task} is None. Can't wait on dataflow job completion.".format(
        task=_run_name(task)
    )
    help_text = "Please set task.pipeline first at '{task}' or change task.dataflow_wait_until_finish flag".format(
        task=_run_name(task)
    )
    return DatabandRuntimeError(message, help_msg=help_text)
def no_marshaller(target, config, value_type, options_message):
    """Build the error for a value type / file format pair with no marshaller.

    The message reports ``value_type.type`` and ``config.format``; the help
    message names the target and appends ``options_message``.
    """
    message = "There is no defined way to read/write value of type '{type}' with file format '{format}'. ".format(
        type=value_type.type, format=config.format
    )
    help_template = (
        "You can provide the expected format of {target}"
        " using --PARAMETER-target switch, for example --my-input--target csv. "
        "{options_message}"
    )
    help_text = help_template.format(target=target, options_message=options_message)
    return DatabandRuntimeError(message, help_msg=help_text)