def dump(self, filename=None, stream=sys.stdout, indent=2):
    """Convert the configuration to structured text format.

    Parameters
    ----------
    filename : str | None
        Write configuration to this file (must be .json or .toml).
        If None, write the text to stream.
        Recommend using .json for large files; .toml is much slower.
    stream : file
        File-like interface that supports write().
    indent : int
        If JSON, use this indentation.

    Raises
    ------
    InvalidParameter
        Raised if filename does not have a supported extension.

    """
    if filename is None and stream is None:
        raise InvalidParameter("must set either filename or stream")

    if filename is not None:
        ext = os.path.splitext(filename)[1]
        if ext not in (".json", ".toml"):
            raise InvalidParameter("Only .json and .toml are supported")
        with open(filename, "w") as f_out:
            self._dump(f_out, fmt=ext, indent=indent)
        # Log only when a file was actually written; previously this logged
        # "Dumped configuration to None" for the stream case.
        logger.info("Dumped configuration to %s", filename)
    else:
        self._dump(stream, indent=indent)
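# Usage sketch for dump(); `config` is a hypothetical instance of the class
# that defines this method.
config.dump("config.json")          # serialize to a JSON file
config.dump("config.toml")          # TOML also works, but is slower
config.dump(stream=sys.stdout)      # or write to an already-open stream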
def _get_key(job=None, key=None):
    if key is None and job is None:
        raise InvalidParameter("either key or job must be passed")
    if key is not None and job is not None:
        raise InvalidParameter("only one of key and job can be passed")
    if key is None:
        key = job.name
    return key
def output_to_file(data, filename=None, stream=sys.stdout, indent=2):
    if filename is None and stream is None:
        raise InvalidParameter("must set either filename or stream")

    if filename is not None:
        ext = os.path.splitext(filename)[1]
        if ext not in (".json", ".toml"):
            raise InvalidParameter("Only .json and .toml are supported")
        with open(filename, "w") as f_out:
            _write_file(data, f_out, fmt=ext)
        # Log only when a file was actually written.
        logger.info("Dumped configuration to %s", filename)
    else:
        # Pass the caller's stream; previously it was silently dropped.
        _write_file(data, stream)
def _add_extension(self, extension):
    for field in DEFAULT_REGISTRY["extensions"][0]:
        if field not in extension:
            raise InvalidParameter(f"required field {field} not present")

    try:
        cmod = importlib.import_module(extension["job_configuration_module"])
        emod = importlib.import_module(extension["job_execution_module"])
        pmod = importlib.import_module(extension["job_parameters_module"])
        cli_mod = importlib.import_module(extension["cli_module"])
    except ImportError as exc:
        if "statsmodels" in exc.msg:
            # Older versions of Jade installed the demo extension into the
            # registry as well as its dependencies. Newer versions do not.
            # This causes import errors when a user upgrades to the newer
            # version. Remove the demo extension; the user can add it later
            # if desired. This can be removed once all users have upgraded.
            self._remove_demo_extension()
            return
        raise

    ext = copy.copy(extension)
    ext[ExtensionClassType.CONFIGURATION] = getattr(
        cmod, extension["job_configuration_class"])
    ext[ExtensionClassType.EXECUTION] = getattr(
        emod, extension["job_execution_class"])
    ext[ExtensionClassType.PARAMETERS] = getattr(
        pmod, extension["job_parameters_class"])
    ext[ExtensionClassType.CLI] = cli_mod

    self._extensions[extension["name"]] = ext
def read_dataframe_handle_missing(filename, index_col=None, columns=None):
    """Convert filename to a dataframe. Returns None if the file is missing.

    Parameters
    ----------
    filename : str
    index_col : str | int | None
        Index column name or index
    columns : list or None
        Use these columns if the file is CSV and does not define them.

    Returns
    -------
    pd.DataFrame | None

    """
    if not os.path.exists(filename):
        directory = os.path.split(filename)[0]
        if os.path.exists(directory) and not os.listdir(directory):
            logger.warning("missing data %s", filename)
            return None
        # The directory may exist but contain other files; report the file
        # rather than claiming the directory is missing.
        raise InvalidParameter(f"{filename} does not exist")

    return read_dataframe(filename, index_col=index_col, columns=columns)
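# Usage sketch: read a file that may legitimately be absent. If the parent
# directory exists but is empty, the data is treated as missing and None is
# returned. "results/metrics.csv" and "timestamp" are hypothetical names.
df = read_dataframe_handle_missing("results/metrics.csv", index_col="timestamp")
if df is None:
    logger.warning("no metrics were produced")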
def create_config(self, config_file):
    """Creates a configuration from a config file.

    Parameters
    ----------
    config_file : str | dict
        HPC config

    Returns
    -------
    dict

    """
    if isinstance(config_file, dict):
        config = config_file
    else:
        if not os.path.exists(config_file):
            raise FileNotFoundError(
                f"HPC config file {config_file} does not exist")
        config = load_data(config_file)

    for param in self.get_required_config_params():
        if param not in config["hpc"]:
            raise InvalidParameter(f"missing HPC config parameter {param}")

    for param, val in self.get_optional_config_params().items():
        if param not in config["hpc"]:
            config["hpc"][param] = val

    return config
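# Illustrative call, assuming `mgr` is an instance of the class that defines
# create_config(). Parameters must be nested under an "hpc" key; "account"
# and "walltime" are hypothetical parameter names — the real required list
# comes from get_required_config_params().
config = mgr.create_config({"hpc": {"account": "my_allocation", "walltime": "4:00:00"}})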
def _submit_next_stage(self, stage_num, return_code=None):
    if return_code is None:
        assert stage_num == 1, str(stage_num)
    else:
        if stage_num != self.stage_num + 1:
            raise InvalidParameter(
                f"expected stage_num {self.stage_num + 1}, received {stage_num}"
            )
        self._config.stages[stage_num - 2].return_code = return_code
        self._config.stage_num += 1

    if self._config.stage_num == len(self._config.stages) + 1:
        logger.info("Pipeline is complete")
        self._config.is_complete = True
        self._serialize()
        return

    logger.info("Start execution pipeline stage %s/%s",
                stage_num, len(self._config.stages))
    self._serialize()

    stage = self._config.stages[self.stage_num - 1]
    os.environ["JADE_PIPELINE_STAGE_ID"] = str(self.stage_num)
    self._run_auto_config(stage)
    output = self.get_stage_output_path(self.path, self.stage_num)
    ret = JobSubmitter.run_submit_jobs(
        stage.config_file,
        output,
        stage.submitter_params,
        pipeline_stage_num=self.stage_num,
    )
    if ret != 0:
        raise ExecutionError(f"stage {self.stage_num} failed")
def __init__(self, package):
    # This will be the directory containing the package.
    self._path = os.path.dirname(getattr(package, "__path__")[0])
    if not os.path.exists(os.path.join(self._path, ".git")):
        # Note: this string was missing its f-prefix, so the message
        # printed "{package}" literally.
        raise InvalidParameter(f"{package} is not in a git repository")
    self._patch_filename = None
def wrapped(*args, **kwargs):
    try:
        result = func(*args, **kwargs)
    except FileNotFoundError:
        msg = "one or more input parameters do not exist"
        logger.debug(msg, exc_info=True)
        raise InvalidParameter("{}: {}".format(msg, args[1:]))
    return result
def wrapped(*args, **kwargs):
    try:
        result = func(*args, **kwargs)
    except KeyError as err:
        msg = "invalid parameter: {}".format(err)
        logger.debug(msg, exc_info=True)
        raise InvalidParameter(msg)
    return result
def show_results(self, only_failed=False, only_successful=False): """Show the results in a table.""" if only_successful and only_failed: raise InvalidParameter( "only_failed and only_successful are mutually exclusive" ) print(f"Results from directory: {self._output_dir}") print(f"JADE Version: {self._results['jade_version']}") print(f"{self._results['timestamp']}\n") #if "repository_info" in self._results: # git_status = self._results["repository_info"]["status"] # print(f"git status: {git_status}") num_successful = 0 num_failed = 0 table = PrettyTable() table.field_names = ["Job Name", "Return Code", "Status", "Execution Time (s)", "Completion Time"] min_exec = 0 max_exec = 0 if self._results["results"]: min_exec = self._results["results"][0].exec_time_s max_exec = self._results["results"][0].exec_time_s exec_times = [] for result in self._results["results"]: if result.return_code == 0 and result.status == "finished": num_successful += 1 else: num_failed += 1 if only_failed and result.return_code == 0: continue if only_successful and result.return_code == 1: continue if result.exec_time_s < min_exec: min_exec = result.exec_time_s if result.exec_time_s > max_exec: max_exec = result.exec_time_s exec_times.append(result.exec_time_s) table.add_row([result.name, result.return_code, result.status, result.exec_time_s, datetime.fromtimestamp(result.completion_time)]) total = num_successful + num_failed assert total == len(self._results["results"]) avg_exec = sum(exec_times) / len(exec_times) print(table) print(f"\nNum successful: {num_successful}") print(f"Num failed: {num_failed}") print(f"Total: {total}\n") print("Avg execution time (s): {:.2f}".format(avg_exec)) print("Min execution time (s): {:.2f}".format(min_exec)) print("Max execution time (s): {:.2f}\n".format(max_exec))
def add_job(self, job, key=None):
    if key is None:
        key = job.name
    if key in self._jobs:
        raise InvalidParameter(f"key={key} is already stored")
    check_filename(key)

    self._jobs[key] = job
    logger.debug("Added job %s", key)
def _get_module_from_extension(filename, **kwargs):
    ext = os.path.splitext(filename)[1].lower()
    if ext == ".json":
        mod = json
    elif ext == ".toml":
        mod = toml
    # elif ext in (".yml", ".yaml"):
    #     mod = yaml
    elif "mod" in kwargs:
        mod = kwargs["mod"]
    else:
        raise InvalidParameter(f"Unsupported extension {filename}")

    return mod
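# Illustrative behavior: the serialization module is chosen from the file
# extension (case-insensitive), with an optional fallback via kwargs.
assert _get_module_from_extension("data.json") is json
assert _get_module_from_extension("data.TOML") is toml
assert _get_module_from_extension("data.txt", mod=json) is json  # explicit fallback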
def unregister_extension(self, extension_name):
    """Unregisters an extension.

    Parameters
    ----------
    extension_name : str

    """
    if extension_name not in self._extensions:
        raise InvalidParameter(
            f"extension {extension_name} isn't registered")
    self._extensions.pop(extension_name)
    self._serialize_registry()
def check_filename(name):
    """Validates that a name is valid for use as a filename or directory.
    Valid characters: letters, numbers, underscore, hyphen, period.

    Parameters
    ----------
    name : str
        Name to validate.

    Raises
    ------
    InvalidParameter
        Raised if the name contains illegal characters or is too long.

    """
    if not re.search(r"^[\w\.-]+$", name):
        raise InvalidParameter(f"{name} contains illegal characters.")

    if len(name) > MAX_PATH_LENGTH:
        raise InvalidParameter(
            f"length of {name} is greater than the limit of {MAX_PATH_LENGTH}."
        )
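# Illustrative checks: letters, numbers, underscore, hyphen, and period pass;
# anything else raises InvalidParameter.
check_filename("results_2020-01.json")   # OK
try:
    check_filename("bad/name?.json")     # "/" and "?" are illegal
except InvalidParameter:
    pass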
def create_config_from_previous_run(config_file, output, result_type="successful", **kwargs):
    """Create an instance of a JobConfiguration from a previous config file,
    keeping only the jobs with the given result type.

    Parameters
    ----------
    config_file : str
        Location of the config file.
    output : str
        Location of the previous results.
    result_type : str
        Type of results to keep: 'successful', 'failed', or 'missing'.

    Returns
    -------
    JobConfiguration

    Raises
    ------
    InvalidParameter
        Raised if result_type is not an allowed type.

    """
    allowed_types = ["successful", "failed", "missing"]
    if result_type not in allowed_types:
        raise InvalidParameter(f"invalid result type: {result_type}")

    config = deserialize_config(load_data(config_file))
    summary = ResultsSummary(output)

    results_of_type = []
    if result_type == "successful":
        results_of_type = summary.get_successful_results()
    elif result_type == "failed":
        results_of_type = summary.get_failed_results()
    elif result_type == "missing":
        results_of_type = summary.get_missing_jobs(config.iter_jobs())

    parameters = []
    # Note that both jobs and results have `.name`.
    for result in results_of_type:
        job_parameters = config.get_job(result.name)
        parameters.append(job_parameters)

    config.reconfigure_jobs(parameters)
    return deserialize_config(config.serialize(), **kwargs)
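# Usage sketch: rebuild a configuration containing only the jobs that failed
# in a previous run, then write it out with the dump() method shown above.
# "config.json" and "output" are hypothetical paths.
config = create_config_from_previous_run("config.json", "output", result_type="failed")
config.dump("retry-config.json")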
def get_submission_group(self, name):
    """Return the submission group matching name.

    Parameters
    ----------
    name : str

    Returns
    -------
    SubmissionGroup

    """
    for group in self.submission_groups:
        if group.name == name:
            return group

    raise InvalidParameter(f"submission group {name} is not stored")
def get_user_data(self, key):
    """Get the user data associated with key.

    Parameters
    ----------
    key : str

    Returns
    -------
    any

    """
    data = self._user_data.get(key)
    if data is None:
        raise InvalidParameter(f"{key} is not stored.")
    return data
def _add_extension(self, extension): for field in DEFAULT_REGISTRY["extensions"][0]: if field not in extension: raise InvalidParameter(f"required field {field} not present") cmod = importlib.import_module(extension["job_configuration_module"]) emod = importlib.import_module(extension["job_execution_module"]) cli_mod = importlib.import_module(extension["cli_module"]) ext = copy.copy(extension) ext[ExtensionClassType.CONFIGURATION] = getattr( cmod, extension["job_configuration_class"]) ext[ExtensionClassType.EXECUTION] = getattr( emod, extension["job_execution_class"]) ext[ExtensionClassType.CLI] = cli_mod self._extensions[extension["name"]] = ext
def get_extension_class(self, extension_name, class_type):
    """Get the class associated with the extension.

    Parameters
    ----------
    extension_name : str
    class_type : ExtensionClassType

    Raises
    ------
    InvalidParameter
        Raised if the extension is not registered.

    """
    extension = self._extensions.get(extension_name)
    if extension is None:
        raise InvalidParameter(f"{extension_name} is not registered")
    return extension[class_type]
def add_user_data(self, key, data):
    """Add user data referenced by a key. The data must be JSON-serializable.

    Parameters
    ----------
    key : str
    data : any

    Raises
    ------
    InvalidParameter
        Raised if the key is already stored.

    """
    if key in self._user_data:
        raise InvalidParameter(
            f"{key} is already stored. Call remove_user_data first")
    self._user_data[key] = data
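# Usage sketch: round-trip arbitrary JSON-serializable data through a config
# instance (`config` is a hypothetical instance exposing add_user_data() and
# the get_user_data() method shown earlier).
config.add_user_data("notes", {"run": 1, "tag": "baseline"})
assert config.get_user_data("notes") == {"run": 1, "tag": "baseline"}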
def get_successful_result(self, job_name):
    """Return the successful job result from the results.

    Parameters
    ----------
    job_name : str

    Returns
    -------
    dict

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found.
    ExecutionError
        Raised if the result was not successful.

    """
    result = self.get_result(job_name)
    if result is None:
        raise InvalidParameter(f"result not found {job_name}")
    if result.return_code != 0 or result.status != "finished":
        raise ExecutionError(f"result wasn't successful: {result}")
    return result
def check_status(self, name=None, job_id=None):
    """Return the status of a job by name or ID.

    Parameters
    ----------
    name : str
        job name
    job_id : str
        job ID

    Returns
    -------
    status
        The `status` field of the job's HpcJobInfo.

    """
    if (name is None and job_id is None) or \
            (name is not None and job_id is not None):
        raise InvalidParameter("exactly one of name / job_id must be set")

    info = self._intf.check_status(name=name, job_id=job_id)
    logger.debug("info=%s", info)
    return info.status
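# Usage sketch: exactly one of name / job_id must be set. `mgr` is a
# hypothetical instance of the class that defines check_status().
status = mgr.check_status(job_id="12345")
# mgr.check_status()                          -> raises InvalidParameter
# mgr.check_status(name="job1", job_id="1")   -> raises InvalidParameter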
def deserialize_config(data, **kwargs):
    """Create instance of a JobConfiguration from a dict.

    Parameters
    ----------
    data : dict
        Dictionary loaded from a serialized config file.

    Returns
    -------
    JobConfiguration

    """
    registry = Registry()
    config_module = data["configuration_module"]
    config_class = data["configuration_class"]
    for ext in registry.iter_extensions():
        ext_cfg_class = ext[ExtensionClassType.CONFIGURATION]
        if ext_cfg_class.__module__ == config_module and \
                ext_cfg_class.__name__ == config_class:
            return ext_cfg_class.deserialize(data, **kwargs)

    raise InvalidParameter(f"Cannot deserialize {config_module}.{config_class}")
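# Usage sketch: reconstruct a configuration from its serialized form. The
# concrete class is located through the extension registry by module and
# class name. "config.json" is a hypothetical path; load_data is the same
# helper used elsewhere in this module.
config = deserialize_config(load_data("config.json"))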
def get_result(results_file, job_name):
    """Return the job result from the results file.

    Parameters
    ----------
    results_file : str
    job_name : str

    Returns
    -------
    dict

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found.

    """
    results = load_data(results_file)
    for result in results:
        if result.name == job_name:
            return result

    raise InvalidParameter(f"result not found {job_name}")
def get_job_by_key(self, key):
    job = self._jobs.get(key)
    if job is None:
        raise InvalidParameter(f"job key={key} not found")
    return job
def get_job(self, name):
    job = self._jobs.get(name)
    if job is None:
        raise InvalidParameter(f"job={name} is not stored")
    # Return the job already fetched instead of looking it up a second time.
    return job
def show_results(self, only_failed=False, only_successful=False): """Show the results in a table.""" if only_successful and only_failed: raise InvalidParameter( "only_failed and only_successful are mutually exclusive") print(f"Results from directory: {self._output_dir}") print(f"JADE Version: {self._results['jade_version']}") print(f"{self._results['timestamp']}\n") if not self._results["results"]: print("There are no results.") return num_successful = 0 num_failed = 0 num_canceled = 0 table = PrettyTable() table.field_names = [ "Job Name", "Return Code", "Status", "Execution Time (s)", "Completion Time", ] first = next(iter(self._results["results"].values())) min_exec = first.exec_time_s max_exec = first.exec_time_s exec_times = [] for result in self._results["results"].values(): if result.is_successful(): num_successful += 1 elif result.is_failed(): num_failed += 1 else: assert result.is_canceled() num_canceled += 1 if only_failed and result.return_code == 0: continue if only_successful and result.return_code != 0: continue if result.exec_time_s < min_exec: min_exec = result.exec_time_s if result.exec_time_s > max_exec: max_exec = result.exec_time_s exec_times.append(result.exec_time_s) table.add_row([ result.name, result.return_code, result.status, result.exec_time_s, datetime.fromtimestamp(result.completion_time), ]) num_missing = len(self._missing_jobs) total = num_successful + num_failed + num_canceled + num_missing assert total == len(self._results["results"]) + num_missing avg_exec = sum(exec_times) / len(exec_times) print(table) print(f"\nNum successful: {num_successful}") print(f"Num failed: {num_failed}") print(f"Num canceled: {num_canceled}") print(f"Num missing: {num_missing}") print(f"Missing job names: {self._missing_jobs}") print(f"Total: {total}\n") print("Avg execution time (s): {:.2f}".format(avg_exec)) print("Min execution time (s): {:.2f}".format(min_exec)) print("Max execution time (s): {:.2f}\n".format(max_exec))
def write_dataframe(df, file_path, compress=False, keep_original=False, **kwargs):
    """Write the dataframe to a file in a format matching the extension.

    Note that the feather and h5 formats do not support row indices.
    Index columns will be lost for those formats. If the dataframe has an
    index then it should be converted to a column before calling this
    function.

    This function only supports storing a single dataframe inside an HDF5
    file. It always uses the key 'data'.

    Parameters
    ----------
    df : pd.DataFrame
    file_path : str
    compress : bool
    keep_original : bool
    kwargs : kwargs
        Passed to the underlying library.

    Raises
    ------
    InvalidParameter
        Raised if the file extension is not supported or if the DataFrame
        index is set.

    """
    # The original check also required `not isinstance(df.index, pd.Index)`,
    # which is never true (every index is a pd.Index), so it never fired.
    if not isinstance(df.index, pd.RangeIndex):
        raise InvalidParameter("DataFrame index must not be set")

    ext = os.path.splitext(file_path)[1]
    if ext == ".csv":
        df.to_csv(file_path, **kwargs)
    elif ext == ".feather":
        df.to_feather(file_path, **kwargs)
    elif ext == ".h5":
        # HDF5 supports built-in compression, levels 1-9. Pop complevel so
        # it isn't passed twice through kwargs below.
        if "complevel" in kwargs:
            complevel = kwargs.pop("complevel")
        elif compress:
            complevel = 9
        else:
            complevel = 0
        df.to_hdf(file_path, "data", mode="w", complevel=complevel, **kwargs)
    elif ext == ".json":
        df.to_json(file_path, **kwargs)
    else:
        raise InvalidParameter(f"unsupported file extension {ext}")

    logger.debug("Created %s", file_path)

    if compress and ext != ".h5":
        zipped_path = file_path + ".gz"
        with open(file_path, "rb") as f_in:
            with gzip.open(zipped_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        if not keep_original:
            os.remove(file_path)
        file_path = zipped_path
        logger.debug("Compressed %s", zipped_path)

    return file_path
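# Usage sketch: write a dataframe as gzipped CSV and read it back. pandas is
# assumed to be imported as pd, as elsewhere in this module.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
path = write_dataframe(df, "data.csv", compress=True)  # returns "data.csv.gz"
df2 = read_dataframe(path)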
def read_dataframe(filename, index_col=None, columns=None, parse_dates=False, **kwargs):
    """Convert filename to a dataframe. Supports .csv, .json, .feather, .h5.
    Handles compressed files.

    Parameters
    ----------
    filename : str
    index_col : str | int | None
        Index column name or index
    columns : list or None
        Use these columns if the file is CSV and does not define them.
    parse_dates : bool
    kwargs : kwargs
        Passed to underlying library for dataframe conversion.
        Consider setting parse_dates=True if the index is a timestamp.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    FileNotFoundError
        Raised if the file does not exist.
    InvalidParameter
        Raised if the file extension is not supported.

    """
    if not os.path.exists(filename):
        raise FileNotFoundError("filename={} does not exist".format(filename))

    needs_new_index = False
    ext = os.path.splitext(filename)
    if ext[1] == ".gz":
        ext = os.path.splitext(ext[0])[1]
        open_func = gzip.open
    else:
        ext = ext[1]
        open_func = open

    if ext == ".csv":
        df = pd.read_csv(filename, index_col=index_col, usecols=columns,
                         parse_dates=parse_dates, **kwargs)
    elif ext == ".json":
        df = pd.read_json(filename, **kwargs)
    elif ext == ".feather":
        needs_new_index = True
        with open_func(filename, "rb") as f_in:
            df = feather.read_dataframe(f_in, **kwargs)
    elif ext == ".h5":
        # This assumes that the file has a single dataframe, and so the
        # key name is not relevant.
        df = pd.read_hdf(filename, **kwargs)
        needs_new_index = True
    else:
        raise InvalidParameter(f"unsupported file extension {ext}")

    if index_col is not None and needs_new_index:
        df.set_index(index_col, inplace=True)
        if parse_dates:
            df.set_index(pd.to_datetime(df.index), inplace=True)

    return df
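# Usage sketch: read a timestamp-indexed CSV, letting pandas parse the index
# into datetimes. "timeseries.csv" and "timestamp" are hypothetical names.
df = read_dataframe("timeseries.csv", index_col="timestamp", parse_dates=True)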