Example #1
    def dump(self, filename=None, stream=sys.stdout, indent=2):
        """Convert the configuration to structured text format.

        Parameters
        ----------
        filename : str | None
            Write configuration to this file (must be .json or .toml).
            If None, write the text to stream.
            Using .json is recommended for large files; .toml is much slower.
        stream : file
            File-like interface that supports write().
        indent : int
            If JSON, use this indentation.

        Raises
        ------
        InvalidParameter
            Raised if filename does not have a supported extension.

        """
        if filename is None and stream is None:
            raise InvalidParameter("must set either filename or stream")

        if filename is not None:
            ext = os.path.splitext(filename)[1]
            if ext not in (".json", ".toml"):
                raise InvalidParameter("Only .json and .toml are supported")

            with open(filename, "w") as f_out:
                self._dump(f_out, fmt=ext, indent=indent)
        else:
            self._dump(stream, indent=indent)

        logger.info("Dumped configuration to %s", filename)
Example #2
    def _get_key(job=None, key=None):
        if key is None and job is None:
            raise InvalidParameter("either key or job must be passed")
        if key is not None and job is not None:
            raise InvalidParameter("only one of key and job can be " "passed")
        if key is None:
            key = job.name

        return key
Example #3
def output_to_file(data, filename=None, stream=sys.stdout, indent=2):
    if filename is None and stream is None:
        raise InvalidParameter("must set either filename or stream")

    if filename is not None:
        ext = os.path.splitext(filename)[1]
        if ext not in (".json", ".toml"):
            raise InvalidParameter("Only .json and .toml are supported")

        with open(filename, "w") as f_out:
            _write_file(data, f_out, fmt=ext)
    else:
        _write_file(data, stream)

    logger.info("Dumped configuration to %s", filename)
Example #4
    def _add_extension(self, extension):
        for field in DEFAULT_REGISTRY["extensions"][0]:
            if field not in extension:
                raise InvalidParameter(f"required field {field} not present")

        try:
            cmod = importlib.import_module(
                extension["job_configuration_module"])
            emod = importlib.import_module(extension["job_execution_module"])
            pmod = importlib.import_module(extension["job_parameters_module"])
            cli_mod = importlib.import_module(extension["cli_module"])
        except ImportError as exc:
            if "statsmodels" in exc.msg:
                # Older versions of Jade installed the demo extension into the registry as
                # well as its dependencies. Newer versions do not. This causes import errors
                # when a user upgrades to the newer version.
                # Remove the demo extension. The user can add it later if they want.
                # This can be removed whenever all users have gone through an upgrade.
                self._remove_demo_extension()
                return
            else:
                raise

        ext = copy.copy(extension)
        ext[ExtensionClassType.CONFIGURATION] = getattr(
            cmod, extension["job_configuration_class"])
        ext[ExtensionClassType.EXECUTION] = getattr(
            emod, extension["job_execution_class"])
        ext[ExtensionClassType.PARAMETERS] = getattr(
            pmod, extension["job_parameters_class"])
        ext[ExtensionClassType.CLI] = cli_mod

        self._extensions[extension["name"]] = ext
Example #5
def read_dataframe_handle_missing(filename, index_col=None, columns=None):
    """Convert filename to a dataframe. Returns None if the file is missing.

    Parameters
    ----------
    filename : str
    index_col : str | int | None
        Index column name or index
    columns : list or None
        Use these columns if the file is CSV and does not define them.

    Returns
    -------
    pd.DataFrame | None

    """
    if not os.path.exists(filename):
        directory = os.path.split(filename)[0]
        if os.path.exists(directory) and not os.listdir(directory):
            logger.warning("missing data %s", filename)
            return None

        raise InvalidParameter(f"directory={directory} does not exist.")

    return read_dataframe(filename, index_col=index_col, columns=columns)
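
A hedged usage sketch (the path and column name are illustrative):

    df = read_dataframe_handle_missing("output/job1/results.csv", index_col="timestamp")
    if df is None:
        logger.warning("job1 produced no data")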
Example #6
    def create_config(self, config_file):
        """Creates a configuration from a config file.

        Parameters
        ----------
        config_file : str | dict
            HPC config

        Returns
        -------
        dict

        """
        if isinstance(config_file, dict):
            config = config_file
        else:
            if not os.path.exists(config_file):
                raise FileNotFoundError(
                    f"HPC config file {config_file} does not exist")
            config = load_data(config_file)

        for param in self.get_required_config_params():
            if param not in config["hpc"]:
                raise InvalidParameter(f"missing HPC config parameter {param}")

        for param, val in self.get_optional_config_params().items():
            if param not in config["hpc"]:
                config["hpc"][param] = val

        return config
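
A minimal sketch of input that would pass the checks above; the parameter names under "hpc" depend on get_required_config_params and are hypothetical:

    hpc_config = {
        "hpc": {
            "allocation": "my-project",
            "walltime": "04:00:00",
        }
    }
    config = mgr.create_config(hpc_config)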
Example #7
    def _submit_next_stage(self, stage_num, return_code=None):
        if return_code is None:
            assert stage_num == 1, str(stage_num)
        else:
            if stage_num != self.stage_num + 1:
                raise InvalidParameter(
                    f"expected stage_num {self.stage_num + 1}, received {stage_num}"
                )

            self._config.stages[stage_num - 2].return_code = return_code
            self._config.stage_num += 1

        if self._config.stage_num == len(self._config.stages) + 1:
            logger.info("Pipeline is complete")
            self._config.is_complete = True
            self._serialize()
            return

        logger.info("Start execution pipeline stage %s/%s", stage_num,
                    len(self._config.stages))

        self._serialize()
        stage = self._config.stages[self.stage_num - 1]
        os.environ["JADE_PIPELINE_STAGE_ID"] = str(self.stage_num)
        self._run_auto_config(stage)
        output = self.get_stage_output_path(self.path, self.stage_num)
        ret = JobSubmitter.run_submit_jobs(
            stage.config_file,
            output,
            stage.submitter_params,
            pipeline_stage_num=self.stage_num,
        )
        if ret != 0:
            raise ExecutionError(f"stage {self.stage_num} failed")
Example #8
    def __init__(self, package):
        # This will be the directory containing the package.
        self._path = os.path.dirname(package.__path__[0])

        if not os.path.exists(os.path.join(self._path, ".git")):
            raise InvalidParameter("{package} is not in a git repository")

        self._patch_filename = None
Example #9
    def wrapped(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except FileNotFoundError:
            msg = "one or more input parameters do not exist"
            logger.debug(msg, exc_info=True)
            raise InvalidParameter("{}: {}".format(msg, args[1:]))

        return result
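
This wrapped function is the inner half of a decorator; a hedged sketch of the full pattern it implies (the decorator name handle_file_not_found is hypothetical):

    import functools

    def handle_file_not_found(func):
        # Convert FileNotFoundError into InvalidParameter with context.
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            try:
                result = func(*args, **kwargs)
            except FileNotFoundError:
                msg = "one or more input parameters do not exist"
                logger.debug(msg, exc_info=True)
                raise InvalidParameter("{}: {}".format(msg, args[1:]))
            return result

        return wrapped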
Example #10
    def wrapped(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except KeyError as err:
            msg = "invalid parameter: {}".format(err)
            logger.debug(msg, exc_info=True)
            raise InvalidParameter(msg)

        return result
Example #11
    def show_results(self, only_failed=False, only_successful=False):
        """Show the results in a table."""
        if only_successful and only_failed:
            raise InvalidParameter(
                "only_failed and only_successful are mutually exclusive"
            )

        print(f"Results from directory: {self._output_dir}")
        print(f"JADE Version: {self._results['jade_version']}")
        print(f"{self._results['timestamp']}\n")

        #if "repository_info" in self._results:
        #    git_status = self._results["repository_info"]["status"]
        #    print(f"git status:  {git_status}")

        num_successful = 0
        num_failed = 0
        table = PrettyTable()
        table.field_names = ["Job Name", "Return Code", "Status",
                             "Execution Time (s)", "Completion Time"]
        min_exec = 0
        max_exec = 0
        if self._results["results"]:
            min_exec = self._results["results"][0].exec_time_s
            max_exec = self._results["results"][0].exec_time_s

        exec_times = []
        for result in self._results["results"]:
            if result.return_code == 0 and result.status == "finished":
                num_successful += 1
            else:
                num_failed += 1
            if only_failed and result.return_code == 0:
                continue
            if only_successful and result.return_code != 0:
                continue
            if result.exec_time_s < min_exec:
                min_exec = result.exec_time_s
            if result.exec_time_s > max_exec:
                max_exec = result.exec_time_s
            exec_times.append(result.exec_time_s)
            table.add_row([result.name, result.return_code, result.status,
                           result.exec_time_s, datetime.fromtimestamp(result.completion_time)])

        total = num_successful + num_failed
        assert total == len(self._results["results"])
        avg_exec = sum(exec_times) / len(exec_times) if exec_times else 0.0

        print(table)
        print(f"\nNum successful: {num_successful}")
        print(f"Num failed: {num_failed}")
        print(f"Total: {total}\n")
        print("Avg execution time (s): {:.2f}".format(avg_exec))
        print("Min execution time (s): {:.2f}".format(min_exec))
        print("Max execution time (s): {:.2f}\n".format(max_exec))
Example #12
    def add_job(self, job, key=None):
        if key is None:
            key = job.name

        if key in self._jobs:
            raise InvalidParameter(f"key={key} is already stored")

        check_filename(key)

        self._jobs[key] = job
        logger.debug("Added job %s", key)
Example #13
def _get_module_from_extension(filename, **kwargs):
    ext = os.path.splitext(filename)[1].lower()
    if ext == ".json":
        mod = json
    elif ext == ".toml":
        mod = toml
    elif "mod" in kwargs:
        mod = kwargs["mod"]
    else:
        raise InvalidParameter(f"Unsupported extension {filename}")

    return mod
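
A hedged usage sketch: both json and toml expose load(), so the returned module can be used directly:

    mod = _get_module_from_extension("config.toml")
    with open("config.toml") as f_in:
        data = mod.load(f_in)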
Example #14
    def unregister_extension(self, extension_name):
        """Unregisters an extension.

        Parameters
        ----------
        extension_name : str

        """
        if extension_name not in self._extensions:
            raise InvalidParameter(
                f"extension {extension_name} isn't registered")

        self._extensions.pop(extension_name)
        self._serialize_registry()
Example #15
def check_filename(name):
    """
    Validates that a name is valid for use as a filename or directory.
    Valid characters:  letters, numbers, underscore, hyphen, period

    Parameters
    ----------
    string: str,
        A given string.

    Raises
    ------
    InvalidParameter
        Raised if the name contains illegal characters or is too long.

    """
    if not re.search(r"^[\w\.-]+$", name):
        raise InvalidParameter(f"{name} contains illegal characters.")

    if len(name) > MAX_PATH_LENGTH:
        raise InvalidParameter(
            f"length of {name} is greater than the limit of {MAX_PATH_LENGTH}."
        )
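
A hedged usage sketch:

    check_filename("results_2024-01")   # passes
    check_filename("bad/name")          # raises InvalidParameter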
Example #16
def create_config_from_previous_run(config_file,
                                    output,
                                    result_type="successful",
                                    **kwargs):
    """Create instance of a JobConfiguration from a previous config file,
    returning only those of the type given

    Parameters
    ----------
    config_file : str
        location of config
    output : str
        location of previous results
    result_type : string
        type of results

    Returns
    -------
    JobConfiguration

    Raises
    ------
    InvalidParameter
            Raised if result_type is not successful or failed

    """
    allowed_types = ["successful", "failed", "missing"]
    if result_type not in allowed_types:
        raise InvalidParameter(f"given result type invalid: {result_type}")

    config = deserialize_config(load_data(config_file))
    summary = ResultsSummary(output)
    results_of_type = []

    if result_type == "successful":
        results_of_type = summary.get_successful_results()
    elif result_type == "failed":
        results_of_type = summary.get_failed_results()
    elif result_type == "missing":
        results_of_type = summary.get_missing_jobs(config.iter_jobs())

    parameters = []
    # Note that both jobs and results have `.name`.
    for result in results_of_type:
        job_parameters = config.get_job(result.name)
        parameters.append(job_parameters)

    config.reconfigure_jobs(parameters)
    return deserialize_config(config.serialize(), **kwargs)
Example #17
    def get_submission_group(self, name):
        """Return the submission group matching name.

        Parameters
        ----------
        name : str

        Returns
        -------
        SubmissionGroup

        """
        for group in self.submission_groups:
            if group.name == name:
                return group

        raise InvalidParameter(f"submission group {name} is not stored")
Example #18
    def get_user_data(self, key):
        """Get the user data associated with key.

        Parameters
        ----------
        key : str

        Returns
        -------
        any

        """
        data = self._user_data.get(key)
        if data is None:
            raise InvalidParameter(f"{key} is not stored.")

        return data
Example #19
    def _add_extension(self, extension):
        for field in DEFAULT_REGISTRY["extensions"][0]:
            if field not in extension:
                raise InvalidParameter(f"required field {field} not present")

        cmod = importlib.import_module(extension["job_configuration_module"])
        emod = importlib.import_module(extension["job_execution_module"])
        cli_mod = importlib.import_module(extension["cli_module"])

        ext = copy.copy(extension)
        ext[ExtensionClassType.CONFIGURATION] = getattr(
            cmod, extension["job_configuration_class"])
        ext[ExtensionClassType.EXECUTION] = getattr(
            emod, extension["job_execution_class"])
        ext[ExtensionClassType.CLI] = cli_mod

        self._extensions[extension["name"]] = ext
Example #20
    def get_extension_class(self, extension_name, class_type):
        """Get the class associated with the extension.

        Parameters
        ----------
        extension_name : str
        class_type : ExtensionClassType

        Returns
        -------
        type
            The registered class (the CLI module when class_type is
            ExtensionClassType.CLI).

        Raises
        ------
        InvalidParameter
            Raised if the extension is not registered.

        """
        extension = self._extensions.get(extension_name)
        if extension is None:
            raise InvalidParameter(f"{extension_name} is not registered")

        return extension[class_type]
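
A hedged usage sketch (the extension name is hypothetical):

    config_cls = registry.get_extension_class("demo", ExtensionClassType.CONFIGURATION)
    config = config_cls.deserialize(data)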
Example #21
    def add_user_data(self, key, data):
        """Add user data referenced by a key. Must be JSON-serializable

        Parameters
        ----------
        key : str
        data : any

        Raises
        ------
        InvalidParameter
            Raised if the key is already stored.

        """
        if key in self._user_data:
            raise InvalidParameter(
                f"{key} is already stored. Call remove_user_data first")

        self._user_data[key] = data
Example #22
    def get_successful_result(self, job_name):
        """Return the successful job result from the results
        Parameters
        ----------
        job_name : str
        Returns
        -------
        dict
        Raises
        ------
        InvalidParameter
            Raised if job_name is not found.
        ExecutionError
            Raised if the result was not successful.
        """
        result = self.get_result(job_name)
        if result is None:
            raise InvalidParameter(f"result not found {job_name}")

        if result.return_code != 0 or result.status != "finished":
            raise ExecutionError(f"result wasn't successful: {result}")

        return result
Example #23
    def check_status(self, name=None, job_id=None):
        """Return the status of a job by name or ID.

        Parameters
        ----------
        name : str
            job name
        job_id : str
            job ID

        Returns
        -------
        any
            The status field of the matching HpcJobInfo.

        """
        if (name is None and job_id is None) or (
            name is not None and job_id is not None
        ):
            raise InvalidParameter("exactly one of name / job_id must be set")

        info = self._intf.check_status(name=name, job_id=job_id)
        logger.debug("info=%s", info)
        return info.status
Example #24
def deserialize_config(data, **kwargs):
    """Create instance of a JobConfiguration from a dict.

    Parameters
    ----------
    data : dict
        Dictionary loaded from a serialized config file.

    Returns
    -------
    JobConfiguration

    """
    registry = Registry()
    config_module = data["configuration_module"]
    config_class = data["configuration_class"]
    for ext in registry.iter_extensions():
        ext_cfg_class = ext[ExtensionClassType.CONFIGURATION]
        if ext_cfg_class.__module__ == config_module and ext_cfg_class.__name__ == config_class:
            return ext_cfg_class.deserialize(data, **kwargs)

    raise InvalidParameter(
        f"Cannot deserialize {config_module}.{config_class}")
Example #25
def get_result(results_file, job_name):
    """Return the job result from the results file.

    Parameters
    ----------
    results_file : str
    job_name : str

    Returns
    -------
    dict

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found.

    """
    results = load_data(results_file)
    for result in results:
        if result.name == job_name:
            return result

    raise InvalidParameter(f"result not found {job_name}")
Example #26
    def get_job_by_key(self, key):
        job = self._jobs.get(key)
        if job is None:
            raise InvalidParameter(f"job key={key} not found")

        return job
Example #27
    def get_job(self, name):
        job = self._jobs.get(name)
        if job is None:
            raise InvalidParameter(f"job={name} is not stored")

        return job
Example #28
    def show_results(self, only_failed=False, only_successful=False):
        """Show the results in a table."""
        if only_successful and only_failed:
            raise InvalidParameter(
                "only_failed and only_successful are mutually exclusive")

        print(f"Results from directory: {self._output_dir}")
        print(f"JADE Version: {self._results['jade_version']}")
        print(f"{self._results['timestamp']}\n")

        if not self._results["results"]:
            print("There are no results.")
            return

        num_successful = 0
        num_failed = 0
        num_canceled = 0
        table = PrettyTable()
        table.field_names = [
            "Job Name",
            "Return Code",
            "Status",
            "Execution Time (s)",
            "Completion Time",
        ]
        first = next(iter(self._results["results"].values()))
        min_exec = first.exec_time_s
        max_exec = first.exec_time_s
        exec_times = []
        for result in self._results["results"].values():
            if result.is_successful():
                num_successful += 1
            elif result.is_failed():
                num_failed += 1
            else:
                assert result.is_canceled()
                num_canceled += 1
            if only_failed and result.return_code == 0:
                continue
            if only_successful and result.return_code != 0:
                continue
            if result.exec_time_s < min_exec:
                min_exec = result.exec_time_s
            if result.exec_time_s > max_exec:
                max_exec = result.exec_time_s
            exec_times.append(result.exec_time_s)
            table.add_row([
                result.name,
                result.return_code,
                result.status,
                result.exec_time_s,
                datetime.fromtimestamp(result.completion_time),
            ])

        num_missing = len(self._missing_jobs)
        total = num_successful + num_failed + num_canceled + num_missing
        assert total == len(self._results["results"]) + num_missing
        avg_exec = sum(exec_times) / len(exec_times) if exec_times else 0.0

        print(table)
        print(f"\nNum successful: {num_successful}")
        print(f"Num failed: {num_failed}")
        print(f"Num canceled: {num_canceled}")
        print(f"Num missing: {num_missing}")
        print(f"Missing job names: {self._missing_jobs}")
        print(f"Total: {total}\n")
        print("Avg execution time (s): {:.2f}".format(avg_exec))
        print("Min execution time (s): {:.2f}".format(min_exec))
        print("Max execution time (s): {:.2f}\n".format(max_exec))
Example #29
def write_dataframe(df,
                    file_path,
                    compress=False,
                    keep_original=False,
                    **kwargs):
    """Write the dataframe to a file with in a format matching the extension.

    Note that the feather and h5 formats do not support row indices.
    Index columns will be lost for those formats. If the dataframe has an index
    then it should be converted to a column before calling this function.

    This function only supports storing a single dataframe inside an HDF5 file.
    It always uses the key 'data'.

    Parameters
    ----------
    df : pd.DataFrame
    file_path : str
    compress : bool
    keep_original : bool
    kwargs
        Keyword arguments passed to the underlying library.

    Raises
    ------
    InvalidParameter
        Raised if the file extension is not supported or the DataFrame
        index is set.

    """
    # A default index is a RangeIndex; anything else means the caller set an
    # index, which the feather and h5 formats would silently drop.
    if not isinstance(df.index, pd.RangeIndex):
        raise InvalidParameter("DataFrame index must not be set")

    ext = os.path.splitext(file_path)[1]

    if ext == ".csv":
        df.to_csv(file_path, **kwargs)
    elif ext == ".feather":
        df.to_feather(file_path, **kwargs)
    elif ext == ".h5":
        # HDF5 supports built-in compression, levels 1-9
        if "complevel" in kwargs:
            complevel = kwargs["complevel"]
        elif compress:
            complevel = 9
        else:
            complevel = 0
        df.to_hdf(file_path, "data", mode="w", complevel=complevel, **kwargs)
    elif ext == ".json":
        df.to_json(file_path, **kwargs)
    else:
        raise InvalidParameter(f"unsupported file extension {ext}")

    logger.debug("Created %s", file_path)

    if compress and ext != ".h5":
        zipped_path = file_path + ".gz"
        with open(file_path, "rb") as f_in:
            with gzip.open(zipped_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

        if not keep_original:
            os.remove(file_path)

        file_path = zipped_path
        logger.debug("Compressed %s", zipped_path)

    return file_path
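
A hedged usage sketch (the dataframe and path are illustrative):

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    path = write_dataframe(df, "out.csv", compress=True)  # returns "out.csv.gz"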
Example #30
def read_dataframe(filename,
                   index_col=None,
                   columns=None,
                   parse_dates=False,
                   **kwargs):
    """Convert filename to a dataframe. Supports .csv, .json, .feather, .h5.
    Handles compressed files.

    Parameters
    ----------
    filename : str
    index_col : str | int | None
        Index column name or index
    columns : list or None
        Use these columns if the file is CSV and does not define them.
    parse_dates : bool
    kwargs : kwargs
        Passed to underlying library for dataframe conversion.
        Consider setting parse_dates=True if the index is a timestamp.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    FileNotFoundError
        Raised if the file does not exist.

    """
    if not os.path.exists(filename):
        raise FileNotFoundError("filename={} does not exist".format(filename))

    needs_new_index = False
    ext = os.path.splitext(filename)
    if ext[1] == ".gz":
        ext = os.path.splitext(ext[0])[1]
        open_func = gzip.open
    else:
        ext = ext[1]
        open_func = open

    if ext == ".csv":
        df = pd.read_csv(filename,
                         index_col=index_col,
                         usecols=columns,
                         parse_dates=parse_dates,
                         **kwargs)
    elif ext == ".json":
        df = pd.read_json(filename, **kwargs)
    elif ext == ".feather":
        needs_new_index = True
        with open_func(filename, "rb") as f_in:
            df = feather.read_dataframe(f_in, **kwargs)
    elif ext == ".h5":
        # This assumes that the file has a single dataframe, and so the
        # key name is not relevant.
        df = pd.read_hdf(filename, **kwargs)
        needs_new_index = True
    else:
        raise InvalidParameter(f"unsupported file extension {ext}")

    if index_col is not None and needs_new_index:
        df.set_index(index_col, inplace=True)
        if parse_dates:
            df.set_index(pd.to_datetime(df.index), inplace=True)

    return df
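
A hedged usage sketch (the path is illustrative):

    df = read_dataframe("results.csv.gz", index_col="timestamp", parse_dates=True)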