Example 1
def auto_config(extension, inputs, job_post_process_config_file, config_file,
                verbose):
    """Automatically create a configuration."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("auto_config", None, console_level=level)

    if job_post_process_config_file is not None:
        module, class_name, data = JobPostProcess.load_config_from_file(
            job_post_process_config_file)
        JobPostProcess(module, class_name, data)  # ensure everything ok
        job_post_process_config = {
            "module": module,
            "class": class_name,
            "data": data
        }
    else:
        job_post_process_config = None

    # User extension
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
    config = cli.auto_config(*inputs,
                             job_post_process_config=job_post_process_config)
    print(f"Created configuration with {config.get_num_jobs()} jobs.")
    config.dump(config_file)
    print(f"Dumped configuration to {config_file}.\n")
Example 2
def test_registry__add_logger(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    package = "test-package"
    registry.add_logger(package)
    assert package in registry.list_loggers()
    registry.remove_logger(package)
    assert package not in registry.list_loggers()
Example 3
    def __init__(
        self,
        container=None,
        job_global_config=None,
        job_post_process_config=None,
        user_data=None,
        submission_groups=None,
        setup_command=None,
        teardown_command=None,
        node_setup_command=None,
        node_teardown_command=None,
        **kwargs,
    ):
        """
        Constructs JobConfiguration.

        Parameters
        ----------
        container : JobContainerInterface

        """
        self._jobs = container or JobContainerByName()
        self._job_names = None
        self._jobs_directory = kwargs.get("jobs_directory")
        self._registry = Registry()
        self._job_global_config = job_global_config
        self._job_post_process_config = job_post_process_config
        self._user_data = user_data or {}
        self._submission_groups = [
            SubmissionGroup(**x) for x in submission_groups or []
        ]
        self._setup_command = setup_command
        self._teardown_command = teardown_command
        self._node_setup_command = node_setup_command
        self._node_teardown_command = node_teardown_command

        if kwargs.get("do_not_deserialize_jobs", False):
            assert "job_names" in kwargs, str(kwargs)
            self._job_names = kwargs["job_names"]
            return

        if "jobs" in kwargs:
            self._deserialize_jobs(kwargs["jobs"])
        elif "job_names" in kwargs:
            assert self._jobs_directory is not None, str(kwargs)
            names = kwargs["job_names"]
            self._deserialize_jobs_from_names(names)
Example 4
def deserialize_config(data, **kwargs):
    """Create instance of a JobConfiguration from a dict.

    Parameters
    ----------
    data : dict
        Dictionary loaded from a serialized config file.

    Returns
    -------
    JobConfiguration

    """
    registry = Registry()
    extension = data["extension"]
    cls = registry.get_extension_class(extension,
                                       ExtensionClassType.CONFIGURATION)
    return cls.deserialize(data, **kwargs)
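
A short usage sketch, assuming "config.json" was previously written by JobConfiguration.dump and can be read back with load_data; the filename is a placeholder.

# Rebuild the extension-specific configuration from a serialized config file.
data = load_data("config.json")    # placeholder filename
config = deserialize_config(data)
for job in config.iter_jobs():
    print(job.name)
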
Example 5
    def __init__(self,
                 inputs,
                 container,
                 job_parameters_class,
                 extension_name,
                 job_global_config=None,
                 job_post_process_config=None,
                 batch_post_process_config=None,
                 **kwargs):
        """
        Constructs JobConfiguration.

        Parameters
        ----------
        inputs : JobInputsInterface
        container : JobContainerInterface

        """
        self._extension_name = extension_name
        self._inputs = inputs
        self._jobs = container
        self._job_parameters_class = job_parameters_class
        self._job_names = None
        self._jobs_directory = kwargs.get("jobs_directory")
        self._registry = Registry()
        self._job_global_config = job_global_config
        self._job_post_process_config = job_post_process_config
        self._batch_post_process_config = batch_post_process_config

        if kwargs.get("do_not_deserialize_jobs", False):
            assert "job_names" in kwargs, str(kwargs)
            self._job_names = kwargs["job_names"]
            return

        if "jobs" in kwargs:
            self._deserialize_jobs(kwargs["jobs"])
        elif "job_names" in kwargs:
            assert self._jobs_directory is not None, str(kwargs)
            names = kwargs["job_names"]
            self._deserialize_jobs_from_names(names)
Example 6
def test_registry__reset_defaults(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    clear_extensions(registry)
    registry.reset_defaults()
    assert len(registry.list_extensions()) == len(
        DEFAULT_REGISTRY["extensions"])
    assert registry.list_loggers() == DEFAULT_REGISTRY["logging"]
Example 7
def test_registry__show_extensions(capsys, registry_fixture):
    """Test functionality of show_extensions."""
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    registry.show_extensions()
    captured = capsys.readouterr()
    for extension in DEFAULT_REGISTRY["extensions"]:
        assert extension["name"] in captured.out
Example 8
def deserialize_config(data, **kwargs):
    """Create instance of a JobConfiguration from a dict.

    Parameters
    ----------
    data : dict
        Dictionary loaded from a serialized config file.

    Returns
    -------
    JobConfiguration

    """
    registry = Registry()
    config_module = data["configuration_module"]
    config_class = data["configuration_class"]
    for ext in registry.iter_extensions():
        ext_cfg_class = ext[ExtensionClassType.CONFIGURATION]
        if ext_cfg_class.__module__ == config_module and ext_cfg_class.__name__ == config_class:
            return ext_cfg_class.deserialize(data, **kwargs)

    raise InvalidParameter(
        f"Cannot deserialize {config_module}.{config_class}")
Example 9
def remove_logger(package_name):
    """Remove logging for a package."""
    registry = Registry()
    registry.remove_logger(package_name)
Example 10
def add_logger(package_name):
    """Add logging for a package."""
    registry = Registry()
    registry.add_logger(package_name)
Example 11
def unregister(extension):
    """Unregister an extension."""
    registry = Registry()
    registry.unregister_extension(extension)
Example 12
def reset_defaults():
    """Reset registry to its default values."""
    Registry().reset_defaults()
Example 13
def register(extension_file):
    """Register one or more extensions."""
    registry = Registry()
    for extension in load_data(extension_file):
        registry.register_extension(extension)
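
A hedged usage sketch: the filename is a placeholder, and the file must contain a list of extension definitions in whatever schema Registry.register_extension accepts (each entry includes at least a "name" field, per the registry tests elsewhere in this listing).

# Hypothetical call; the file content must match the registry's extension schema.
register("my_extensions.json")
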
Example 14
def setup_logging(name, filename, console_level=logging.INFO,
                  file_level=logging.INFO, packages=None):
    """Configures logging to file and console.

    Parameters
    ----------
    name : str
        logger name
    filename : str | None
        log filename
    console_level : int, optional
        console log level
    file_level : int, optional
        file log level
    packages : list, optional
        enable logging for these package names

    """
    log_config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "basic": {
                "format": "%(message)s"
            },
            "short": {
                "format": "%(asctime)s - %(levelname)s [%(name)s "
                          "%(filename)s:%(lineno)d] : %(message)s",
            },
            "detailed": {
                "format": "%(asctime)s - %(levelname)s [%(name)s "
                          "%(filename)s:%(lineno)d] : %(message)s",
            },
        },
        "handlers": {
            "console": {
                "level": console_level,
                "formatter": "short",
                "class": "logging.StreamHandler",
            },
            "file": {
                "class": "logging.FileHandler",
                "level": file_level,
                "filename": filename,
                "mode": "w",
                "formatter": "detailed",
            },
            "structured_file": {
                "class": "logging.FileHandler",
                "level": file_level,
                "filename": filename,
                "mode": "a",
                "formatter": "basic"
            }
        },
        "loggers": {
            name: {
                "handlers": ["console", "file"],
                "level": "DEBUG",
                "propagate": False
            },
            "event": {
                "handlers": ["console", "structured_file"],
                "level": "DEBUG",
                "propagate": False
            }
        },
        #"root": {
        #    "handlers": ["console", "file"],
        #    "level": "WARN",
        #},
    }

    logging_packages = set(Registry().list_loggers())
    if packages is not None:
        for package in packages:
            logging_packages.add(package)

    for package in logging_packages:
        log_config["loggers"][package] = {
            "handlers": ["console", "file"],
            "level": "DEBUG",
            "propagate": False,
        }

    if filename is None:
        log_config["handlers"].pop("file")
        log_config["loggers"][name]["handlers"].remove("file")
        for package in logging_packages:
            log_config["loggers"][package]["handlers"].remove("file")

    # For event logging
    if name == "event":
        log_config["handlers"].pop("file")
        for package in logging_packages:
            log_config["loggers"].pop(package)
    else:
        log_config["handlers"].pop("structured_file")
        log_config["loggers"]["event"]["handlers"].remove("structured_file")

    logging.config.dictConfig(log_config)
    logger = logging.getLogger(name)

    return logger
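
A usage sketch derived from the signature above; the logger name, file path, and package name are placeholders.

# Console gets WARNING and above, the file gets DEBUG, and logging is also
# enabled for one extra package. All names here are placeholders.
logger = setup_logging(
    "my_tool",
    "/tmp/my_tool.log",
    console_level=logging.WARNING,
    file_level=logging.DEBUG,
    packages=["my_package"],
)
logger.debug("written to the file only")
logger.warning("written to both the file and the console")

# Passing filename=None drops the file handler and logs to the console only.
console_logger = setup_logging("my_console_tool", None, console_level=logging.INFO)
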
Example 15
def test_registry__is_registered(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    assert registry.is_registered(DEFAULT_REGISTRY["extensions"][0]["name"])
Example 16
def test_registry__list_extensions(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    assert len(registry.list_extensions()) == len(
        DEFAULT_REGISTRY["extensions"])
Example 17
def run(extension, **kwargs):
    """Runs individual job."""
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    # Parse arguments
    config_file = kwargs["config_file"]
    name = kwargs["name"]
    output = kwargs["output"]
    output_format = kwargs["output_format"]
    verbose = kwargs["verbose"]
    level = logging.DEBUG if verbose else logging.INFO

    # Create directory for current job
    job_dir = os.path.join(output, name)
    os.makedirs(job_dir, exist_ok=True)
    # Structured event logging setup
    event_file = os.path.join(job_dir, "events.log")
    setup_event_logging(event_file)

    # General logging setup
    log_file = os.path.join(job_dir, "run.log")
    general_logger = setup_logging(
        extension,
        log_file,
        console_level=logging.ERROR,
        file_level=level,
    )
    general_logger.info(get_cli_string())

    # Create config for run
    try:
        cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
        ret = cli.run(config_file, name, output, output_format, verbose)
    except Exception as err:
        msg = f"unexpected exception in run '{extension}' job={name} - {err}"
        general_logger.exception(msg)
        event = StructuredErrorLogEvent(
            source=name,
            category=EVENT_CATEGORY_ERROR,
            name=EVENT_NAME_UNHANDLED_ERROR,
            message=msg,
        )
        log_event(event)
        ret = 1

    if ret == 0:
        try:
            config = load_data(config_file)
            if "job_post_process_config" in config.keys():
                post_process = JobPostProcess(
                    module_name=config["job_post_process_config"]["module"],
                    class_name=config["job_post_process_config"]["class"],
                    data=config["job_post_process_config"]["data"],
                    job_name=name,
                    output=output,
                )
                post_process.run(config_file=config_file, output=output)
        except Exception as err:
            msg = f"unexpected exception in post-process '{extension}' job={name} - {err}"
            general_logger.exception(msg)
            event = StructuredErrorLogEvent(
                source=name,
                category=EVENT_CATEGORY_ERROR,
                name=EVENT_NAME_UNHANDLED_ERROR,
                message=msg,
            )
            log_event(event)
            ret = 1

    sys.exit(ret)
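
A hedged sketch of invoking run directly with the keyword arguments it reads from kwargs; all values are placeholders. Note that run ends with sys.exit, so it is normally called from a CLI entry point rather than from library code.

# Placeholders throughout; run() terminates the process via sys.exit(ret).
run(
    "demo",
    config_file="config.json",
    name="job1",
    output="output",
    output_format="csv",
    verbose=False,
)
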
Example 18
class JobConfiguration(abc.ABC):
    """Base class for any simulation configuration."""

    FILENAME_DELIMITER = "_"

    def __init__(self,
                 inputs,
                 container,
                 job_parameters_class,
                 extension_name,
                 job_global_config=None,
                 job_post_process_config=None,
                 batch_post_process_config=None,
                 **kwargs):
        """
        Constructs JobConfiguration.

        Parameters
        ----------
        inputs : JobInputsInterface
        container : JobContainerInterface

        """
        self._extension_name = extension_name
        self._inputs = inputs
        self._jobs = container
        self._job_parameters_class = job_parameters_class
        self._job_names = None
        self._jobs_directory = kwargs.get("jobs_directory")
        self._registry = Registry()
        self._job_global_config = job_global_config
        self._job_post_process_config = job_post_process_config
        self._batch_post_process_config = batch_post_process_config

        if kwargs.get("do_not_deserialize_jobs", False):
            assert "job_names" in kwargs, str(kwargs)
            self._job_names = kwargs["job_names"]
            return

        if "jobs" in kwargs:
            self._deserialize_jobs(kwargs["jobs"])
        elif "job_names" in kwargs:
            assert self._jobs_directory is not None, str(kwargs)
            names = kwargs["job_names"]
            self._deserialize_jobs_from_names(names)

    def __repr__(self):
        """Concisely display all instance information."""
        return self.dumps()

    def _deserialize_jobs(self, jobs):
        for job_ in jobs:
            job = self._job_parameters_class.deserialize(job_)
            self.add_job(job)

    def _deserialize_jobs_from_names(self, job_names):
        for name in job_names:
            job = self._get_job_by_name(name)
            self.add_job(job)

    def _dump(self, stream=sys.stdout, fmt=".json", indent=2):
        # Note: the default is JSON here because parsing 100 MB .toml files
        # is an order of magnitude slower.
        data = self.serialize()
        if fmt == ".json":
            json.dump(data, stream, indent=indent, cls=ExtendedJSONEncoder)
        elif fmt == ".toml":
            toml.dump(data, stream)
        else:
            assert False, fmt

    def _get_job_by_name(self, name):
        assert self._jobs_directory is not None
        filename = os.path.join(self._jobs_directory, name) + ".json"
        assert os.path.exists(filename), filename
        return self._job_parameters_class.deserialize(load_data(filename))

    @abc.abstractmethod
    def _serialize(self, data):
        """Create implementation-specific data for serialization."""

    def check_job_dependencies(self):
        """Check for impossible conditions with job dependencies.

        Raises
        ------
        InvalidConfiguration
            Raised if job dependencies have an impossible condition.

        """
        # This currently only checks that all jobs defined as blocking exist.
        # It does not look for deadlocks.

        job_names = set()
        blocking_jobs = set()
        for job in self.iter_jobs():
            job_names.add(job.name)
            blocking_jobs.update(job.get_blocking_jobs())

        missing_jobs = blocking_jobs.difference(job_names)
        if missing_jobs:
            for job in missing_jobs:
                logger.error("%s is blocking a job but does not exist", job)
            raise InvalidConfiguration("job ordering definitions are invalid")

    @abc.abstractmethod
    def create_from_result(self, job, output_dir):
        """Create an instance from a result file.

        Parameters
        ----------
        job : JobParametersInterface
        output_dir : str

        Returns
        -------
        class

        """

    @property
    def extension_name(self):
        """Return the extension name for the configuration."""
        return self._extension_name

    @abc.abstractmethod
    def get_job_inputs(self):
        """Return the inputs required to run a job."""

    def add_job(self, job):
        """Add a job to the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        self._jobs.add_job(job)

    def clear(self):
        """Clear all configured jobs."""
        self._jobs.clear()

    @timed_debug
    def dump(self, filename=None, stream=sys.stdout, indent=2):
        """Convert the configuration to structured text format.

        Parameters
        ----------
        filename : str | None
            Write configuration to this file (must be .json or .toml).
            If None, write the text to stream.
            Recommend using .json for large files. .toml is much slower.
        stream : file
            File-like interface that supports write().
        indent : int
            If JSON, use this indentation.

        Raises
        ------
        InvalidParameter
            Raised if filename does not have a supported extension.

        """
        if filename is None and stream is None:
            raise InvalidParameter("must set either filename or stream")

        if filename is not None:
            ext = os.path.splitext(filename)[1]
            if ext not in (".json", ".toml"):
                raise InvalidParameter("Only .json and .toml are supported")

            with open(filename, "w") as f_out:
                self._dump(f_out, fmt=ext, indent=indent)
        else:
            self._dump(stream, indent=indent)

        logger.info("Dumped configuration to %s", filename)

    def dumps(self, fmt_module=toml, **kwargs):
        """Dump the configuration to a formatted string."""
        return fmt_module.dumps(self.serialize(), **kwargs)

    @classmethod
    def deserialize(cls, filename_or_data, do_not_deserialize_jobs=False):
        """Create a class instance from a saved configuration file.

        Parameters
        ----------
        filename_or_data : str | dict
            path to configuration file or that file loaded as a dict
        do_not_deserialize_jobs : bool
            Set to True to avoid the overhead of loading all jobs from disk.
            Job_names will be stored instead of jobs.

        Returns
        -------
        class

        Raises
        ------
        InvalidParameter
            Raised if the config file has invalid parameters.

        """
        if isinstance(filename_or_data, str):
            data = load_data(filename_or_data)
        else:
            data = filename_or_data

        # Don't create an inputs object. It can be very expensive and we don't
        # need it unless the user wants to change the config.
        # TODO: implement user-friendly error messages when they try to access
        # inputs.
        inputs = None
        data["do_not_deserialize_jobs"] = do_not_deserialize_jobs
        return cls(inputs, **data)

    def get_job(self, name):
        """Return the job matching name.

        Returns
        -------
        namedtuple

        """
        if self.get_num_jobs() == 0 and self._job_names is not None:
            # We loaded from a config file with names only.
            return self._get_job_by_name(name)

        return self._jobs.get_job(name)

    def get_parameters_class(self):
        """Return the class used for job parameters."""
        return self._job_parameters_class

    def get_num_jobs(self):
        """Return the number of jobs in the configuration.

        Returns
        -------
        int

        """
        return self._jobs.get_num_jobs()

    @property
    def job_global_config(self):
        """Return the global configs applied to all jobs."""
        return self._job_global_config

    @property
    def job_post_process_config(self):
        """Return post process config for jobs"""
        return self._job_post_process_config

    @property
    def batch_post_process_config(self):
        """Return batch post process config for task"""
        return self._batch_post_process_config

    @batch_post_process_config.setter
    def batch_post_process_config(self, data):
        self._batch_post_process_config = data

    @property
    def inputs(self):
        """Return the instance of JobInputsInterface for the job."""
        return self._inputs

    def iter_jobs(self):
        """Yields a generator over all jobs.

        Yields
        ------
        iterator over JobParametersInterface

        """
        return self._jobs.iter_jobs()

    @timed_debug
    def list_jobs(self):
        """Return a list of all jobs.

        Returns
        -------
        list
            list of JobParametersInterface

        """
        return list(self.iter_jobs())

    @timed_debug
    def reconfigure_jobs(self, jobs):
        """Reconfigure with a list of jobs.

        Parameters
        ----------
        list of DistributionConfiguration.parameter_type

        """
        self.clear()

        for job in jobs:
            self.add_job(job)

        logger.info("Reconfigured jobs.")

    def remove_job(self, job):
        """Remove a job from the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        return self._jobs.remove_job(job)

    def run_job(self, job, output, **kwargs):
        """Run the job.

        Parameters
        ----------
        job : JobParametersInterface
        output : str
            output directory

        Returns
        -------
        int

        """
        logger.debug("job=%s kwargs=%s", job, kwargs)
        cls = self.job_execution_class()
        job_execution = cls.create(self.get_job_inputs(), job, output)
        return job_execution.run(**kwargs)

    def serialize(self, include=ConfigSerializeOptions.JOBS):
        """Create data for serialization."""
        data = {
            "class": self.__class__.__name__,
            "extension": self.extension_name,
            "jobs_directory": self._jobs_directory,
        }
        if self._job_global_config:
            data["job_global_config"] = self._job_global_config

        if self._job_post_process_config:
            data["job_post_process_config"] = self._job_post_process_config

        if self._batch_post_process_config:
            data["batch_post_process_config"] = self._batch_post_process_config

        if include == ConfigSerializeOptions.JOBS:
            data["jobs"] = [x.serialize() for x in self.iter_jobs()]
        elif include == ConfigSerializeOptions.JOB_NAMES:
            data["job_names"] = [x.name for x in self.iter_jobs()]

        # Fill in instance-specific information.
        self._serialize(data)
        return data

    def serialize_jobs(self, directory):
        """Serializes main job data to job-specific files.

        Parameters
        ----------
        directory : str

        """
        for job in self.iter_jobs():
            basename = job.name + ".json"
            job_filename = os.path.join(directory, basename)
            dump_data(job.serialize(), job_filename, cls=ExtendedJSONEncoder)

        # We will need this to deserialize from a filename that includes only
        # job names.
        self._jobs_directory = directory

    def serialize_for_execution(self, scratch_dir, are_inputs_local=True):
        """Serialize config data for efficient execution.

        Parameters
        ----------
        scratch_dir : str
            Temporary storage space on the local system.
        are_inputs_local : bool
            Whether the existing input data is local to this system. For many
            configurations accessing the input data across the network by many
            concurrent workers can cause a bottleneck and so implementations
            may wish to copy the data locally before execution starts. If the
            storage access time is very fast the question is irrelevant.

        Returns
        -------
        str
            Name of serialized config file in scratch directory.

        """
        self._transform_for_local_execution(scratch_dir, are_inputs_local)

        # Split up the jobs to individual files so that each worker can just
        # read its own info.
        self.serialize_jobs(scratch_dir)
        data = self.serialize(ConfigSerializeOptions.JOB_NAMES)
        config_file = os.path.join(scratch_dir, CONFIG_FILE)
        dump_data(data, config_file)
        logger.info("Dumped config file locally to %s", config_file)

        return config_file

    def _transform_for_local_execution(self, scratch_dir, are_inputs_local):
        """Transform data for efficient execution in a local environment.
        Default implementation is a no-op. Derived classes can override.

        """

    def show_jobs(self):
        """Show the configured jobs."""
        for job in self.iter_jobs():
            print(job)

    def job_execution_class(self):
        """Return the class used for job execution.

        Returns
        -------
        class

        """
        return self._registry.get_extension_class(self.extension_name,
                                                  ExtensionClassType.EXECUTION)
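
A minimal sketch of what a concrete subclass must provide: the constructor arguments this base class expects plus the three abstract methods. All names prefixed with "My" are illustrative, and a real subclass would also need to accept the keyword arguments produced by serialize() for deserialize() to work.

class MyJobConfiguration(JobConfiguration):
    """Hypothetical concrete configuration; names here are illustrative only."""

    def __init__(self, inputs, container, job_parameters_class, **kwargs):
        super().__init__(inputs, container, job_parameters_class,
                         "my_extension", **kwargs)

    def _serialize(self, data):
        # Record implementation-specific fields in the serialized dict.
        data["my_setting"] = "example"

    def create_from_result(self, job, output_dir):
        # Rebuild whatever object represents this job's result; placeholder.
        return None

    def get_job_inputs(self):
        return self._inputs
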
Example 19
def show():
    """Show the available extensions (job types)."""
    print("Extensions:")
    Registry().show_extensions()
    print("Logging enabled for packages:  ", end="")
    Registry().show_loggers()
Example 20
    def submit_jobs(self,
                    name="job",
                    per_node_batch_size=DEFAULTS["per_node_batch_size"],
                    max_nodes=DEFAULTS["max_nodes"],
                    force_local=False,
                    verbose=False,
                    poll_interval=DEFAULTS["poll_interval"],
                    num_processes=None,
                    previous_results=None,
                    reports=True,
                    try_add_blocked_jobs=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        Parameters
        ----------
        name : str
            batch name, applies to HPC job submission only
        per_node_batch_size : int
            Number of jobs to run on one node in one batch.
        max_nodes : int
            Max number of node submission requests to make in parallel.
        force_local : bool
            If on HPC, run jobs through subprocess as if local.
        verbose : bool
            Enable debug logging.
        poll_interval : int
            Interval in seconds at which to poll jobs.
        num_processes : int
            Number of processes to run in parallel; defaults to num CPUs

        Returns
        -------
        Status

        """
        logger.info("Submit %s jobs for execution.",
                    self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)

        self._config.check_job_dependencies()

        self._hpc = HpcManager(self._hpc_config_file, self._output)
        result = Status.GOOD

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        start_time = time.time()
        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            runner = JobRunner(self._config_file, output=self._output)
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
        else:
            self._submit_to_hpc(name, max_nodes, per_node_batch_size, verbose,
                                poll_interval, num_processes,
                                try_add_blocked_jobs)

        results_summary = ResultsAggregatorSummary(self._results_dir)
        self._results = results_summary.get_results()
        if len(self._results) != self._config.get_num_jobs():
            logger.error(
                "Number of results doesn't match number of jobs: "
                "results=%s jobs=%s. Check for process crashes "
                "or HPC timeouts.", len(self._results),
                self._config.get_num_jobs())
            result = Status.ERROR

        if previous_results:
            self._results += previous_results

        self.write_results(RESULTS_FILE)
        results_summary.delete_files()
        shutil.rmtree(self._results_dir)

        self._log_error_log_messages(self._output)

        bytes_consumed = get_directory_size_bytes(self._output,
                                                  recursive=False)
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_CONFIG_EXEC_SUMMARY,
            message="config execution summary",
            config_execution_time=time.time() - start_time,
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        if reports:
            self.generate_reports(self._output)

        return result
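
A hedged call sketch: how the submitter object is constructed is not shown in this listing, so the first line below is a placeholder rather than real construction code.

submitter = ...  # an instance of the class that defines submit_jobs
status = submitter.submit_jobs(
    name="my_batch",
    per_node_batch_size=32,
    max_nodes=4,
    verbose=True,
    poll_interval=60,
)
print(status)
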
Example 21
class JobConfiguration(abc.ABC):
    """Base class for any simulation configuration."""

    FILENAME_DELIMITER = "_"
    FORMAT_VERSION = "v0.2.0"

    def __init__(
        self,
        container=None,
        job_global_config=None,
        job_post_process_config=None,
        user_data=None,
        submission_groups=None,
        setup_command=None,
        teardown_command=None,
        node_setup_command=None,
        node_teardown_command=None,
        **kwargs,
    ):
        """
        Constructs JobConfiguration.

        Parameters
        ----------
        container : JobContainerInterface

        """
        self._jobs = container or JobContainerByName()
        self._job_names = None
        self._jobs_directory = kwargs.get("jobs_directory")
        self._registry = Registry()
        self._job_global_config = job_global_config
        self._job_post_process_config = job_post_process_config
        self._user_data = user_data or {}
        self._submission_groups = [
            SubmissionGroup(**x) for x in submission_groups or []
        ]
        self._setup_command = setup_command
        self._teardown_command = teardown_command
        self._node_setup_command = node_setup_command
        self._node_teardown_command = node_teardown_command

        if kwargs.get("do_not_deserialize_jobs", False):
            assert "job_names" in kwargs, str(kwargs)
            self._job_names = kwargs["job_names"]
            return

        if "jobs" in kwargs:
            self._deserialize_jobs(kwargs["jobs"])
        elif "job_names" in kwargs:
            assert self._jobs_directory is not None, str(kwargs)
            names = kwargs["job_names"]
            self._deserialize_jobs_from_names(names)

    def __repr__(self):
        """Concisely display all instance information."""
        return self.dumps()

    def _deserialize_jobs(self, jobs):
        for _job in jobs:
            param_class = self.job_parameters_class(_job["extension"])
            job = param_class.deserialize(_job)
            self.add_job(job)

    def _deserialize_jobs_from_names(self, job_names):
        for name in job_names:
            job = self._get_job_by_name(name)
            self.add_job(job)

    def _dump(self, stream=sys.stdout, fmt=".json", indent=2):
        # Note: the default is JSON here because parsing 100 MB .toml files
        # is an order of magnitude slower.
        data = self.serialize()
        if fmt == ".json":
            json.dump(data, stream, indent=indent, cls=ExtendedJSONEncoder)
        elif fmt == ".toml":
            toml.dump(data, stream)
        else:
            assert False, fmt

    def _get_job_by_name(self, name):
        assert self._jobs_directory is not None
        filename = os.path.join(self._jobs_directory, name) + ".json"
        assert os.path.exists(filename), filename
        job = load_data(filename)
        param_class = self.job_parameters_class(job["extension"])
        return param_class.deserialize(job)

    @abc.abstractmethod
    def _serialize(self, data):
        """Create implementation-specific data for serialization."""

    def add_user_data(self, key, data):
        """Add user data referenced by a key. Must be JSON-serializable

        Parameters
        ----------
        key : str
        data : any

        Raises
        ------
        InvalidParameter
            Raised if the key is already stored.

        """
        if key in self._user_data:
            raise InvalidParameter(
                f"{key} is already stored. Call remove_user_data first")

        self._user_data[key] = data

    def get_user_data(self, key):
        """Get the user data associated with key.

        Parameters
        ----------
        key : str

        Returns
        -------
        any

        """
        data = self._user_data.get(key)
        if data is None:
            raise InvalidParameter(f"{key} is not stored.")

        return data

    def remove_user_data(self, key):
        """Remove the key from the user data config.

        Parameters
        ----------
        key : str

        """
        self._user_data.pop(key, None)

    def list_user_data_keys(self):
        """List the stored user data keys.

        Returns
        -------
        list
            list of str

        """
        return sorted(list(self._user_data.keys()))

    def check_job_dependencies(self, submitter_params):
        """Check for impossible conditions with job dependencies.

        Parameters
        ----------
        submitter_params : SubmitterParams

        Raises
        ------
        InvalidConfiguration
            Raised if job dependencies have an impossible condition.

        """
        requires_estimated_time = submitter_params.per_node_batch_size == 0

        # This currently only checks that all jobs defined as blocking exist.
        # It does not look for deadlocks.

        job_names = set()
        blocking_jobs = set()
        missing_estimate = []
        for job in self.iter_jobs():
            job_names.add(job.name)
            blocking_jobs.update(job.get_blocking_jobs())
            if requires_estimated_time and job.estimated_run_minutes is None:
                missing_estimate.append(job.name)

        missing_jobs = blocking_jobs.difference(job_names)
        if missing_jobs:
            for job in missing_jobs:
                logger.error("%s is blocking a job but does not exist", job)
            raise InvalidConfiguration("job ordering definitions are invalid")

        if missing_estimate:
            for job in missing_estimate:
                logger.error("Job %s does not define estimated_run_minutes",
                             job)
            raise InvalidConfiguration(
                "Submitting batches by time requires that each job define estimated_run_minutes"
            )

    def check_job_runtimes(self):
        """Check for any job with a longer estimated runtime than the walltime.

        Raises
        ------
        InvalidConfiguration
            Raised if any job is too long.

        """
        wall_times = {
            x.name: x.submitter_params.get_wall_time()
            for x in self.submission_groups
        }
        for job in self.iter_jobs():
            wall_time = wall_times[job.submission_group]
            if job.estimated_run_minutes is not None:
                estimate = timedelta(minutes=job.estimated_run_minutes)
                if estimate > wall_time:
                    raise InvalidConfiguration(
                        f"job {job.name} has estimated_run_minutes={estimate} longer than wall_time={wall_time}"
                    )

    def check_spark_config(self):
        """If Spark jobs are present in the config, configure the params to run
        one job at a time.

        """
        groups_with_spark_jobs = set()
        for job in self.iter_jobs():
            if job.is_spark_job():
                groups_with_spark_jobs.add(job.submission_group)

        for group_name in groups_with_spark_jobs:
            for group in self._submission_groups:
                if group.name == group_name and group.submitter_params.num_processes != 1:
                    group.submitter_params.num_processes = 1
                    logger.info(
                        "Set num_processes=1 for group=%s for Spark jobs.",
                        group_name)

    def check_submission_groups(self, submitter_params):
        """Check for invalid job submission group assignments.
        Make a default group if none are defined and assign it to each job.

        Parameters
        ----------
        submitter_params : SubmitterParams

        Raises
        ------
        InvalidConfiguration
            Raised if submission group assignments are invalid.

        """
        groups = self.submission_groups
        if not groups:
            self._assign_default_submission_group(submitter_params)
            return

        first_group = next(iter(groups))
        group_params = (
            "try_add_blocked_jobs",
            "time_based_batching",
            "num_processes",
            "hpc_config",
            "per_node_batch_size",
            "singularity_params",
            "distributed_submitter",
        )
        user_overrides = (
            "distributed_submitter",
            "generate_reports",
            "resource_monitor_interval",
            "resource_monitor_type",
            "dry_run",
            "verbose",
        )
        user_override_if_not_set = ("node_setup_script",
                                    "node_shutdown_script")
        must_be_same = ("max_nodes", "poll_interval")
        all_params = (must_be_same, group_params, user_overrides,
                      user_override_if_not_set)
        fields = {item for params in all_params for item in params}
        assert sorted(list(fields)) == sorted(
            SubmitterParams.__fields__), sorted(list(fields))
        hpc_type = first_group.submitter_params.hpc_config.hpc_type
        group_names = set()
        for group in groups:
            if group.name in group_names:
                raise InvalidConfiguration(
                    f"submission group {group.name} is listed twice")
            group_names.add(group.name)
            if group.submitter_params.hpc_config.hpc_type != hpc_type:
                raise InvalidConfiguration(
                    f"hpc_type values must be the same in all groups")
            for param in must_be_same:
                first_val = getattr(first_group.submitter_params, param)
                this_val = getattr(group.submitter_params, param)
                if this_val != first_val:
                    raise InvalidConfiguration(
                        f"{param} must be the same in all groups")
            for param in user_overrides:
                user_val = getattr(submitter_params, param)
                setattr(group.submitter_params, param, user_val)
            for param in user_override_if_not_set:
                user_val = getattr(submitter_params, param)
                group_val = getattr(group.submitter_params, param)
                if group_val is None:
                    setattr(group.submitter_params, param, user_val)

        jobs_by_group = defaultdict(list)
        for job in self.iter_jobs():
            if job.submission_group is None:
                raise InvalidConfiguration(
                    f"Job {job.name} does not have a submission group assigned"
                )
            if job.submission_group not in group_names:
                raise InvalidConfiguration(
                    f"Job {job.name} has an invalid submission group: {job.submission_group}"
                )
            jobs_by_group[job.submission_group].append(job.name)

        group_counts = {}
        for name, jobs in jobs_by_group.items():
            if not jobs:
                logger.warning(
                    "Submission group %s does not have any jobs defined", name)
            group_counts[name] = len(jobs)

        for name, count in sorted(group_counts.items()):
            logger.info("Submission group %s has %s jobs", name, count)

    def _assign_default_submission_group(self, submitter_params):
        default_name = "default"
        group = SubmissionGroup(name=default_name,
                                submitter_params=submitter_params)
        for job in self.iter_jobs():
            job.submission_group = group.name
        self.append_submission_group(group)

    @abc.abstractmethod
    def create_from_result(self, job, output_dir):
        """Create an instance from a result file.

        Parameters
        ----------
        job : JobParametersInterface
        output_dir : str

        Returns
        -------
        class

        """

    def add_job(self, job):
        """Add a job to the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        self._jobs.add_job(job)

    def clear(self):
        """Clear all configured jobs."""
        self._jobs.clear()

    @timed_debug
    def dump(self, filename=None, stream=sys.stdout, indent=2):
        """Convert the configuration to structured text format.

        Parameters
        ----------
        filename : str | None
            Write configuration to this file (must be .json or .toml).
            If None, write the text to stream.
            Recommend using .json for large files. .toml is much slower.
        stream : file
            File-like interface that supports write().
        indent : int
            If JSON, use this indentation.

        Raises
        ------
        InvalidParameter
            Raised if filename does not have a supported extension.

        """
        if filename is None and stream is None:
            raise InvalidParameter("must set either filename or stream")

        if filename is not None:
            ext = os.path.splitext(filename)[1]
            if ext not in (".json", ".toml"):
                raise InvalidParameter("Only .json and .toml are supported")

            with open(filename, "w") as f_out:
                self._dump(f_out, fmt=ext, indent=indent)
        else:
            self._dump(stream, indent=indent)

        logger.info("Dumped configuration to %s", filename)

    def dumps(self, fmt_module=toml, **kwargs):
        """Dump the configuration to a formatted string."""
        return fmt_module.dumps(self.serialize(), **kwargs)

    @classmethod
    def deserialize(cls, filename_or_data, do_not_deserialize_jobs=False):
        """Create a class instance from a saved configuration file.

        Parameters
        ----------
        filename_or_data : str | dict
            path to configuration file or that file loaded as a dict
        do_not_deserialize_jobs : bool
            Set to True to avoid the overhead of loading all jobs from disk.
            Job_names will be stored instead of jobs.

        Returns
        -------
        class

        Raises
        ------
        InvalidParameter
            Raised if the config file has invalid parameters.

        """
        if isinstance(filename_or_data, str):
            data = load_data(filename_or_data)
        else:
            data = filename_or_data

        data["do_not_deserialize_jobs"] = do_not_deserialize_jobs
        return cls(**data)

    def get_job(self, name):
        """Return the job matching name.

        Returns
        -------
        namedtuple

        """
        if self.get_num_jobs() == 0 and self._job_names is not None:
            # We loaded from a config file with names only.
            return self._get_job_by_name(name)

        return self._jobs.get_job(name)

    def get_num_jobs(self):
        """Return the number of jobs in the configuration.

        Returns
        -------
        int

        """
        return len(self._jobs)

    @property
    def job_global_config(self):
        """Return the global configs applied to all jobs."""
        return self._job_global_config

    def iter_jobs(self):
        """Yields a generator over all jobs.

        Yields
        ------
        iterator over JobParametersInterface

        """
        return iter(self._jobs)

    @timed_debug
    def list_jobs(self):
        """Return a list of all jobs.

        Returns
        -------
        list
            list of JobParametersInterface

        """
        return list(self.iter_jobs())

    def append_submission_group(self, submission_group):
        """Append a submission group.

        Parameters
        ----------
        submission_group : SubmissionGroup

        """
        self._submission_groups.append(submission_group)
        logger.info("Added submission group %s", submission_group.name)

    def get_default_submission_group(self):
        """Return the default submission group.

        Returns
        -------
        SubmissionGroup

        """
        name = next(iter(self.iter_jobs())).submission_group
        return self.get_submission_group(name)

    def get_submission_group(self, name):
        """Return the submission group matching name.

        Parameters
        ----------
        name : str

        Returns
        -------
        SubmissionGroup

        """
        for group in self.submission_groups:
            if group.name == name:
                return group

        raise InvalidParameter(f"submission group {name} is not stored")

    @property
    def submission_groups(self):
        """Return the submission groups.

        Returns
        -------
        list

        """
        return self._submission_groups

    @timed_debug
    def reconfigure_jobs(self, jobs):
        """Reconfigure with a list of jobs.

        Parameters
        ----------
        list of DistributionConfiguration.parameter_type

        """
        self.clear()

        for job in jobs:
            self.add_job(job)

        logger.info("Reconfigured jobs.")

    def remove_job(self, job):
        """Remove a job from the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        return self._jobs.remove_job(job)

    def serialize(self, include=ConfigSerializeOptions.JOBS):
        """Create data for serialization."""
        data = {
            "jobs_directory": self._jobs_directory,
            "configuration_module": self.__class__.__module__,
            "configuration_class": self.__class__.__name__,
            "format_version": self.FORMAT_VERSION,
            "user_data": self._user_data,
            "submission_groups": [x.dict() for x in self.submission_groups],
            "setup_command": self.setup_command,
            "teardown_command": self.teardown_command,
            "node_setup_command": self.node_setup_command,
            "node_teardown_command": self.node_teardown_command,
        }
        if self._job_global_config:
            data["job_global_config"] = self._job_global_config

        if self._job_post_process_config:
            data["job_post_process_config"] = self._job_post_process_config

        if include == ConfigSerializeOptions.JOBS:
            data["jobs"] = [x.serialize() for x in self.iter_jobs()]
        elif include == ConfigSerializeOptions.JOB_NAMES:
            data["job_names"] = [x.name for x in self.iter_jobs()]

        # Fill in instance-specific information.
        self._serialize(data)
        return data

    def serialize_jobs(self, directory):
        """Serializes main job data to job-specific files.

        Parameters
        ----------
        directory : str

        """
        for job in self.iter_jobs():
            basename = job.name + ".json"
            job_filename = os.path.join(directory, basename)
            dump_data(job.serialize(), job_filename, cls=ExtendedJSONEncoder)

        # We will need this to deserialize from a filename that includes only
        # job names.
        self._jobs_directory = directory

    def serialize_for_execution(self, scratch_dir, are_inputs_local=True):
        """Serialize config data for efficient execution.

        Parameters
        ----------
        scratch_dir : str
            Temporary storage space on the local system.
        are_inputs_local : bool
            Whether the existing input data is local to this system. For many
            configurations accessing the input data across the network by many
            concurrent workers can cause a bottleneck and so implementations
            may wish to copy the data locally before execution starts. If the
            storage access time is very fast the question is irrelevant.

        Returns
        -------
        str
            Name of serialized config file in scratch directory.

        """
        self._transform_for_local_execution(scratch_dir, are_inputs_local)

        # Split up the jobs to individual files so that each worker can just
        # read its own info.
        self.serialize_jobs(scratch_dir)
        data = self.serialize(ConfigSerializeOptions.JOB_NAMES)
        config_file = os.path.join(scratch_dir, CONFIG_FILE)
        dump_data(data, config_file, cls=ExtendedJSONEncoder)
        logger.info("Dumped config file locally to %s", config_file)

        return config_file

    @property
    def setup_command(self):
        """Command to run by submitter before submitting jobs"""
        return self._setup_command

    @setup_command.setter
    def setup_command(self, cmd):
        """Set command to run by submitter before submitting jobs"""
        self._setup_command = cmd

    @property
    def teardown_command(self):
        """Command to run by last node before completing jobs"""
        return self._teardown_command

    @teardown_command.setter
    def teardown_command(self, cmd):
        """Set command to run by last node before completing jobs"""
        self._teardown_command = cmd

    @property
    def node_setup_command(self):
        """Command to run on each node before starting jobs"""
        return self._node_setup_command

    @node_setup_command.setter
    def node_setup_command(self, cmd):
        """Set command to run on each node before starting jobs"""
        self._node_setup_command = cmd

    @property
    def node_teardown_command(self):
        """Command to run on each node after completing jobs"""
        return self._node_teardown_command

    @node_teardown_command.setter
    def node_teardown_command(self, cmd):
        """Set command to run on each node after completing jobs"""
        self._node_teardown_command = cmd

    def _transform_for_local_execution(self, scratch_dir, are_inputs_local):
        """Transform data for efficient execution in a local environment.
        Default implementation is a no-op. Derived classes can override.

        """

    def shuffle_jobs(self):
        """Shuffle the job order."""
        self._jobs.shuffle()

    def show_jobs(self):
        """Show the configured jobs."""
        for job in self.iter_jobs():
            print(job)

    def job_execution_class(self, extension_name):
        """Return the class used for job execution.

        Parameters
        ----------
        extension_name : str

        Returns
        -------
        class

        """
        return self._registry.get_extension_class(extension_name,
                                                  ExtensionClassType.EXECUTION)

    def job_parameters_class(self, extension_name):
        """Return the class used for job parameters.

        Parameters
        ----------
        extension_name : str

        Returns
        -------
        class

        """
        return self._registry.get_extension_class(
            extension_name, ExtensionClassType.PARAMETERS)
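
A short sketch of the user-data and dump/deserialize round trip this version exposes, assuming config is an instance of a concrete subclass; the key, payload, and filename are placeholders.

# config is assumed to be an instance of a concrete JobConfiguration subclass.
config.add_user_data("scenario", {"year": 2030})    # must be JSON-serializable
print(config.get_user_data("scenario"))
print(config.list_user_data_keys())

config.dump("config.json")                          # .json or .toml only
restored = type(config).deserialize("config.json")
assert restored.get_num_jobs() == config.get_num_jobs()
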
Example 22
def test_registry__register_extensions(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    clear_extensions(registry)
    extension = DEFAULT_REGISTRY["extensions"][0]
    registry.register_extension(extension)
    extensions = registry.list_extensions()
    assert len(extensions) == 1
    ext = extensions[0]

    assert ext["name"] == extension["name"]
    cfg_class = registry.get_extension_class(ext["name"],
                                             ExtensionClassType.CONFIGURATION)
    assert cfg_class == GenericCommandConfiguration
    exec_class = registry.get_extension_class(ext["name"],
                                              ExtensionClassType.EXECUTION)
    assert exec_class == GenericCommandExecution
    cli_mod = registry.get_extension_class(ext["name"], ExtensionClassType.CLI)
    assert cli_mod == cli

    # Test that the changes are reflected in a new instance.
    registry2 = Registry(registry_filename=TEST_FILENAME)
    extensions1 = registry.list_extensions()
    extensions2 = registry2.list_extensions()
    for ext1, ext2 in zip(extensions1, extensions2):
        for field in DEFAULT_REGISTRY["extensions"][0]:
            assert ext1[field] == ext2[field]
Example 23
    def submit_jobs(self, cluster, force_local=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        Parameters
        ----------
        cluster : Cluster
        force_local : bool
            If on HPC, run jobs through subprocess as if local.

        Returns
        -------
        Status

        """
        if self._is_new:
            logger.info("Submit %s jobs for execution.",
                        self._config.get_num_jobs())
            logger.info("JADE version %s", jade.version.__version__)
            registry = Registry()
            loggers = registry.list_loggers()
            logger.info("Registered modules for logging: %s",
                        ", ".join(loggers))
            self._save_repository_info(registry)

            ResultsAggregator.create(self._output)

            # If an events summary file exists, it is invalid.
            events_file = os.path.join(self._output, EVENTS_FILENAME)
            if os.path.exists(events_file):
                os.remove(events_file)

            event = StructuredLogEvent(
                source="submitter",
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_SUBMIT_COMPLETED,
                message="job submission started",
                num_jobs=self.get_num_jobs(),
            )
            log_event(event)

            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            if self._config.setup_command is not None:
                cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
                logger.info("Running setup command: %s", cmd)
                check_run_command(self._config.setup_command)
        else:
            self._handle_submission_groups()

        result = Status.IN_PROGRESS
        group = self._config.get_default_submission_group()
        groups = make_submission_group_lookup(cluster.config.submission_groups)
        self._hpc = HpcManager(groups, self._output)

        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            runner = JobRunner(self._config_file, output=self._output)
            num_processes = group.submitter_params.num_processes
            verbose = group.submitter_params.verbose
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
            agg = ResultsAggregator.load(self._output)
            agg.process_results()
            is_complete = True
        else:
            is_complete = self._submit_to_hpc(cluster)

        if is_complete:
            result = self._handle_completion(cluster)

        return result
Example 24
import os
import sys

import pytest

from jade.extensions.registry import Registry

if os.environ.get("LOCAL_SUBMITTER") is not None:
    print("You must unset the environment variable LOCAL_SUBMITTER.")
    sys.exit(1)

registry = Registry()
if not registry.is_registered("demo"):
    registry.register_demo_extension()


@pytest.fixture
def test_data_dir():
    """The path to the directory that contains the fixture data"""
    return os.path.join(os.path.dirname(__file__), "data")


@pytest.fixture
def example_output():
    return os.path.join(os.path.dirname(__file__), "data", "example_output")
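
A hedged sketch of a test consuming the fixtures above; the test name is hypothetical and it only checks a relationship that follows directly from the fixture definitions.

def test_example_output_path(test_data_dir, example_output):
    """Hypothetical test using the fixtures defined in this conftest."""
    assert example_output == os.path.join(test_data_dir, "example_output")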