Example #1
    def load_default_settings(cls, values):
        """
        Load settings from file or environment variables.

        Loads settings from a root file if available and uses those as defaults in
        place of the built-in defaults.

        This allows setting of the config file path through environment variables.
        """
        from monty.serialization import loadfn

        config_file_path: str = values.get("CONFIG_FILE", DEFAULT_CONFIG_FILE_PATH)

        new_values = {}
        if Path(config_file_path).exists():
            new_values.update(loadfn(config_file_path))

        store = new_values.get("JOB_STORE")
        if isinstance(store, str):
            new_values["JOB_STORE"] = JobStore.from_file(store)
        elif isinstance(store, dict) and store.get("@class") == "JobStore":
            new_values["JOB_STORE"] = JobStore.from_dict(store)
        elif isinstance(store, dict):
            new_values["JOB_STORE"] = JobStore.from_dict_spec(store)

        new_values.update(values)
        return new_values
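
A minimal usage sketch for this validator (hypothetical file path; assumes the
validator sits on the JobflowSettings class shown in Example #10): pointing the
JOBFLOW_CONFIG_FILE environment variable at a YAML file makes the settings load
their defaults from that file.

import os

# point jobflow at an alternative config file before the settings are built;
# the validator above reads it via values.get("CONFIG_FILE")
os.environ["JOBFLOW_CONFIG_FILE"] = "/path/to/my_jobflow.yaml"

from jobflow.settings import JobflowSettings

settings = JobflowSettings()  # defaults now come from my_jobflow.yaml
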
Example #2
def test_from_db_file(test_data):
    import pytest

    from jobflow import JobStore

    ms = JobStore.from_file(test_data / "db.yaml")
    ms.connect()
    assert ms.docs_store.name == "mongo://localhost/jobflow_unittest/outputs"
    assert ms.additional_stores == {}

    # test gridfs
    ms = JobStore.from_file(test_data / "db_gridfs.yaml")
    ms.connect()
    data_store = ms.additional_stores["data"]
    assert ms.docs_store.name == "mongo://localhost/jobflow_unittest/outputs"
    assert data_store.name == "gridfs://localhost/jobflow_unittest/outputs_blobs"

    # test serialized
    ms = JobStore.from_file(test_data / "db_serialized.json")
    ms.connect()
    data_store = ms.additional_stores["data"]
    assert ms.docs_store.name == "mongo://localhost/jobflow_unittest/outputs"
    assert data_store.name == "gridfs://localhost/jobflow_unittest/outputs_blobs"

    # test bad file
    with pytest.raises(ValueError):
        JobStore.from_file(test_data / "db_bad.yaml")
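
For reference, a sketch of the plain dict specification that
JobStore.from_dict_spec accepts; the keys follow the format documented in the
JobflowSettings docstring in Example #10, and the values here are illustrative
only.

from jobflow import JobStore

spec = {
    "docs_store": {
        "type": "MongoStore",
        "database": "jobflow_unittest",
        "collection_name": "outputs",
        "host": "localhost",
        "port": 27017,
    },
    "additional_stores": {
        "data": {
            "type": "GridFSStore",
            "database": "jobflow_unittest",
            "collection_name": "outputs_blobs",
            "host": "localhost",
            "port": 27017,
        }
    },
}
store = JobStore.from_dict_spec(spec)
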
Example #3
def mongo_jobstore(database):
    from maggma.stores import MongoStore

    from jobflow import JobStore

    store = JobStore(MongoStore(database, "outputs"))
    store.connect()
    return store
Example #4
def memory_data_jobstore():
    from maggma.stores import MemoryStore

    from jobflow import JobStore

    store = JobStore(MemoryStore(), additional_stores={"data": MemoryStore()})
    store.connect()

    return store
Example #5
def memory_jobstore():
    from maggma.stores import MemoryStore

    from jobflow import JobStore

    store = JobStore(MemoryStore())
    store.connect()

    return store
Example #6
def resolve_references(
    references: Sequence[OutputReference],
    store: jobflow.JobStore,
    cache: dict[str, Any] | None = None,
    on_missing: OnMissing = OnMissing.ERROR,
) -> dict[OutputReference, Any]:
    """
    Resolve multiple output references.

    Uses caching to minimize number of database queries.

    Parameters
    ----------
    references
        A list or tuple of output references.
    store
        A job store.
    cache
        A dictionary cache to use for local caching of reference values.
    on_missing
        What to do if the output reference is missing in the database and cache.
        See :obj:`OnMissing` for the available options.

    Returns
    -------
    dict[OutputReference, Any]
        The output values as a dictionary mapping of ``{reference: output}``.
    """
    from itertools import groupby

    resolved_references = {}
    if cache is None:
        cache = {}

    for uuid, ref_group in groupby(references, key=lambda x: x.uuid):
        # get latest index
        result = store.query_one({"uuid": uuid}, ["index"], sort={"index": -1})
        index = None if result is None else result["index"]

        if uuid not in cache:
            cache[uuid] = {}

        if index is not None and index not in cache[uuid]:
            cache[uuid][index] = store.get_output(uuid,
                                                  load=True,
                                                  on_missing=on_missing)

        for ref in ref_group:
            resolved_references[ref] = ref.resolve(store,
                                                   cache=cache,
                                                   on_missing=on_missing)

    return resolved_references
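
A short sketch of how resolve_references might be called once some jobs have
been run into a store (the add job here is made up for illustration;
resolve_references itself lives in jobflow.core.reference).

from maggma.stores import MemoryStore

from jobflow import JobStore, job, run_locally
from jobflow.core.reference import resolve_references


@job
def add(a, b):
    return a + b


add_first = add(1, 2)
add_second = add(3, 4)

store = JobStore(MemoryStore())
run_locally([add_first, add_second], store=store)

# job.output attributes are OutputReference objects; resolving them together
# shares one cache, so each uuid is only queried once
outputs = resolve_references([add_first.output, add_second.output], store)
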
Example #7
def test_basic(memory_store):
    from jobflow import JobStore

    store = JobStore(memory_store)
    store.connect()
    assert store
    assert store.name == "JobStore-mem://memory_db"
    assert store._collection is not None

    store.close()

    store = JobStore(memory_store, load=None)
    store.connect()
    assert store
Example #8
def test_additional(memory_store):
    from copy import deepcopy

    import boto3
    from maggma.stores import MemoryStore, S3Store
    from moto import mock_s3

    from jobflow import JobStore

    with mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        conn.create_bucket(Bucket="bucket1")
        index = MemoryStore("index", key="blob_uuid")
        s3_store = S3Store(index, "bucket1", key="blob_uuid")
        store = JobStore(
            memory_store,
            additional_stores={
                "data": deepcopy(memory_store),
                "data_s3": s3_store
            },
        )

        with store as s:
            assert s
            assert s.name == "JobStore-mem://memory_db"
            assert s._collection is not None
            assert s.additional_stores["data_s3"].searchable_fields == [
                "job_uuid",
                "job_index",
            ]
Example #9
    def run(self, store: jobflow.JobStore) -> Response:
        """
        Run the job.

        If the job has inputs that are :obj:`.OutputReference` objects, then they will
        need to be resolved before the job can run. See the docstring for
        :obj:`.OutputReference.resolve()` for more details.

        Parameters
        ----------
        store
            A :obj:`.JobStore` to use for resolving references and storing job outputs.

        Returns
        -------
        Response
            The response of the job, containing the outputs, and other settings that
            determine the flow execution.

        Raises
        ------
        ImportError
            If the job function cannot be imported.

        See Also
        --------
        Response, .OutputReference
        """
        import builtins
        import types
        from datetime import datetime

        from jobflow import CURRENT_JOB
        from jobflow.core.flow import get_flow

        index_str = f", {self.index}" if self.index != 1 else ""
        logger.info(f"Starting job - {self.name} ({self.uuid}{index_str})")
        CURRENT_JOB.job = self

        if self.config.expose_store:
            CURRENT_JOB.store = store

        if self.config.resolve_references:
            self.resolve_args(store=store)

        # if Job was created using the job decorator, then access the original function
        function = getattr(self.function, "original", self.function)

        # if the function is a bound method, we need to do some magic to bind the
        # unwrapped function to the class/instance
        bound = getattr(self.function, "__self__", None)
        if bound is not None and bound is not builtins:
            function = types.MethodType(function, bound)

        response = function(*self.function_args, **self.function_kwargs)
        response = Response.from_job_returns(response, self.output_schema)

        if response.replace is not None:
            response.replace = prepare_replace(response.replace, self)
            response.replace.add_hosts_uuids(self.hosts)

        if response.addition is not None:
            # wrap the addition in a Flow to avoid problems if it needs to get
            # wrapped at a later stage
            response.addition = get_flow(response.addition)
            response.addition.add_hosts_uuids(self.hosts)

        if response.detour is not None:
            # wrap the detour in a Flow to avoid problems if it needs to get
            # wrapped at a later stage
            response.detour = get_flow(response.detour)
            response.detour.add_hosts_uuids(self.hosts)

        if self.config.response_manager_config:
            passed_config = self.config.response_manager_config
        elif self.config.pass_manager_config:
            passed_config = self.config.manager_config
        else:
            passed_config = None

        if passed_config:
            if response.addition is not None:
                pass_manager_config(response.addition, passed_config)

            if response.detour is not None:
                pass_manager_config(response.detour, passed_config)

            if response.replace is not None:
                pass_manager_config(response.replace, passed_config)

        try:
            output = jsanitize(response.output, strict=True, enum_values=True)
        except AttributeError:
            raise RuntimeError(
                "Job output contained an object that is not MSONable and therefore "
                "could not be serialized."
            )

        save = {k: "output" if v is True else v for k, v in self._kwargs.items()}
        data = {
            "uuid": self.uuid,
            "index": self.index,
            "output": output,
            "completed_at": datetime.now().isoformat(),
            "metadata": self.metadata,
            "hosts": self.hosts,
        }
        store.update(data, key=["uuid", "index"], save=save)

        CURRENT_JOB.reset()
        logger.info(f"Finished job - {self.name} ({self.uuid}{index_str})")
        return response
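
A minimal sketch of calling this method directly on a decorated job (normally
run_locally or a remote manager does this); the add job and in-memory JobStore
below are just for illustration.

from maggma.stores import MemoryStore

from jobflow import JobStore, job


@job
def add(a, b):
    return a + b


store = JobStore(MemoryStore())
store.connect()

add_job = add(1, 2)
response = add_job.run(store=store)  # resolves references and stores the output
print(response.output)  # 3
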
Example #10
class JobflowSettings(BaseSettings):
    """
    Settings for jobflow.

    The default way to modify these is to modify ~/.jobflow.yaml. Alternatively,
    the environment variable ``JOBFLOW_CONFIG_FILE`` can be set to point to a yaml file
    with jobflow settings.

    Lastly, the variables can be modified directly through environment variables by
    using the "JOBFLOW" prefix. E.g., ``JOBFLOW_JOB_STORE=path/to/jobstore.file``.

    **Allowed JOB_STORE formats**

    If a store is not supplied, a ``MemoryStore`` will be used. The store can be
    specified in multiple formats.

    The simplest format is the yaml dumped version of the store, generated using:

    >>> import yaml
    >>> yaml.dump(store.as_dict())

    Alternatively, the store can be specified as the keys docs_store, additional_stores
    and any other keyword arguments supported by the :obj:`JobStore` constructor. The
    docs_store and additional stores are specified by the ``type`` key which must match
    a Maggma ``Store`` subclass, and the remaining keys are passed to the store
    constructor. For example, the following file would create a :obj:`JobStore` with a
    ``MongoStore`` for docs and a ``GridFSStore`` or ``S3Store`` as an additional store
    for data.

    GridFSStore example:

    .. code-block:: yaml

        docs_store:
          type: MongoStore
          database: jobflow_unittest
          collection_name: outputs
          host: localhost
          port: 27017
        additional_stores:
          data:
            type: GridFSStore
            database: jobflow_unittest
            collection_name: outputs_blobs
            host: localhost
            port: 27017

    S3Store example (Note: the ``key`` field must be set to ``blob_uuid``):

    .. code-block:: yaml

        docs_store:
          type: MongoStore
          database: jobflow_unittest
          collection_name: outputs
          host: localhost
          port: 27017
        additional_stores:
          data:
            type: S3Store
            bucket: output_blobs
            key: blob_uuid
            index:
              type: MongoStore
              database: jobflow_unittest
              collection_name: output_blobs_index
              host: localhost
              port: 27017
              key: blob_uuid


    Lastly, the store can be specified as a file name that points to a file containing
    the credentials in any format supported by :obj:`.JobStore.from_file`.
    """

    CONFIG_FILE: str = Field(
        DEFAULT_CONFIG_FILE_PATH, description="File to load alternative defaults from."
    )

    # general settings
    JOB_STORE: JobStore = Field(
        default_factory=lambda: JobStore(MemoryStore()),
        description="Default JobStore to use when running locally or using FireWorks. "
        "See the :obj:`JobflowSettings` docstring for more details on the "
        "accepted formats.",
    )
    DIRECTORY_FORMAT: str = Field(
        "%Y-%m-%d-%H-%M-%S-%f",
        description="Date stamp format used to create directories",
    )

    class Config:
        """Pydantic config settings."""

        env_prefix = "jobflow_"

    @root_validator(pre=True)
    def load_default_settings(cls, values):
        """
        Load settings from file or environment variables.

        Loads settings from a root file if available and uses those as defaults in
        place of the built-in defaults.

        This allows setting of the config file path through environment variables.
        """
        from monty.serialization import loadfn

        config_file_path: str = values.get("CONFIG_FILE", DEFAULT_CONFIG_FILE_PATH)

        new_values = {}
        if Path(config_file_path).exists():
            new_values.update(loadfn(config_file_path))

        store = new_values.get("JOB_STORE")
        if isinstance(store, str):
            new_values["JOB_STORE"] = JobStore.from_file(store)
        elif isinstance(store, dict) and store.get("@class") == "JobStore":
            new_values["JOB_STORE"] = JobStore.from_dict(store)
        elif isinstance(store, dict):
            new_values["JOB_STORE"] = JobStore.from_dict_spec(store)

        new_values.update(values)
        return new_values
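
A small sketch of the environment variable override mentioned in the docstring;
the "jobflow_" prefix comes from Config.env_prefix above and the format string
is only an example.

import os

# any field can be overridden with a JOBFLOW_-prefixed environment variable
os.environ["JOBFLOW_DIRECTORY_FORMAT"] = "%Y-%m-%d"

from jobflow.settings import JobflowSettings

settings = JobflowSettings()
print(settings.DIRECTORY_FORMAT)  # %Y-%m-%d
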
Example #11
from maggma.stores import MemoryStore

from jobflow import JobStore, job, run_locally


@job(data=True)
def generate_big_data():
    """
    Generate some data.

    The data=True in the job decorator tells jobflow to store all outputs in the "data"
    additional store.
    """
    mydata = list(range(1000))
    return mydata


big_data_job = generate_big_data()

# in this example, we use different memory stores for the documents and "data"
# additional store. In practice, any Maggma Store subclass can be used for either store.
docs_store = MemoryStore()
data_store = MemoryStore()
store = JobStore(docs_store, additional_stores={"data": data_store})

# Because our job requires an additional store named "data" we have to use our
# custom store when running the job.
output = run_locally(big_data_job, store=store)

print(output)
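
As a follow-up sketch, the large output can be read back through the JobStore;
load=True asks the store to also pull data out of the additional stores (the
same call pattern used in Example #6).

# get_output looks up the document for the job uuid and, with load=True,
# also loads the blob that was placed in the additional "data" store
big_data = store.get_output(big_data_job.uuid, load=True)
assert big_data == list(range(1000))
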
Example #12
def test_settings_object(clean_dir, test_data):
    import os
    from pathlib import Path

    from monty.serialization import dumpfn

    from jobflow import JobStore
    from jobflow.settings import JobflowSettings

    monty_spec = {
        "@module": "jobflow.core.store",
        "@class": "JobStore",
        "@version": "",
        "docs_store": {
            "@module": "maggma.stores.mongolike",
            "@class": "MemoryStore",
            "@version": "0.31.0",
            "collection_name": "memory_db_123",
        },
        "additional_stores": {},
        "save": {},
        "load": False,
    }

    dict_spec = {
        "docs_store": {
            "type": "MongoStore",
            "database": "jobflow_unittest",
            "collection_name": "outputs_567",
            "host": "localhost",
            "port": 27017,
        }
    }

    s3_store_spec = {
        "docs_store": {
            "type": "MemoryStore",
            "collection_name": "docs_store_123",
        },
        "additional_stores": {
            "data": {
                "type": "S3Store",
                "bucket": "bucket_123",
                "index": {
                    "type": "MemoryStore",
                },
            }
        },
    }

    # set the path to load settings from
    os.environ["JOBFLOW_CONFIG_FILE"] = str(Path.cwd() / "config.yaml")

    # assert loading monty spec from files works
    dumpfn({"JOB_STORE": monty_spec}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.collection_name == "memory_db_123"

    # assert loading alternative dict spec from files works
    dumpfn({"JOB_STORE": dict_spec}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.collection_name == "outputs_567"

    dumpfn({"JOB_STORE": s3_store_spec}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.additional_stores["data"].bucket == "bucket_123"

    # assert loading from db file works.
    dumpfn({"JOB_STORE": str(test_data / "db.yaml")}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.collection_name == "outputs"

    # assert loading from serialized file works.
    dumpfn({"JOB_STORE": str(test_data / "db_serialized.json")}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.database == "jobflow_unittest"

    # assert passing a jobflow object works
    settings = JobflowSettings(JOB_STORE=JobStore.from_dict(monty_spec))
    assert settings.JOB_STORE.docs_store.collection_name == "memory_db_123"