def test_from_db_file(test_data):
    import pytest

    from jobflow import JobStore

    ms = JobStore.from_file(test_data / "db.yaml")
    ms.connect()
    assert ms.docs_store.name == "mongo://localhost/jobflow_unittest/outputs"
    assert ms.additional_stores == {}

    # test gridfs
    ms = JobStore.from_file(test_data / "db_gridfs.yaml")
    ms.connect()
    data_store = ms.additional_stores["data"]
    assert ms.docs_store.name == "mongo://localhost/jobflow_unittest/outputs"
    assert data_store.name == "gridfs://localhost/jobflow_unittest/outputs_blobs"

    # test serialized
    ms = JobStore.from_file(test_data / "db_serialized.json")
    ms.connect()
    data_store = ms.additional_stores["data"]
    assert ms.docs_store.name == "mongo://localhost/jobflow_unittest/outputs"
    assert data_store.name == "gridfs://localhost/jobflow_unittest/outputs_blobs"

    # test bad file
    with pytest.raises(ValueError):
        JobStore.from_file(test_data / "db_bad.yaml")
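# A hypothetical sketch of what the "db.yaml" credentials file loaded above
# might contain. The exact keys are assumptions inferred from the docs_store
# name asserted in the test; see JobStore.from_file for the formats it
# actually accepts.
EXAMPLE_DB_YAML = """\
host: localhost
port: 27017
database: jobflow_unittest
collection: outputs
"""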
def mongo_jobstore(database):
    from maggma.stores import MongoStore

    from jobflow import JobStore

    store = JobStore(MongoStore(database, "outputs"))
    store.connect()
    return store
def memory_data_jobstore():
    from maggma.stores import MemoryStore

    from jobflow import JobStore

    store = JobStore(MemoryStore(), additional_stores={"data": MemoryStore()})
    store.connect()
    return store
def memory_jobstore():
    from maggma.stores import MemoryStore

    from jobflow import JobStore

    store = JobStore(MemoryStore())
    store.connect()
    return store
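# The helpers above read like pytest fixtures: they take other fixtures (e.g.
# ``database``) and return connected stores. The ``memory_store`` fixture used
# by the tests below is not shown; here is a minimal sketch, assuming the
# collection name "memory_db" implied by the name assertions in those tests.
import pytest
from maggma.stores import MemoryStore


@pytest.fixture
def memory_store():
    store = MemoryStore("memory_db")
    store.connect()
    return store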
def resolve_references(
    references: Sequence[OutputReference],
    store: jobflow.JobStore,
    cache: dict[str, Any] | None = None,
    on_missing: OnMissing = OnMissing.ERROR,
) -> dict[OutputReference, Any]:
    """
    Resolve multiple output references.

    Uses caching to minimize the number of database queries.

    Parameters
    ----------
    references
        A list or tuple of output references.
    store
        A job store.
    cache
        A dictionary cache to use for local caching of reference values.
    on_missing
        What to do if the output reference is missing in the database and
        cache. See :obj:`OnMissing` for the available options.

    Returns
    -------
    dict[OutputReference, Any]
        The output values as a dictionary mapping of ``{reference: output}``.
    """
    from itertools import groupby

    resolved_references = {}
    if cache is None:
        cache = {}

    for uuid, ref_group in groupby(references, key=lambda x: x.uuid):
        # get the latest index for this uuid
        result = store.query_one({"uuid": uuid}, ["index"], sort={"index": -1})
        index = None if result is None else result["index"]

        if uuid not in cache:
            cache[uuid] = {}

        if index is not None and index not in cache[uuid]:
            cache[uuid][index] = store.get_output(
                uuid, load=True, on_missing=on_missing
            )

        for ref in ref_group:
            resolved_references[ref] = ref.resolve(
                store, cache=cache, on_missing=on_missing
            )

    return resolved_references
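# A minimal usage sketch for resolve_references. ``output_refs`` and ``store``
# are assumed to exist: a sequence of OutputReference objects and a connected
# JobStore. OnMissing.NONE substitutes None for unresolvable references
# instead of raising an error.
from jobflow.core.reference import OnMissing

resolved = resolve_references(output_refs, store, on_missing=OnMissing.NONE)
for ref, value in resolved.items():
    print(ref.uuid, value)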
def test_basic(memory_store):
    from jobflow import JobStore

    store = JobStore(memory_store)
    store.connect()
    assert store
    assert store.name == "JobStore-mem://memory_db"
    assert store._collection is not None
    store.close()

    store = JobStore(memory_store, load=None)
    store.connect()
    assert store
def test_additional(memory_store):
    from copy import deepcopy

    import boto3
    from maggma.stores import MemoryStore, S3Store
    from moto import mock_s3

    from jobflow import JobStore

    with mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        conn.create_bucket(Bucket="bucket1")
        index = MemoryStore("index", key="blob_uuid")
        s3_store = S3Store(index, "bucket1", key="blob_uuid")
        store = JobStore(
            memory_store,
            additional_stores={"data": deepcopy(memory_store), "data_s3": s3_store},
        )
        with store as s:
            assert s
            assert s.name == "JobStore-mem://memory_db"
            assert s._collection is not None
            assert s.additional_stores["data_s3"].searchable_fields == [
                "job_uuid",
                "job_index",
            ]
def run(self, store: jobflow.JobStore) -> Response:
    """
    Run the job.

    If the job has inputs that are :obj:`.OutputReference` objects, then they
    will need to be resolved before the job can run. See the docstring for
    :obj:`.OutputReference.resolve()` for more details.

    Parameters
    ----------
    store
        A :obj:`.JobStore` to use for resolving references and storing job
        outputs.

    Returns
    -------
    Response
        The response of the job, containing the outputs, and other settings
        that determine the flow execution.

    Raises
    ------
    ImportError
        If the job function cannot be imported.

    See Also
    --------
    Response, .OutputReference
    """
    import builtins
    import types
    from datetime import datetime

    from jobflow import CURRENT_JOB
    from jobflow.core.flow import get_flow

    index_str = f", {self.index}" if self.index != 1 else ""
    logger.info(f"Starting job - {self.name} ({self.uuid}{index_str})")
    CURRENT_JOB.job = self

    if self.config.expose_store:
        CURRENT_JOB.store = store

    if self.config.resolve_references:
        self.resolve_args(store=store)

    # if the Job was created using the job decorator, access the original function
    function = getattr(self.function, "original", self.function)

    # if the function is a bound method, we need to bind the unwrapped function
    # to the class/instance
    bound = getattr(self.function, "__self__", None)
    if bound is not None and bound is not builtins:
        function = types.MethodType(function, bound)

    response = function(*self.function_args, **self.function_kwargs)
    response = Response.from_job_returns(response, self.output_schema)

    if response.replace is not None:
        response.replace = prepare_replace(response.replace, self)
        response.replace.add_hosts_uuids(self.hosts)

    if response.addition is not None:
        # wrap the addition in a Flow to avoid problems if it needs to get
        # wrapped at a later stage
        response.addition = get_flow(response.addition)
        response.addition.add_hosts_uuids(self.hosts)

    if response.detour is not None:
        # wrap the detour in a Flow to avoid problems if it needs to get
        # wrapped at a later stage
        response.detour = get_flow(response.detour)
        response.detour.add_hosts_uuids(self.hosts)

    if self.config.response_manager_config:
        passed_config = self.config.response_manager_config
    elif self.config.pass_manager_config:
        passed_config = self.config.manager_config
    else:
        passed_config = None

    if passed_config:
        if response.addition is not None:
            pass_manager_config(response.addition, passed_config)
        if response.detour is not None:
            pass_manager_config(response.detour, passed_config)
        if response.replace is not None:
            pass_manager_config(response.replace, passed_config)

    try:
        output = jsanitize(response.output, strict=True, enum_values=True)
    except AttributeError:
        raise RuntimeError(
            "Job output contained an object that is not MSONable and therefore "
            "could not be serialized."
        )

    save = {k: "output" if v is True else v for k, v in self._kwargs.items()}
    data = {
        "uuid": self.uuid,
        "index": self.index,
        "output": output,
        "completed_at": datetime.now().isoformat(),
        "metadata": self.metadata,
        "hosts": self.hosts,
    }
    store.update(data, key=["uuid", "index"], save=save)

    CURRENT_JOB.reset()
    logger.info(f"Finished job - {self.name} ({self.uuid}{index_str})")
    return response
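# A minimal sketch of calling Job.run directly against an in-memory JobStore.
# In practice run_locally drives this call; the ``add`` job here is purely
# illustrative.
from maggma.stores import MemoryStore

from jobflow import JobStore, job


@job
def add(a, b):
    return a + b


add_job = add(1, 2)
store = JobStore(MemoryStore())
store.connect()
response = add_job.run(store=store)  # resolves references, runs, stores the output
assert response.output == 3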
class JobflowSettings(BaseSettings):
    """
    Settings for jobflow.

    The default way to modify these is to modify ~/.jobflow.yaml. Alternatively,
    the environment variable ``JOBFLOW_CONFIG_FILE`` can be set to point to a
    yaml file with jobflow settings.

    Lastly, the variables can be modified directly through environment
    variables by using the ``JOBFLOW`` prefix. E.g.,
    ``JOBFLOW_JOB_STORE=path/to/jobstore.file``.

    **Allowed JOB_STORE formats**

    If the store is not supplied, a ``MemoryStore`` will be used. The store can
    be specified in multiple formats. The simplest format is the yaml dumped
    version of the store, generated using:

    >>> import yaml
    >>> yaml.dump(store.as_dict())

    Alternatively, the store can be specified as the keys docs_store,
    additional_stores and any other keyword arguments supported by the
    :obj:`JobStore` constructor. The docs store and additional stores are
    specified by the ``type`` key, which must match a Maggma ``Store``
    subclass; the remaining keys are passed to the store constructor. For
    example, the following file would create a :obj:`JobStore` with a
    ``MongoStore`` for docs and a ``GridFSStore`` or ``S3Store`` as an
    additional store for data.

    GridFSStore example:

    .. code-block:: yaml

        docs_store:
          type: MongoStore
          database: jobflow_unittest
          collection_name: outputs
          host: localhost
          port: 27017
        additional_stores:
          data:
            type: GridFSStore
            database: jobflow_unittest
            collection_name: outputs_blobs
            host: localhost
            port: 27017

    S3Store example (Note: the ``key`` field must be set to ``blob_uuid``):

    .. code-block:: yaml

        docs_store:
          type: MongoStore
          database: jobflow_unittest
          collection_name: outputs
          host: localhost
          port: 27017
        additional_stores:
          data:
            type: S3Store
            bucket: output_blobs
            key: blob_uuid
            index:
              type: MongoStore
              database: jobflow_unittest
              collection_name: output_blobs_index
              host: localhost
              port: 27017
              key: blob_uuid

    Lastly, the store can be specified as a file name that points to a file
    containing the credentials in any format supported by
    :obj:`.JobStore.from_file`.
    """

    CONFIG_FILE: str = Field(
        DEFAULT_CONFIG_FILE_PATH, description="File to load alternative defaults from."
    )

    # general settings
    JOB_STORE: JobStore = Field(
        default_factory=lambda: JobStore(MemoryStore()),
        description="Default JobStore to use when running locally or using "
        "FireWorks. See the :obj:`JobflowSettings` docstring for more details "
        "on the accepted formats.",
    )
    DIRECTORY_FORMAT: str = Field(
        "%Y-%m-%d-%H-%M-%S-%f",
        description="Date stamp format used to create directories",
    )

    class Config:
        """Pydantic config settings."""

        env_prefix = "jobflow_"

    @root_validator(pre=True)
    def load_default_settings(cls, values):
        """
        Load settings from file or environment variables.

        Loads settings from a root file if available and uses those values as
        defaults in place of the built-in defaults. This allows setting of the
        config file path through environment variables.
        """
        from monty.serialization import loadfn

        config_file_path: str = values.get("CONFIG_FILE", DEFAULT_CONFIG_FILE_PATH)

        new_values = {}
        if Path(config_file_path).exists():
            new_values.update(loadfn(config_file_path))

        store = new_values.get("JOB_STORE")
        if isinstance(store, str):
            new_values["JOB_STORE"] = JobStore.from_file(store)
        elif isinstance(store, dict) and store.get("@class") == "JobStore":
            new_values["JOB_STORE"] = JobStore.from_dict(store)
        elif isinstance(store, dict):
            new_values["JOB_STORE"] = JobStore.from_dict_spec(store)

        new_values.update(values)
        return new_values
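# A short sketch of the configuration flow described in the docstring above:
# point JOBFLOW_CONFIG_FILE at a settings file, then instantiate the settings
# so the root validator loads that file as defaults. The path is illustrative.
import os

from jobflow.settings import JobflowSettings

os.environ["JOBFLOW_CONFIG_FILE"] = "/path/to/my_config.yaml"

settings = JobflowSettings()  # load_default_settings reads the file as defaults
print(settings.JOB_STORE)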
from maggma.stores import MemoryStore

from jobflow import JobStore, job, run_locally


@job(data=True)
def generate_big_data():
    """
    Generate some data.

    The data=True argument in the job decorator tells jobflow to store all
    outputs in the "data" additional store.
    """
    mydata = list(range(1000))
    return mydata


big_data_job = generate_big_data()

# In this example, we use different memory stores for the documents and the
# "data" additional store. In practice, any Maggma Store subclass can be used
# for either store.
docs_store = MemoryStore()
data_store = MemoryStore()
store = JobStore(docs_store, additional_stores={"data": data_store})

# Because our job requires an additional store named "data", we have to use
# our custom store when running the job.
output = run_locally(big_data_job, store=store)
print(output)
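# A follow-up sketch: read the large output back through the JobStore. With
# load=True the blob saved in the "data" additional store is resolved inline;
# with the default load=False only a reference to the blob is returned.
result = store.get_output(big_data_job.uuid, load=True)
assert result == list(range(1000))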
def test_settings_object(clean_dir, test_data):
    import os
    from pathlib import Path

    from monty.serialization import dumpfn

    from jobflow import JobStore
    from jobflow.settings import JobflowSettings

    monty_spec = {
        "@module": "jobflow.core.store",
        "@class": "JobStore",
        "@version": "",
        "docs_store": {
            "@module": "maggma.stores.mongolike",
            "@class": "MemoryStore",
            "@version": "0.31.0",
            "collection_name": "memory_db_123",
        },
        "additional_stores": {},
        "save": {},
        "load": False,
    }

    dict_spec = {
        "docs_store": {
            "type": "MongoStore",
            "database": "jobflow_unittest",
            "collection_name": "outputs_567",
            "host": "localhost",
            "port": 27017,
        }
    }

    s3_store_spec = {
        "docs_store": {
            "type": "MemoryStore",
            "collection_name": "docs_store_123",
        },
        "additional_stores": {
            "data": {
                "type": "S3Store",
                "bucket": "bucket_123",
                "index": {
                    "type": "MemoryStore",
                },
            }
        },
    }

    # set the path to load settings from
    os.environ["JOBFLOW_CONFIG_FILE"] = str(Path.cwd() / "config.yaml")

    # assert loading monty spec from files works
    dumpfn({"JOB_STORE": monty_spec}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.collection_name == "memory_db_123"

    # assert loading alternative dict spec from files works
    dumpfn({"JOB_STORE": dict_spec}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.collection_name == "outputs_567"

    dumpfn({"JOB_STORE": s3_store_spec}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.additional_stores["data"].bucket == "bucket_123"

    # assert loading from db file works
    dumpfn({"JOB_STORE": str(test_data / "db.yaml")}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.collection_name == "outputs"

    # assert loading from serialized file works
    dumpfn({"JOB_STORE": str(test_data / "db_serialized.json")}, "config.yaml")
    settings = JobflowSettings()
    assert settings.JOB_STORE.docs_store.database == "jobflow_unittest"

    # assert passing a jobflow object works
    settings = JobflowSettings(JOB_STORE=JobStore.from_dict(monty_spec))
    assert settings.JOB_STORE.docs_store.collection_name == "memory_db_123"