def _get_dtypes(cls):
    """ return a dict of attributes and datatypes for properties. """
    # handle special cases
    if cls in COMPLEX_TYPES:
        return COMPLEX_TYPES[cls]
    out = {"_parent_id_": str, "_event_id_": str}
    # get containers, properties, and types
    property_dict = getattr(cls, "_property_dict", {})
    # iterate properties, handle special case else use type
    for item, val in property_dict.items():
        # if a class that needs to be flattened
        val = type(val) if not isinstance(val, type) else val
        if hasattr(val, "_property_dict") or val in COMPLEX_TYPES:
            sub_dict = _get_dtypes(val)
            for item_, val_ in sub_dict.items():
                if item_ in {"_parent_id_", "_event_id_"}:
                    continue
                out[f"__{item}__{item_}"] = val_
        else:  # handle simple, transform types
            out[item] = SIMPLE_TYPES.get(val, val)
    # add containers (always strings referring to other tables) and return
    containers = getattr(cls, "_containers", [])
    for container in containers:
        out[f"_{container}"] = str
    return MapProxy(out)  # MapProxy to simulate immutability
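# --- Aside: a minimal, self-contained sketch (not part of the module above)
# showing why a MappingProxyType is returned here: the proxy is a read-only
# view, so callers cannot accidentally mutate the dtype mapping.
from types import MappingProxyType as _ExampleMapProxy

_example_dtypes = {"_parent_id_": str, "_event_id_": str, "time": float}
_example_view = _ExampleMapProxy(_example_dtypes)

assert _example_view["time"] is float
try:
    _example_view["time"] = int  # read-only view; this raises
except TypeError as exc:
    print(f"immutable as intended: {exc}")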
def __call__(self, *args, _pype_fixtures=None, **kwargs):
    fixtures = MapProxy(
        {**(_pype_fixtures or {}), **self._wrap_fixtures, **self._partials}
    )
    out = self.task.run(*args, **kwargs, _fixtures=fixtures, _callbacks=self._callbacks)
    if out is None:
        raise TaskReturnedNone
    return args_kwargs(out, adapter=self.adapter)
def validate(self):
    """
    Run checks on the pype to detect potential problems.

    Will raise an InvalidPype exception if compatibility issues are found,
    or a TypeError if any invalid callbacks are found.
    """
    # validate task compatibility
    self.flow.validate(extra_params=MapProxy(self._partials))
    # validate callbacks
    for wrap_ in self.flow.wraps:
        wrap_._validate_callbacks()
def __call__(self, *args, _pype_fixtures=None, **kwargs):
    fixtures = MapProxy(
        {
            **(_pype_fixtures or EMPTY_PYPE_FIXTURES),
            **self._wrap_fixtures,
            **self._partials,
        }
    )
    out = self.task.run(
        *args,
        **kwargs,
        _fixtures=fixtures,
        _callbacks=self._callbacks,
        _predicate=self._predicates,
    )
    if out is None:
        raise TaskReturnedNone
    return args_kwargs(out, adapter=self.adapter)
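# --- Aside: a hypothetical illustration (toy values, not spype's API) of the
# merge order used above: later unpacks win, so wrap-level fixtures override
# pype-level ones, and partials override both.
from types import MappingProxyType as _ExampleMapProxy

_pype_fixtures = {"pype": "<pype>", "print_flow": False}
_wrap_fixtures = {"wrap": "<wrap>", "print_flow": True}
_partials = {"threshold": 0.5}

_merged = _ExampleMapProxy({**_pype_fixtures, **_wrap_fixtures, **_partials})
assert _merged["print_flow"] is True  # the wrap-level value wins
assert _merged["threshold"] == 0.5    # partials are injected last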
def _run_queue(self, _meta, que):
    """ run the queue until complete """
    # run que until complete or all tasks are waiting on agg results
    assert self.flow.get_input_wrap().task is task.pype_input
    fixtures = MapProxy({"meta": _meta, "pype": self, **self._partials})
    while len(que):
        wrap_, (args, kwargs) = que.pop()
        wrap_: wrap.Wrap
        try:
            output = wrap_(*args, **kwargs, _pype_fixtures=fixtures)
        except UnresolvedDependency:  # task needs to be put back
            _meta["defer_count"][wrap_] += 1  # up task deferment counter
            que.appendleft((wrap_, (args, kwargs)))
            continue
        except TaskReturnedNone:  # task returned None
            continue
        else:  # everything went fine
            _meta["outputs"][wrap_.task] = output
            for neighbor in self.flow.neighbors(wrap_):  # queue neighbors
                neighbor._queue_up(output, _meta, que, sending_wrap=wrap_)
    # run tasks that waited for object scoped aggregations
    self._run_aggregations(_meta, que)
    _meta["output"].append(de_args_kwargs(*output))
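# --- Aside: a stripped-down sketch of the deferral pattern used above. All
# names here (run_example_queue, the toy tasks, the local exception class) are
# stand-ins, not spype objects: tasks that cannot run yet are pushed back onto
# the left of the deque while the loop keeps draining from the right.
from collections import deque


class _ExampleUnresolvedDependency(Exception):
    """ stand-in for a task whose inputs are not ready yet """


def run_example_queue(tasks, max_defers=10):
    que = deque(tasks)
    defer_count, outputs = {}, []
    while len(que):
        name, func = que.pop()
        try:
            outputs.append(func())
        except _ExampleUnresolvedDependency:
            defer_count[name] = defer_count.get(name, 0) + 1
            if defer_count[name] < max_defers:  # give up eventually
                que.appendleft((name, func))
    return outputs


_ready = {"done": False}

def _first():
    _ready["done"] = True
    return 1

def _second():
    if not _ready["done"]:
        raise _ExampleUnresolvedDependency
    return 2

# _second is popped first, defers once, then succeeds after _first has run
print(run_example_queue([("first", _first), ("second", _second)]))  # [1, 2]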
class _Bank(ABC): """ The abstract base class for ObsPlus' banks. Used to access local archives in a client-like fashion. """ # hdf5 compression defaults _complib = "blosc" _complevel = 9 # attributes subclasses need to define ext = "" bank_path = "" namespace = "" index_name = ".index.h5" # name of index file executor = None # an executor for using parallelism # optional str defining the directory structure and file name schemes path_structure = None name_structure = None # the minimum obsplus version. If not met delete index and re-index # bump when database schema change. _min_version = "0.0.3" # status bar attributes _bar_update_interval = 50 # number of files before updating bar _min_files_for_bar = 100 # min number of files before using bar enabled _read_func: callable # function for reading datatype # required dypes for input to storage layer _dtypes_input: Mapping = MapProxy({}) # required dtypes for output from bank _dtypes_output: Mapping = MapProxy({}) # the index cache (can greatly reduce IO efforts) _index_cache: Optional[_IndexCache] = None @abstractmethod def read_index(self, **kwargs) -> pd.DataFrame: """ read the index filtering on various params """ @abstractmethod def update_index(self: BankType) -> BankType: """ update the index """ @abstractmethod def last_updated(self) -> Optional[float]: """ get the last modified time stored in the index. If Not available return None """ @abstractmethod def _read_metadata(self) -> pd.DataFrame: """ Return a dictionary of metadata. """ # --- path/node related objects @property def index_path(self): """ The expected path to the index file. """ return join(self.bank_path, self.index_name) @property def _index_node(self): """ The node, or table, the index information is stored in the database. """ return "/".join([self.namespace, "index"]) @property def _index_version(self) -> str: """ Get the version of obsplus used to create the index. """ return self._read_metadata()["obsplus_version"].iloc[0] @property def _time_node(self): """ The node, or table, the update time information is stored in the database. """ return "/".join([self.namespace, "last_updated"]) @property def _meta_node(self): """ The node, or table, the update metadata is stored in the database. """ return "/".join([self.namespace, "metadata"]) def _enforce_min_version(self): """ Check version of obsplus used to create index and delete index if the minimum version requirement is not met. """ try: version = self._index_version except (FileNotFoundError, DatabaseError): return else: if self._min_version > version: msg = ( f"the indexing schema has changed since {self._min_version} " f"the index will be recreated") warnings.warn(msg) os.remove(self.index_path) def _unindexed_iterator(self): """ return an iterator of potential unindexed files """ # get mtime, subtract a bit to avoid odd bugs mtime = None last_updated = self.last_updated # this needs db so only call once if last_updated is not None: mtime = last_updated - 0.001 # return file iterator return iter_files(self.bank_path, ext=self.ext, mtime=mtime) def _measured_unindexed_iterator(self, bar: Optional[ProgressBar] = None): """ A generator to yield un-indexed files and update progress bar. Parameters ---------- bar Any object with an update method. 
Returns ------- """ # get progress bar bar = self.get_progress_bar(bar) # get the iterator for num, path in enumerate(self._unindexed_iterator()): # update bar if count is in update interval if bar is not None and num % self._bar_update_interval == 0: bar.update(num) yield path # finish progress bar getattr(bar, "finish", lambda: None)() # call finish if bar exists def _make_meta_table(self): """ get a dataframe of meta info """ meta = dict( path_structure=self.path_structure, name_structure=self.name_structure, obsplus_version=obsplus.__version__, ) return pd.DataFrame(meta, index=[0]) def get_service_version(self): """ Return the version of obsplus used to create index. """ return self._index_version def ensure_bank_path_exists(self, create=False): """ Ensure the bank_path exists else raise an BankDoesNotExistError. If create is True, simply create the bank. """ path = Path(self.bank_path) if create: path.mkdir(parents=True, exist_ok=True) if not path.is_dir(): msg = f"{path} is not a directory, cant read bank" raise BankDoesNotExistError(msg) def get_progress_bar(self, bar=None) -> Optional[ProgressBar]: """ Return a progress bar instance based on bar parameter. If bar is False, return None. If bar is None return default Bar If bar is a subclass of ProgressBar, init class and set max_values. If bar is an instance of ProgressBar, return it. """ print(f"updating or creating event index for {self.bank_path}") # conditions to bail out early if bar is False: # False indicates no bar is to be used return None elif isinstance(bar, ProgressBar): # bar is already instantiated return bar # next, count number of files num_files = sum([1 for _ in self._unindexed_iterator()]) if num_files < self._min_files_for_bar: # not enough files to use bar return None # instantiate bar and return kwargs = {"min_value": self._min_files_for_bar, "max_value": num_files} # an instance should be init'ed if isinstance(bar, type) and issubclass(bar, ProgressBar): return bar(**kwargs) elif bar is None: return get_progressbar(**kwargs) else: msg = f"{bar} is not a valid input for get_progress_bar" raise ValueError(msg) def clear_cache(self): """ Clear the index cache if the bank is using one. """ if self._index_cache is not None: self._index_cache.clear_cache() @property def _max_workers(self): """ Return the max number of workers allowed by the executor. If the Executor has no attribute `_max_workers` use the number of CPUs instead. If there is no executor assigned to bank instance return 1. """ executor = getattr(self, "executor", None) if executor is not None: return getattr(executor, "_max_workers", CPU_COUNT) return 1 def _map(self, func, args, chunksize=None): """ Map the args to function, using executor if defined else perform in serial. """ if self.executor is not None: return self.executor.map(func, args, chunksize=chunksize) else: return (func(x) for x in args)
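# --- Aside: a small, self-contained sketch (not ObsPlus code) of the executor
# pattern _map relies on: when an executor is configured, work is fanned out
# through its map(); otherwise a lazy serial generator is returned.
from concurrent.futures import ThreadPoolExecutor


def _example_map(func, args, executor=None, chunksize=1):
    if executor is not None:
        return executor.map(func, args, chunksize=chunksize)
    return (func(x) for x in args)


def _square(x):
    return x * x

print(list(_example_map(_square, range(5))))  # serial generator
with ThreadPoolExecutor(max_workers=2) as ex:
    print(list(_example_map(_square, range(5), executor=ex)))  # parallel map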
""" Constants, and their explanations, used by sflow """ from types import MappingProxyType as MapProxy from typing import Callable, Sequence, Union, Tuple, Dict, Any, Hashable # ---------------------- Fixtures/callbacks # supported generic task fixtures TASK_FIXTURES = MapProxy( dict( signature="The signature of the task's run method", task="A reference to the current task object", self="A reference to the current task object", e="The exception object if one was raised, else None", inputs="A tuple of (args, kwargs) passed as input to current task", args="A tuple of arguments passed to current task", kwargs="A dict of keywork arguments passed to current task", outputs="The outputs of calling a task, or None", )) # supported wrap fixtures WRAP_FIXTURES = MapProxy( dict(wrap="A refence to the wrap object around the task")) # supported pype fixtures PYPE_FIXTURES = MapProxy( dict( pype="A reference to the pype object running the task, or None", meta="The controld dict running the task que (advanced fixture)", print_flow="If True print the inputs/outputs of each task to screen",
class DataSet(abc.ABC): """ Abstract Base Class for downloading and serving datasets. This is not intended to be used directly, but rather through subclassing. Parameters ---------- base_path The path to which the dataset will be saved. Attributes ---------- data_path The path containing the data. By default it is base_path / name. source_path The path which contains the original files included in the dataset before download. By default this is found in the same directory as the dataset's code (.py) file in a folder with the same name as the dataset. Notes ----- Importantly, each dataset references *two* directories, the source_path and data_path. The source_path contains all data included within the dataset and should not be altered. The data_path has a copy of everything in the source_path, plus the files created during the downloading process. The base_path (the parent of data_path) is resolved for each dataset using the following priorities: 1. The `base_path` provided to `Dataset`'s __init__ method. 2. .data_path.txt file stored in the data source 3. An environmental name OPSDATA_PATH 4. The opsdata_path variable from obsplus.constants By default the data will be downloaded to the user's home directory in a folder called "opsdata", but again, this is easily changed by setting the OPSDATA_PATH environmental variable. """ _entry_points = {} _datasets = {} data_loaded = False # variables for hashing datafiles and versioning _version_filename = "dataset_version.txt" _hash_filename = "dataset_hash.json" # the name of the file that saves where the data file were downloaded _saved_dataset_path_filename = ".dataset_data_path.txt" _hash_excludes = ( "readme.txt", _version_filename, _hash_filename, _saved_dataset_path_filename, ) # generic functions for loading data (WaveBank, event, stations) _load_funcs = MapProxy( dict( waveform=get_waveform_client, event=get_event_client, station=get_station_client, )) # flags to determine if data should be loaded into memory _load_waveforms = False _load_stations = True _load_events = True # cache for instantiated datasets _loaded_datasets = {} _verbose = True def __init_subclass__(cls, **kwargs): """Register subclasses of datasets.""" assert isinstance(cls.name, str), "name must be a string" validate_version_str(cls.version) # Register the subclass as a dataset. DataSet._datasets[cls.name.lower()] = cls # --- logic for loading and caching data def __init__(self, base_path=None): """download and load data into memory.""" self.base_path = self._get_opsdata_path(base_path) # create the dataset's base directory self.data_path.mkdir(exist_ok=True, parents=True) # run the download logic if needed self._run_downloads() # cache loaded dataset self.data_loaded = True if not base_path and self.name not in self._loaded_datasets: self._loaded_datasets[self.name] = self.copy(deep=True) def _get_opsdata_path(self, opsdata_path: Optional[Path] = None) -> Path: """ Get the location where datasets are stored. Returns ------- A path to the opsdata directory. 
""" if opsdata_path is None: opsdata_path = getattr(self._saved_data_path, "parent", None) if opsdata_path is None: # next look for env variable opsdata_path_default = obsplus.constants.OPSDATA_PATH opsdata_path = os.getenv("OPSDATA_PATH", opsdata_path_default) # ensure the data path exists _create_opsdata(opsdata_path) return Path(opsdata_path) def _run_downloads(self) -> None: """Iterate each kind of data and download if needed.""" # Make sure the version of the dataset is okay version_ok = self.check_version() downloaded = False for what in DATA_TYPES: needs_str = f"{what}s_need_downloading" if getattr(self, needs_str) or (not version_ok): # this is the first type of data to be downloaded, run hook # and copy data from data source. if not downloaded and self.source_path.exists(): copy_tree(str(self.source_path), str(self.data_path)) self.pre_download_hook() downloaded = True # download data, test termination criteria self._log( f"downloading {what} data for {self.name} dataset ...") getattr(self, "download_" + what + "s")() assert not getattr(self, needs_str), f"Download {what} failed" self._log(f"finished downloading {what} data for {self.name}") self._write_readme() # make sure readme has been written # some data were downloaded, call post download hook if downloaded: self.check_hashes() self.post_download_hook() # write a new version file self.write_version() # write out a new saved datafile path self._save_data_path() def _load(self, what, path): """Load the client-like objects from disk.""" try: client = self._load_funcs[what](path) except TypeError: warn(f"failed to load {what} from {path}, returning None") return None # load data into memory (eg load event bank contents into catalog) if getattr(self, f"_load_{what}s"): return getattr(client, f"get_{what}s")() else: return client def copy(self: DataSetType, deep=True) -> DataSetType: """ Return a copy of the dataset. Parameters ---------- deep If True deep copy the objects attached to the dataset. Notes ----- This only copies data in memory, not on disk. If you plan to make any changes to the dataset's on disk resources please use :method:`~obsplus.Dataset.copy_to`. """ return copy.deepcopy(self) if deep else copy.copy(self) def copy_to(self: DataSetType, destination: Optional[Union[str, Path]] = None) -> DataSetType: """ Copy the dataset to a destination. If the destination already exists simply do nothing. Parameters ---------- destination The destination to copy the dataset. It will be created if it doesnt exist. If None is provided use tmpfile to create a temporary directory. Returns ------- A new dataset object which refers to the copied files. """ return copy_dataset(self, destination) def get_fetcher(self, **kwargs) -> "obsplus.Fetcher": """ Return a Fetcher from the data. kwargs are passed to :class:`~obsplus.structures.Fetcher`'s constructor. See its documentation for acceptable kwargs. 
""" assert self.data_loaded, "data have not been loaded into memory" # get events/waveforms/stations and put into dict for the Fetcher fetch_kwargs = { "waveforms": self.waveform_client, "events": self.event_client, "stations": self.station_client, } fetch_kwargs.update(kwargs) return obsplus.Fetcher(**fetch_kwargs) __call__ = get_fetcher def _write_readme(self, filename="readme.txt"): """Writes the classes docstring to a file.""" path = self.data_path / filename if not path.exists(): with path.open("w") as fi: fi.write(str(self.__doc__)) def _save_data_path(self, path=None): """Save the path to where the data where downloaded in source folder.""" path = Path(path or self._path_to_saved_path_file) path.parent.mkdir(exist_ok=True, parents=True) with path.open("w") as fi: fi.write(str(self.data_path)) @classmethod def load_dataset(cls: DataSetType, name: Union[str, "DataSet"], silent=False) -> DataSetType: """ Get a loaded dataset. Will ensure all files are downloaded and the appropriate data are loaded into memory. Parameters ---------- name The name of the dataset to load or a DataSet object. If a DataSet object is passed a copy of it will be returned. Examples -------- >>> # --- Load an example dataset for testing >>> import obsplus >>> ds = obsplus.load_dataset('default_test') >>> # If you plan to make changes to the dataset be sure to copy it first >>> # The following will copy all files in the dataset to a tmpdir >>> ds2 = obsplus.copy_dataset('default_test') >>> # --- Use dataset clients to load waveforms, stations, and events >>> cat = ds.event_client.get_events() >>> st = ds.waveform_client.get_waveforms() >>> inv = ds.station_client.get_stations() >>> # --- get a fetcher for more "dataset aware" querying >>> fetcher = ds.get_fetcher() """ # Just copy and return if a dataset is passed. if isinstance(name, DataSet): return name.copy() name = name.lower() cls._load_dataset_entry_point(name) if name not in cls._datasets: # The dataset has not been discovered; try to load entry points msg = f"{name} is not in the known datasets {list(cls._datasets)}" raise ValueError(msg) if name in cls._loaded_datasets: # The dataset has already been loaded, simply return a copy return cls._loaded_datasets[name].copy() else: # The dataset has been discovered but not loaded; just loaded return cls._datasets[name]() def delete_data_directory(self): """ Delete the datafiles of a dataset. This will force the data to be re-copied from the source files and download logic to be run. """ dataset = DataSet.load_dataset(self) shutil.rmtree(dataset.data_path) @classmethod def _load_dataset_entry_point(cls, name=None, load=True): """ Load and cache the dataset entry points. Parameters ---------- name A string id of the dataset load If True, load the code associated with the entry point. """ def _load_ep(ep): """Load the entry point, ignore removed datasets.""" # If a plugin was register but no longer exists it can raise. with suppress(ModuleNotFoundError): ep.load() assert name in cls._datasets, "dataset should be registered." if name in cls._entry_points: # entry point has been registered if name in cls._datasets: # and loaded, return return elif load: # it has not been loaded, try loading it. 
_load_ep(cls._entry_points[name]) # it has not been found, iterate entry points and update eps = {x.name: x for x in iter_entry_points("obsplus.datasets")} cls._entry_points.update(eps) # stop if we don't need to load if not load: return # now iterate through all names, or just selected name, and load for name in set(iterate(name or eps)) & set(eps): _load_ep(eps[name]) # --- prescribed Paths for data @property def data_path(self) -> Path: """ Return a path to where the dataset's data was/will be downloaded. """ return self.base_path / self.name @property def source_path(self) -> Path: """ Return a path to the directory where the data files included with the dataset live. """ try: path = Path(inspect.getfile(self.__class__)).parent except (AttributeError, TypeError): path = Path(__file__) return path / self.name @property def _saved_data_path(self): """Load the saved data source path, else return None.""" expected_path = self._path_to_saved_path_file if expected_path.exists(): loaded_path = Path(expected_path.open("r").read()) if loaded_path.exists(): return loaded_path return None @property def _path_to_saved_path_file(self): """ A path to the file which keeps track of where data are downloaded. """ return self.source_path / self._saved_dataset_path_filename @property def _version_path(self): """A path to the saved version file.""" return self.data_path / self._version_filename @property @lru_cache() def data_files(self) -> Tuple[Path, ...]: """ Return a list of top-level files associated with the dataset. Hidden files are ignored. """ file_iterator = self.source_path.glob("*") files = [x for x in file_iterator if not x.is_dir()] return tuple([x for x in files if not x.name.startswith(".")]) @property def waveform_path(self) -> Path: """Return the path to the waveforms.""" return self.data_path / "waveforms" @property def event_path(self) -> Path: """Return the path to the events.""" return self.data_path / "events" @property def station_path(self) -> Path: """Return the path to the stations.""" return self.data_path / "stations" # --- checks for if each type of data is downloaded @property def waveforms_need_downloading(self) -> bool: """ Returns True if waveform data need to be downloaded. """ return not self.waveform_path.exists() @property def events_need_downloading(self) -> bool: """ Returns True if event data need to be downloaded. """ return not self.event_path.exists() @property def stations_need_downloading(self) -> bool: """ Returns True if station data need to be downloaded. """ return not self.station_path.exists() @property @lru_cache() def waveform_client(self) -> Optional[WaveformClient]: """A cached property for a waveform client""" return self._load("waveform", self.waveform_path) @property @lru_cache() def event_client(self) -> Optional[EventClient]: """A cached property for an event client""" return self._load("event", self.event_path) @property @lru_cache() def station_client(self) -> Optional[StationClient]: """A cached property for a station client""" return self._load("station", self.station_path) @property @lru_cache() def _download_client(self): """ Return an instance of the IRIS client, subclasses can override to use different clients. 
""" return Client("IRIS") @_download_client.setter def _download_client(self, item): """just allow this to be overwritten""" self.__dict__["client"] = item def _log(self, msg): """Simple way to customize dataset logging.""" print(msg) def create_sha256_hash(self, path=None, hidden=False) -> dict: """ Create a sha256 hash of the dataset's data files. The output is stored in a simple json file. Keys are paths (relative to dataset base path) and values are files hashes. If you want to update/create the hash file in the dataset's source this can be done by passing the dataset's source_path as the path argument. Parameters ---------- path The path to which the hash data is saved. If None use data_path. hidden If True also include hidden files. """ kwargs = dict(exclude=self._hash_excludes, hidden=hidden) out = hash_directory(self.data_path, **kwargs) # sort dict to mess less with git sort_dict = OrderedDict(sorted(out.items())) # get path and dump json default_path = Path(self.data_path) / self._hash_filename _path = path or default_path hash_path = _path / self._hash_filename if _path.is_dir() else _path with hash_path.open("w") as fi: json.dump(sort_dict, fi, sort_keys=True, indent=2) return out def check_hashes(self, check_hash=False): """ Check that the files are all there and have the correct Hashes. Parameters ---------- check_hash If True check the hash of the files. Raises ------ FileHashChangedError If one of the file hashes is not as expeted. MissingDataFileError If one the data files was not downloaded. """ # If there is not a pre-existing hash file return hash_path = Path(self.data_path / self._hash_filename) if not hash_path.exists(): return # get old and new hash, and overlaps old_hash = json.load(hash_path.open()) current_hash = hash_directory(self.data_path, exclude=self._hash_excludes) overlap = set(old_hash) & set(current_hash) - set(self._hash_excludes) # get any files with new hashes has_changed = {x for x in overlap if old_hash[x] != current_hash[x]} missing = (set(old_hash) - set(current_hash)) - set( self._hash_excludes) if has_changed and check_hash: msg = (f"The hash for dataset {self.name} did not match the " f"expected values for the following files:\n{has_changed}") raise FileHashChangedError(msg) if missing: msg = f"Dataset {self.name} is missing files: \n{missing}" raise MissingDataFileError(msg) def check_version(self) -> bool: """ Check the version of the dataset. Verifies the version string in the dataset class definition matches the one saved on disk. Returns True if all is well else raises a DataVersionError. Parameters ---------- path Expected path of the version file. Raises ------ DataVersionError If any version problems are discovered. """ redownload_msg = f"Delete the following directory {self.data_path}" try: version = self.read_data_version() except (DataVersionError, ValueError): # failed to read version need_dl = (getattr(self, f"{x}s_need_downloading") for x in DATA_TYPES) if not any(need_dl): # Something is a little weird warn( "Version file is missing. Attempting to re-download the dataset." ) return False # Check the version number if get_version_tuple(version) < get_version_tuple(self.version): msg = f"Dataset version is out of date: {version} < {self.version}. " raise DataVersionError(msg + redownload_msg) elif get_version_tuple(version) > get_version_tuple(self.version): msg = f"Dataset version mismatch: {version} > {self.version}." msg = msg + " It may be necessary to reload the dataset." warn(msg + redownload_msg) return True # All is well. 
Continue. def write_version(self, path: Optional[Union[Path, str]] = None): """Write the version string to disk.""" version_path = path or self._version_path with version_path.open("w") as fi: fi.write(self.version) def read_data_version(self, path: Optional[Union[Path, str]] = None) -> str: """ Read the data version from disk. Return a 3 length tuple from the semantic version string (of the form xx.yy.zz). Raise a DataVersionError if not found. """ version_path = path or self._version_path if not version_path.exists(): raise DataVersionError(f"{version_path} does not exist!") with version_path.open("r") as fi: version_str = fi.read() validate_version_str(version_str) return version_str # --- Abstract properties subclasses should implement @property @abc.abstractmethod def name(self) -> str: """ Name of the dataset """ @property @abc.abstractmethod def version(self) -> str: """ Dataset version. Should be a str of the form x.y.z """ @property def version_tuple(self) -> Tuple[int, int, int]: """ Return a tuple of the version string. """ validate_version_str(self.version) vsplit = self.version.split(".") return int(vsplit[0]), int(vsplit[1]), int(vsplit[2]) # --- Abstract methods subclasses should implement def download_events(self) -> None: """ Method to ensure the events have been downloaded. Events should be written in an obspy-readable format to self.event_path. If not implemented this method will create an empty directory. """ self.event_path.mkdir(exist_ok=True, parents=True) def download_waveforms(self) -> None: """ Method to ensure waveforms have been downloaded. Waveforms should be written in an obspy-readable format to self.waveform_path. """ self.waveform_path.mkdir(exist_ok=True, parents=True) def download_stations(self) -> None: """ Method to ensure inventories have been downloaded. Station data should be written in an obspy-readable format to self.station_path. Since there is not yet a functional StationBank, this method must be implemented by subclass. """ self.station_path.mkdir(exist_ok=True, parents=True) def pre_download_hook(self): """Code to run before any downloads.""" def post_download_hook(self): """Code to run after any downloads.""" def __str__(self): return f"Dataset: {self.name}" def __repr__(self): return f"{str(self)} with description: {self.__doc__}"
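# --- Aside: a hedged sketch of what a concrete subclass might look like under
# the DataSet API above. The dataset name, version, and file contents are made
# up; a real dataset would override the download_* methods to fetch data
# rather than writing a placeholder catalog.
import obspy


class _ExampleBlastDataSet(DataSet):
    """ A hypothetical dataset holding a handful of example events. """

    name = "example_blasts"  # registered via __init_subclass__
    version = "0.1.0"

    def download_events(self) -> None:
        # write a (trivial) catalog so events_need_downloading becomes False
        self.event_path.mkdir(parents=True, exist_ok=True)
        obspy.Catalog().write(str(self.event_path / "catalog.xml"), "quakeml")


# once defined, it could be loaded (triggering the download logic) with:
# ds = DataSet.load_dataset("example_blasts")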
)
from spype.core import wrap
from spype.core.sbase import _SpypeBase
from spype.exceptions import UnresolvedDependency, ExitTask
from spype.types import valid_input, compatible_instance
from spype.utils import (
    iterate,
    apply_partial,
    de_args_kwargs,
    copy_func,
    get_default_names,
    function_or_class_name,
)

_fixtures = {**dict.fromkeys(PYPE_FIXTURES), **dict.fromkeys(WRAP_FIXTURES)}
EMPTY_FIXTURES = MapProxy(_fixtures)

# --------------------------- Auxiliary tasks


class _RunControl:
    """ A class to control executing callbacks in task's run method """

    _hard_exit = False

    def __init__(self, task: "Task", _fixtures, _callbacks, _predicates, args, kwargs):
        self.task = task
        # get fixtures passed in from wraps/pypes or use empty dicts
        _fixtures = _fixtures or EMPTY_FIXTURES
        self.meta = _fixtures.get("meta", {}) or {
class PlotEventSpectra(VerticalWithSubPlots):
    """ Class for plotting event spectra. """

    colors = {"Noise": "b", "P": "r", "S": "g"}
    _source_funcs = MapProxy({})
    _source_kwargs = MapProxy({})

    def __init__(self, source_group, event_id, limit=None, stations=None):
        super().__init__()
        self.source_group = source_group
        self.freqs = source_group.data.columns
        event_id = self._get_event_id(source_group.data, event_id)
        df = self._get_filtered_df(abs(source_group.data), event_id)
        source_df = source_group.source_df
        # slice meta to only get same selection as df
        meta = source_group.meta.loc[df.index]
        # init a dict of subplots {(phase: seed_id): axis}
        fig, ax_dict = self._get_axis_dict(meta, event_id, limit, stations)
        # init partials of source_models if used
        self._get_source_funcs()
        for (sta, chan), ax in ax_dict.items():
            sub_meta = meta[(meta.station == sta) & (meta.channel == chan)]
            data = df.loc[sub_meta.index]
            self._plot_channel(ax, data, meta)
            # plot fitted dataframe if applicable
            if source_df is not None and not source_df.empty:
                self._plot_fitted_source(ax, source_df.loc[sub_meta.index], sub_meta)
        # iterate all axes and turn on legends
        for ax in ax_dict.values():
            ax.legend(loc=3)

    def _get_source_funcs(self):
        """ set _source_funcs and _source_kwargs """
        # set a few variables; bail out if there is no fitted df
        sg = self.source_group
        data = sg.data
        source_df = sg.source_df
        if source_df is None or source_df.empty:
            return
        from mopy.sourcemodels import source_spectrum, SOURCE_MODEL_PARAMS

        # get frequencies and model function partials
        freqs = data.columns
        used_models = source_df.columns.get_level_values("model").unique()
        # create dict of partial source model functions
        funcs = {}
        wanted_kwargs = {}
        for model in used_models:
            source_params = SOURCE_MODEL_PARAMS[model]
            func = partial(source_spectrum, freqs=freqs, **source_params)
            funcs[model] = func
            wanted_kwargs[model] = set(inspect.signature(func).parameters)
        self._source_funcs = funcs
        self._source_kwargs = wanted_kwargs

    def _plot_fitted_source(self, ax, fit_df, meta):
        """ plot the fitted source spectra """
        used_models = fit_df.columns.get_level_values("model").unique()
        # filter out nulls
        fit_df = fit_df[~fit_df.isnull().any(axis=1)]
        # iterate each row that is not null
        for ind, row in fit_df.iterrows():
            phase = ind[0]
            color = self.colors[phase]
            for model in used_models:
                wanted_kwargs = self._source_kwargs[model]
                func = self._source_funcs[model]
                # get inputs to particular model function
                meta_dict = dict(meta.loc[ind[:2]])
                kwargs = dict(row.loc[model])
                kwargs.update(meta_dict)
                # get desired inputs from signature
                overlap = wanted_kwargs & set(kwargs)
                kwargs = {x: kwargs[x] for x in overlap}
                # calc spectra
                data = func(**kwargs)
                # get label and plot
                kwargs_str = "; ".join([f"{i}: {v:.2E}" for i, v in kwargs.items()])
                label = f"{model}_{phase}: {kwargs_str}"
                ax.plot(self.freqs, data, color=color, linestyle="--", label=label)

    def _plot_channel(self, ax, data, meta):
        """ plot the channel data. """
        for ind, row in data.iterrows():
            meta_row = meta.loc[ind]
            phase = ind[0]
            color = self.colors.get(phase, "k")
            ax.loglog(row.index, row.values, label=phase, color=color)
            ax.set_title(ind[-1])
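# --- Aside: a self-contained illustration (toy model function, not mopy's
# source_spectrum) of the partial + signature trick used above: freeze the
# frequency grid into a partial, then keep only the kwargs the model accepts.
import inspect
from functools import partial

import numpy as np


def _brune_like(freqs, omega0, fc):
    """ toy source model: flat low-frequency plateau with high-frequency falloff """
    return omega0 / (1.0 + (freqs / fc) ** 2)


_freqs = np.logspace(0, 2, 50)
_func = partial(_brune_like, freqs=_freqs)
_wanted = set(inspect.signature(_func).parameters)  # includes omega0 and fc

_row = {"omega0": 1e-7, "fc": 5.0, "moment": 1e13}  # extra keys get filtered out
_kwargs = {k: _row[k] for k in _wanted & set(_row)}
print(_func(**_kwargs).shape)  # (50,)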
ARRIVAL_COLUMNS = tuple(ARRIVAL_DTYPES)

# Waveform datatypes
WAVEFORM_DTYPES = OrderedDict(
    network=str,
    station=str,
    location=str,
    channel=str,
    starttime="datetime64[ns]",
    endtime="datetime64[ns]",
    sampling_period="timedelta64[ns]",
)

# The datatypes needed for putting waveform info into HDF5
WAVEFORM_DTYPES_INPUT = MapProxy(
    {i: _DATETIME_TYPE_MAP.get(v, v) for i, v in WAVEFORM_DTYPES.items()}
)

# keys used to identify UTC objects
UTC_KEYS = ("creation_time", "time", "reference")

# keys to pop out of a json object
JSON_KEYS_TO_POP = {"_id", "_summary"}

# seed id components
NSLC = ("network", "station", "location", "channel")

# the expected dimensions of the standard waveform array
DIMS = ("stream_id", "seed_id", "time")

# Small and BIG UTCDateTimes
class DataSet(abc.ABC): """ Class for downloading and serving datasets. By default the data will be downloaded to obsplus' datasets module but this can be changed using the base_path argument. All data will be saved in base_path / name. Parameters ---------- base_path The path to which the dataset will be saved as base_path / name. Attributes ---------- source_path The path to the directory containing the source of DataSet code. """ _entry_points = {} datasets = {} data_loaded = False # variables for hashing datafiles and versioning _version_filename = ".dataset_version.txt" _hash_filename = ".dataset_md5_hash.json" # the name of the file that saves where the data file were downloaded _saved_dataset_path_filename = ".dataset_data_path.txt" _hash_excludes = ( "readme.txt", _version_filename, _hash_filename, _saved_dataset_path_filename, ) # generic functions for loading data (WaveBank, event, stations) _load_funcs = MapProxy( dict( waveform=get_waveform_client, event=get_event_client, station=get_station_client, )) # flags to determine if data should be loaded into memory _load_waveforms = False _load_stations = True _load_events = True # cache for instantiated datasets _loaded_datasets = {} def __init_subclass__(cls, **kwargs): """ Register subclasses of datasets. """ assert isinstance(cls.name, str), "name must be a string" cls._validate_version_str(cls.version) # Register the subclass as a dataset. DataSet.datasets[cls.name.lower()] = cls # --- logic for loading and caching data def __init__(self, base_path=None): """ download and load data into memory. """ self.base_path = self._get_opsdata_path(base_path) # create the dataset's base directory self.data_path.mkdir(exist_ok=True, parents=True) # run the download logic if needed self._run_downloads() # cache loaded dataset self.data_loaded = True if not base_path and self.name not in self._loaded_datasets: self._loaded_datasets[self.name] = self.copy(deep=True) def _get_opsdata_path(self, opsdata_path: Optional[Path] = None) -> Path: """ Get the location where datasets are stored. Uses the following priorities: 1. Provided Path via opsdata_path 2. .data_path.txt file stored in the data source 3. An environmental name OPSDATA_PATH 4. The opsdata_path variable from obsplus.constants Returns ------- A path to the opsdata directory. """ if opsdata_path is None: opsdata_path = getattr(self._saved_data_path, "parent", None) if opsdata_path is None: # next look for env variable opsdata_path_default = obsplus.constants.OPSDATA_PATH opsdata_path = os.getenv("OPSDATA_PATH", opsdata_path_default) # ensure the data path exists _create_opsdata(opsdata_path) return Path(opsdata_path) def _run_downloads(self): """ Iterate each kind of data and download if needed. """ # Make sure the version of the dataset is okay version_ok = self.check_version() downloaded = False for what in DATA_TYPES: needs_str = f"{what}s_need_downloading" if getattr(self, needs_str) or (not version_ok): # this is the first type of data to be downloaded, run hook # and copy data from data source. 
if not downloaded and self.source_path.exists(): copy_tree(str(self.source_path), str(self.data_path)) self.pre_download_hook() downloaded = True # download data, test termination criteria print(f"downloading {what} data for {self.name} dataset ...") getattr(self, "download_" + what + "s")() assert not getattr(self, needs_str), f"Download {what} failed" print(f"finished downloading {what} data for {self.name}") self._write_readme() # make sure readme has been written # some data were downloaded, call post download hook if downloaded: self.check_hashes() self.post_download_hook() # write a new version file self.write_version() # write out a new saved datafile path self._save_data_path() def _load(self, what, path): """ Load the client-like objects from disk. """ try: client = self._load_funcs[what](path) except TypeError: warn(f"failed to load {what} from {path}, returning None") return None # load data into memory (eg load event bank contents into catalog) if getattr(self, f"_load_{what}s"): return getattr(client, f"get_{what}s")() else: return client def copy(self, deep=True): """ Return a copy of the dataset. """ return copy.deepcopy(self) if deep else copy.copy(self) def copy_to(self, destination: Optional[Union[str, Path]] = None): """ Copy the dataset to a destination. Parameters ---------- destination The destination to copy the dataset. It will be created if it doesnt exist. If None is provided use tmpfile to create a temporary directory. Returns ------- A new dataset object which refers to the copied files. """ return copy_dataset(self, destination) def get_fetcher(self) -> "obsplus.Fetcher": """ Return a Fetcher from the data. """ assert self.data_loaded, "data have not been loaded into memory" # get events/waveforms/stations and put into dict for the Fetcher fetch_kwargs = { "waveforms": self.waveform_client, "events": self.event_client, "stations": self.station_client, } return obsplus.Fetcher(**fetch_kwargs) __call__ = get_fetcher def _write_readme(self, filename="readme.txt"): """ Writes the classes docstring to a file. """ path = self.data_path / filename if not path.exists(): with path.open("w") as fi: fi.write(str(self.__doc__)) def _save_data_path(self, path=None): """ Save the path to where the data where downloaded in source folder. """ path = Path(path or self._path_to_saved_path_file) with path.open("w") as fi: fi.write(str(self.data_path)) @classmethod def load_dataset(cls, name: Union[str, "DataSet"]) -> "DataSet": """ Get a loaded dataset. Will ensure all files are downloaded and the appropriate data are loaded into memory. Parameters ---------- name The name of the dataset to load or a DataSet object. If a DataSet object is passed a copy of it will be returned. """ if isinstance(name, DataSet): return name.copy() name = name.lower() if name not in cls.datasets: # The dataset has not been discovered; try to load entry points cls._load_dataset_entry_points(name) if name in cls._entry_points: cls._entry_points[name].load() return load_dataset(name) msg = f"{name} is not in the known datasets {list(cls.datasets)}" raise ValueError(msg) if name in cls._loaded_datasets: # The dataset has already been loaded, simply return a copy return cls._loaded_datasets[name].copy() else: # The dataset has been discovered but not loaded; just loaded return cls.datasets[name]() def delete_data_directory(self): """ Delete the datafiles of a dataset. 
""" dataset = DataSet.load_dataset(self) shutil.rmtree(dataset.data_path) @classmethod def _load_dataset_entry_points(cls, name=None): """ load and cache the dataset entry points. """ look_for_name = name is not None and name not in cls._entry_points if not cls._entry_points or look_for_name: for ep in pkg_resources.iter_entry_points("obsplus.datasets"): cls._entry_points[ep.name] = ep # --- prescribed Paths for data @property def data_path(self) -> Path: """ Return a path to where the dataset's data was/will be downloaded. """ return self.base_path / self.name @property def source_path(self) -> Path: """ Return a path to the directory where the data files included with the dataset live. """ try: path = Path(inspect.getfile(self.__class__)).parent except (AttributeError, TypeError): path = Path(__file__) return path / self.name @property def _saved_data_path(self): """ Load the saved data source path, else return None """ expected_path = self._path_to_saved_path_file if expected_path.exists(): loaded_path = Path(expected_path.open("r").read()) if loaded_path.exists(): return loaded_path return None @property def _path_to_saved_path_file(self): """ A path to the file which keeps track of where data are downloaded. """ return self.source_path / self._saved_dataset_path_filename @property def _version_path(self): """ A path to the saved version file. """ return self.data_path / self._version_filename @property @lru_cache() def data_files(self) -> Tuple[Path, ...]: """ Return a list of top-level files associated with the dataset. Hidden files are ignored. """ file_iterator = self.source_path.glob("*") files = [x for x in file_iterator if not x.is_dir()] return tuple([x for x in files if not x.name.startswith(".")]) @property def waveform_path(self) -> Path: return self.data_path / "waveforms" @property def event_path(self) -> Path: return self.data_path / "events" @property def station_path(self) -> Path: return self.data_path / "stations" # --- checks for if each type of data is downloaded @property def waveforms_need_downloading(self): """ Returns True if waveform data need to be downloaded. """ return not self.waveform_path.exists() @property def events_need_downloading(self): """ Returns True if event data need to be downloaded. """ return not self.event_path.exists() @property def stations_need_downloading(self): """ Returns True if station data need to be downloaded. """ return not self.station_path.exists() @property @lru_cache() def waveform_client(self) -> Optional[WaveBank]: """ A cached property for a waveform client """ return self._load("waveform", self.waveform_path) @property @lru_cache() def event_client(self) -> Optional[EventBank]: """ A cached property for an event client """ return self._load("event", self.event_path) @property @lru_cache() def station_client(self) -> Optional[obspy.Inventory]: """ A cached property for a station client """ return self._load("station", self.station_path) @property @lru_cache() def _download_client(self): """ Return an instance of the IRIS client, subclasses can override to use different clients. """ return Client("IRIS") @_download_client.setter def _download_client(self, item): """ just allow this to be overwritten """ self.__dict__["client"] = item def create_md5_hash(self, path=_hash_filename, hidden=False) -> dict: """ Create an md5 hash of all dataset's files to ensure dataset integrity. Keys are paths (relative to dataset base path) and values are md5 hashes. Parameters ---------- path The path to which the hash data is saved. 
If None dont save. hidden If True also include hidden files """ out = md5_directory(self.data_path, exclude="readme.txt", hidden=hidden) if path is not None: # sort dict to mess less with git sort_dict = OrderedDict(sorted(out.items())) with (self.data_path / Path(path)).open("w") as fi: json.dump(sort_dict, fi) return out def check_hashes(self, check_hash=False): """ Check that the files are all there and have the correct Hashes. Parameters ---------- check_hash If True check the hash of the files. Returns ------- """ # TODO figure this out (data seem to have changed on IRIS' end) # If there is not a pre-existing hash file return hash_path = Path(self.data_path / self._hash_filename) if not hash_path.exists(): return # get old and new hash, and overlaps old_hash = json.load(hash_path.open()) current_hash = md5_directory(self.data_path, exclude=self._hash_excludes) overlap = set(old_hash) & set(current_hash) - set(self._hash_excludes) # get any files with new hashes has_changed = {x for x in overlap if old_hash[x] != current_hash[x]} missing = (set(old_hash) - set(current_hash)) - set( self._hash_excludes) if has_changed and check_hash: msg = (f"The md5 hash for dataset {self.name} did not match the " f"expected values for the following files:\n{has_changed}") raise FileHashChangedError(msg) if missing: msg = f"The following files are missing: \n{missing}" raise MissingDataFileError(msg) def check_version(self): """ Check the version of the dataset. Verifies the version string in the dataset class definition matches the one saved on disk. Parameters ---------- path Expected path of the version file. Raises ------ DataVersionError If any version problems are discovered. Returns ------- version_ok : bool True if the version matches what is expected. """ redownload_msg = f"Delete the following directory {self.data_path}" try: version = self.read_data_version() except DataVersionError: # The data version cannot be read from disk need_dl = (getattr(self, f"{x}s_need_downloading") for x in DATA_TYPES) if not any(need_dl): # Something is a little weird warn( "Version file is missing. Attempting to re-download the dataset." ) return False # Check the version number if version < self.version: msg = f"Dataset version is out of date: {version} < {self.version}. " raise DataVersionError(msg + redownload_msg) elif version > self.version: msg = f"Dataset version mismatch: {version} > {self.version}." msg = msg + " It may be necessary to reload the dataset." warn(msg + redownload_msg) return True # All is well. Continue. def write_version(self): """ Write the version string to disk. """ version_path = self._version_path with version_path.open("w") as fi: fi.write(self.version) def read_data_version(self): """ Read the data version from disk. Raise a DataVersionError if not found. """ version_path = self._version_path if not version_path.exists(): raise DataVersionError(f"{version_path} does not exist!") with version_path.open("r") as fi: version_str = fi.read() self._validate_version_str(version_str) return version_str @staticmethod def _validate_version_str(version_str): """ Check the version string is of the form x.y.z. If the version string is not valid raise DataVersionError. 
""" is_str = isinstance(version_str, str) has_3 = len(version_str.split(".")) == 3 if not (is_str and has_3): msg = f"version must be a string of the form x.y.z, not {version_str}" raise DataVersionError(msg) # --- Abstract properties subclasses should implement @property @abc.abstractmethod def name(self) -> str: """ Name of the dataset """ @property @abc.abstractmethod def version(self) -> str: """ Dataset version. Should be a str of the form x.y.z """ # --- Abstract methods subclasses should implement def download_events(self) -> None: """ Method to ensure the events have been downloaded. Events should be written in an obspy-readable format to self.event_path. If not implemented this method will create an empty directory. """ self.event_path.mkdir(exist_ok=True, parents=True) def download_waveforms(self) -> None: """ Method to ensure waveforms have been downloaded. Waveforms should be written in an obspy-readable format to self.waveform_path. """ self.waveform_path.mkdir(exist_ok=True, parents=True) @abc.abstractmethod def download_stations(self) -> None: """ Method to ensure inventories have been downloaded. Station data should be written in an obspy-readable format to self.station_path. Since there is not yet a functional StationBank, this method must be implemented by subclass. """ def pre_download_hook(self): """ Code to run before any downloads. """ def post_download_hook(self): """ code to run after any downloads. """ def __str__(self): return f"Dataset: {self.name}" def __repr__(self): return f"{str(self)} with description: {self.__doc__}"
class _Bank(ABC): """ The abstract base class for ObsPlus' banks. Used to access local archives in a client-like fashion. """ # hdf5 compression defaults _complib = "blosc" _complevel = 9 # attributes subclasses need to define ext = "" bank_path: Path = "" namespace = "" index_name = ".index.h5" # name of index file executor = None # an executor for using parallelism # optional str defining the directory structure and file name schemes path_structure = None name_structure = None # the minimum obsplus version. If not met delete index and re-index # bump when database schema change. _min_version = "0.0.3" # status bar attributes _bar_update_interval = 50 # number of files before updating bar _min_files_for_bar = 100 # min number of files before using bar enabled _read_func: callable # function for reading datatype # required dtypes for input to storage layer _dtypes_input: Mapping = MapProxy({}) # required dtypes for output from bank _dtypes_output: Mapping = MapProxy({}) # the index cache (can greatly reduce IO efforts) _index_cache: Optional[_IndexCache] = None @abstractmethod def read_index(self, **kwargs) -> pd.DataFrame: """Read the index filtering on various params.""" @abstractmethod def update_index(self: BankType) -> BankType: """Update the index.""" @abstractmethod def last_updated_timestamp(self) -> Optional[float]: """ Get the last modified time stored in the index. If not available return None. """ @property def last_updated(self) -> Optional[np.datetime64]: """ Get the last time (UTC) that the bank was updated. """ return to_datetime64(self.last_updated_timestamp) @abstractmethod def _read_metadata(self) -> pd.DataFrame: """Return a dictionary of metadata.""" # --- path/node related objects @property def index_path(self): """Return the expected path to the index file.""" return Path(self.bank_path) / self.index_name @property def _index_node(self): """Return the node/table where the index information is stored.""" return "/".join([self.namespace, "index"]) @property def _index_version(self) -> str: """Get the version of obsplus used to create the index.""" return self._read_metadata()["obsplus_version"].iloc[0] @property def _time_node(self): """The node/table where the update time information is stored.""" return "/".join([self.namespace, "last_updated"]) @property def _meta_node(self): """The node/table where the update metadata is stored.""" return "/".join([self.namespace, "metadata"]) @property def _version_or_none(self) -> Optional[str]: """Return the version string or None if it doesn't yet exist.""" try: version = self._index_version except (FileNotFoundError, DatabaseError): return return version def _enforce_min_version(self): """ Check version of obsplus used to create index and delete index if the minimum version requirement is not met. """ version = self._version_or_none if version is not None: min_version_tuple = get_version_tuple(self._min_version) version_tuple = get_version_tuple(version) if min_version_tuple > version_tuple: msg = ( f"The indexing schema has changed since {self._min_version} " f"the index will be recreated." ) warnings.warn(msg) os.remove(self.index_path) def _warn_on_newer_version(self): """ Issue a warning if the bank was created by a newer version of obsplus. If this is the case, there is no guarantee it will work. 
""" version = self._version_or_none if version is not None: obsplus_version = get_version_tuple(obsplus.__last_version__) bank_version = get_version_tuple(version) if bank_version > obsplus_version: msg = ( f"The bank was created with a newer version of ObsPlus (" f"{version}), you are running ({obsplus.__last_version__})," f"You may encounter problems, consider updating ObsPlus." ) warnings.warn(msg) def _unindexed_iterator(self, paths: Optional[bank_subpaths_type] = None): """Return an iterator of potential unindexed files.""" # get mtime, subtract a bit to avoid odd bugs mtime = None last_updated = self.last_updated_timestamp # this needs db so only call once if last_updated is not None: mtime = last_updated - 0.001 # get paths to iterate bank_path = self.bank_path if paths is None: paths = self.bank_path else: paths = [ f"{self.bank_path}/{x}" if str(bank_path) not in str(x) else str(x) for x in iterate(paths) ] # return file iterator return iter_files(paths, ext=self.ext, mtime=mtime) def _measure_iterator(self, iterable: Iterable, bar: Optional[ProgressBar] = None): """ A generator to yield un-indexed files and update progress bar. Parameters ---------- iterable Any iterable to yield. bar Any object which has a 'update' method. """ # get progress bar bar = self.get_progress_bar(bar) # get the iterator for num, obj in enumerate(iterable): # update bar if count is in update interval if bar is not None and num % self._bar_update_interval == 0: bar.update(num) yield obj # finish progress bar getattr(bar, "finish", lambda: None)() # call finish if bar exists def _make_meta_table(self): """ get a dataframe of meta info """ meta = dict( path_structure=self.path_structure, name_structure=self.name_structure, obsplus_version=obsplus.__last_version__, ) return pd.DataFrame(meta, index=[0]) def get_service_version(self): """ Return the version of obsplus used to create index. """ return self._index_version def ensure_bank_path_exists(self, create=False): """ Ensure the bank_path exists else raise an BankDoesNotExistError. If create is True, simply create the bank. """ path = Path(self.bank_path) if create: path.mkdir(parents=True, exist_ok=True) if not path.is_dir(): msg = f"{path} is not a directory, cant read bank" raise BankDoesNotExistError(msg) def get_progress_bar(self, bar=None) -> Optional[ProgressBar]: """ Return a progress bar instance based on bar parameter. If bar is False, return None. If bar is None return default Bar If bar is a subclass of ProgressBar, init class and set max_values. If bar is an instance of ProgressBar, return it. """ # conditions to bail out early if bar is False: # False indicates no bar is to be used return None elif isinstance(bar, ProgressBar): # bar is already instantiated return bar # next, count number of files num_files = sum([1 for _ in self._unindexed_iterator()]) if num_files < self._min_files_for_bar: # not enough files to use bar return None # instantiate bar and return kwargs = {"min_value": self._min_files_for_bar, "max_value": num_files} # an instance should be init'ed if isinstance(bar, type) and issubclass(bar, ProgressBar): return bar(**kwargs) elif bar is None: return get_progressbar(**kwargs) else: msg = f"{bar} is not a valid input for get_progress_bar" raise ValueError(msg) def clear_cache(self): """ Clear the index cache if the bank is using one. """ if self._index_cache is not None: self._index_cache.clear_cache() @property def _max_workers(self): """ Return the max number of workers allowed by the executor. 
If the Executor has no attribute `_max_workers` use the number of CPUs instead. If there is no executor assigned to bank instance return 1. """ executor = getattr(self, "executor", None) if executor is not None: return getattr(executor, "_max_workers", CPU_COUNT) return 1 def _map(self, func, args, chunksize=1): """ Map the args to function, using executor if defined, else perform in serial. """ if self.executor is not None: return self.executor.map(func, args, chunksize=chunksize) else: return (func(x) for x in args) @classmethod def load_example_bank( cls: BankType, dataset: str = "default_test", path: Optional[Union[str, Path]] = None, ) -> BankType: """ Create an example bank which is safe to modify. Copies relevant files from a dataset to a specified path, or a temporary directory if None is specified. Parameters ---------- dataset The name of the dataset. path The path to which the dataset files will be copied. If None just create a temporary directory. """ # determine which directory in the dataset this bank needs data_types = { obsplus.EventBank: "event_path", obsplus.StationBank: "station_path", obsplus.WaveBank: "waveform_path", } ds = obsplus.load_dataset(dataset) destination = Path(tempfile.mkdtemp() if path is None else path) / "temp" assert cls in data_types, f"{cls} Bank type not supported." path_to_copy = getattr(ds, data_types[cls]) shutil.copytree(path_to_copy, destination) return cls(destination) def __repr__(self): """Return the class name with bank path.""" name = type(self).__name__ return f"{name}(base_path={self.bank_path})" __str__ = __repr__
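# --- Aside: a hedged usage sketch. Assuming obsplus is installed and the
# bundled "default_test" dataset is available locally (it may be downloaded on
# first use), load_example_bank copies the relevant dataset directory to a
# scratch location so the resulting bank can be modified and re-indexed safely.
import obsplus

bank = obsplus.EventBank.load_example_bank("default_test")
bank.update_index()
print(bank.read_index().head())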