Example 1
def _get_dtypes(cls):
    """ return a dict of attributes and datatypes for properties. """
    # handle special cases
    if cls in COMPLEX_TYPES:
        return COMPLEX_TYPES[cls]
    out = {"_parent_id_": str, "_event_id_": str}
    # get containers, properties, and types
    property_dict = getattr(cls, "_property_dict", {})
    # iterate properties, handle special case else use type
    for item, val in property_dict.items():
        # if a class that needs to be flattened
        val = type(val) if not isinstance(val, type) else val
        if hasattr(val, "_property_dict") or val in COMPLEX_TYPES:
            sub_dict = _get_dtypes(val)
            for item_, val_ in sub_dict.items():
                if item_ in {"_parent_id_", "_event_id_"}:
                    continue
                out[f"__{item}__{item_}"] = val_
        else:  # handle simple, transform types
            out[item] = SIMPLE_TYPES.get(val, val)
    # add containers (always strings referring to other tables) and return
    containers = getattr(cls, "_containers", [])
    for container in containers:
        out[f"_{container}"] = str
    return MapProxy(out)  # mapproxy to simulate immutability
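
For context, MapProxy here is types.MappingProxyType (imported under that alias in Example 7 below). A minimal sketch of the behavior the closing comment relies on: the proxy is a read-only view, so callers cannot mutate the returned dtype mapping.

from types import MappingProxyType as MapProxy

dtypes = MapProxy({"time": "datetime64[ns]", "magnitude": float})
print(dtypes["magnitude"])     # normal read access works: <class 'float'>
try:
    dtypes["magnitude"] = int  # any attempt to mutate the proxy raises
except TypeError as exc:
    print(f"read-only: {exc}")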
Example 2
File: wrap.py Project: sboltz/spype
    def __call__(self, *args, _pype_fixtures=None, **kwargs):
        fixtures = MapProxy({**(_pype_fixtures or {}), **self._wrap_fixtures,
                             **self._partials})
        out = self.task.run(*args, **kwargs, _fixtures=fixtures,
                            _callbacks=self._callbacks)
        if out is None:
            raise TaskReturnedNone
        return args_kwargs(out, adapter=self.adapter)
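
The merge above relies on dict-unpacking precedence: keys supplied later win, so pype-level fixtures are overridden by wrap-level fixtures, which are in turn overridden by partials. A small illustration with made-up values:

from types import MappingProxyType as MapProxy

pype_fixtures = {"pype": "<pype>", "meta": "from_pype"}
wrap_fixtures = {"meta": "from_wrap"}
partials = {"meta": "from_partial"}

# later unpacked mappings override earlier ones on key collisions
fixtures = MapProxy({**pype_fixtures, **wrap_fixtures, **partials})
print(fixtures["meta"])  # -> from_partial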
Example 3
    def validate(self):
        """
        Run checks on the pype to detect potential problems.

        Will raise an InvalidPype exception if compatibility issues are found,
        or a TypeError if any invalid callbacks are found.
        """
        # validate task compatibility
        self.flow.validate(extra_params=MapProxy(self._partials))
        # validate callbacks
        for wrap_ in self.flow.wraps:
            wrap_._validate_callbacks()
Example 4
    def __call__(self, *args, _pype_fixtures=None, **kwargs):
        fixtures = MapProxy({
            **(_pype_fixtures or EMPTY_PYPE_FIXTURES),
            **self._wrap_fixtures,
            **self._partials,
        })

        out = self.task.run(
            *args,
            **kwargs,
            _fixtures=fixtures,
            _callbacks=self._callbacks,
            _predicate=self._predicates,
        )
        if out is None:
            raise TaskReturnedNone
        return args_kwargs(out, adapter=self.adapter)
Example 5
    def _run_queue(self, _meta, que):
        """ run the queue until complete """
        # run the queue until complete or all tasks are waiting on agg results
        assert self.flow.get_input_wrap().task is task.pype_input
        fixtures = MapProxy({"meta": _meta, "pype": self, **self._partials})

        while len(que):
            wrap_, (args, kwargs) = que.pop()
            wrap_: wrap.Wrap
            try:
                output = wrap_(*args, **kwargs, _pype_fixtures=fixtures)
            except UnresolvedDependency:  # task needs to be put back
                _meta["defer_count"][wrap_] += 1  # up task deferment counter
                que.appendleft((wrap_, (args, kwargs)))
                continue
            except TaskReturnedNone:  # task returned None
                continue
            else:  # everything went fine
                _meta["outputs"][wrap_.task] = output
                for neighbor in self.flow.neighbors(wrap_):  # queue neighbors
                    neighbor._queue_up(output, _meta, que, sending_wrap=wrap_)
        # run tasks that waited for object scoped aggregations
        self._run_aggregations(_meta, que)
        _meta["output"].append(de_args_kwargs(*output))
Example 6
class _Bank(ABC):
    """
    The abstract base class for ObsPlus' banks. Used to access local
    archives in a client-like fashion.
    """

    # hdf5 compression defaults
    _complib = "blosc"
    _complevel = 9
    # attributes subclasses need to define
    ext = ""
    bank_path = ""
    namespace = ""
    index_name = ".index.h5"  # name of index file
    executor = None  # an executor for using parallelism
    # optional str defining the directory structure and file name schemes
    path_structure = None
    name_structure = None
    # the minimum obsplus version. If not met delete index and re-index
    # bump when database schema change.
    _min_version = "0.0.3"
    # status bar attributes
    _bar_update_interval = 50  # number of files before updating bar
    _min_files_for_bar = 100  # min number of files before the progress bar is used
    _read_func: callable  # function for reading datatype
    # required dtypes for input to storage layer
    _dtypes_input: Mapping = MapProxy({})
    # required dtypes for output from bank
    _dtypes_output: Mapping = MapProxy({})
    # the index cache (can greatly reduce IO efforts)
    _index_cache: Optional[_IndexCache] = None

    @abstractmethod
    def read_index(self, **kwargs) -> pd.DataFrame:
        """ read the index filtering on various params """

    @abstractmethod
    def update_index(self: BankType) -> BankType:
        """ update the index """

    @abstractmethod
    def last_updated(self) -> Optional[float]:
        """ get the last modified time stored in the index. If
        Not available return None
        """

    @abstractmethod
    def _read_metadata(self) -> pd.DataFrame:
        """ Return a dictionary of metadata. """

    # --- path/node related objects

    @property
    def index_path(self):
        """
        The expected path to the index file.
        """
        return join(self.bank_path, self.index_name)

    @property
    def _index_node(self):
        """
        The node, or table, in the database where the index information is stored.
        """
        return "/".join([self.namespace, "index"])

    @property
    def _index_version(self) -> str:
        """
        Get the version of obsplus used to create the index.
        """
        return self._read_metadata()["obsplus_version"].iloc[0]

    @property
    def _time_node(self):
        """
        The node, or table, in the database where the update time information is stored.
        """
        return "/".join([self.namespace, "last_updated"])

    @property
    def _meta_node(self):
        """
        The node, or table, in the database where the update metadata is stored.
        """
        return "/".join([self.namespace, "metadata"])

    def _enforce_min_version(self):
        """
        Check version of obsplus used to create index and delete index if the
        minimum version requirement is not met.
        """
        try:
            version = self._index_version
        except (FileNotFoundError, DatabaseError):
            return
        else:
            if self._min_version > version:
                msg = (
                    f"the indexing schema has changed since {self._min_version} "
                    f"the index will be recreated")
                warnings.warn(msg)
                os.remove(self.index_path)

    def _unindexed_iterator(self):
        """ return an iterator of potential unindexed files """
        # get mtime, subtract a bit to avoid odd bugs
        mtime = None
        last_updated = self.last_updated  # this needs db so only call once
        if last_updated is not None:
            mtime = last_updated - 0.001
        # return file iterator
        return iter_files(self.bank_path, ext=self.ext, mtime=mtime)

    def _measured_unindexed_iterator(self, bar: Optional[ProgressBar] = None):
        """
        A generator to yield un-indexed files and update the progress bar.

        Parameters
        ----------
        bar
            Any object with an update method.

        Yields
        ------
        Paths of files which have not yet been indexed.
        """
        # get progress bar
        bar = self.get_progress_bar(bar)
        # get the iterator
        for num, path in enumerate(self._unindexed_iterator()):
            # update bar if count is in update interval
            if bar is not None and num % self._bar_update_interval == 0:
                bar.update(num)
            yield path
        # finish progress bar
        getattr(bar, "finish", lambda: None)()  # call finish if bar exists

    def _make_meta_table(self):
        """ get a dataframe of meta info """
        meta = dict(
            path_structure=self.path_structure,
            name_structure=self.name_structure,
            obsplus_version=obsplus.__version__,
        )
        return pd.DataFrame(meta, index=[0])

    def get_service_version(self):
        """ Return the version of obsplus used to create index. """
        return self._index_version

    def ensure_bank_path_exists(self, create=False):
        """
        Ensure the bank_path exists, else raise a BankDoesNotExistError.

        If create is True, simply create the bank.
        """
        path = Path(self.bank_path)
        if create:
            path.mkdir(parents=True, exist_ok=True)
        if not path.is_dir():
            msg = f"{path} is not a directory, cant read bank"
            raise BankDoesNotExistError(msg)

    def get_progress_bar(self, bar=None) -> Optional[ProgressBar]:
        """
        Return a progress bar instance based on the bar parameter.

        If bar is False, return None.
        If bar is None, return the default progress bar.
        If bar is a subclass of ProgressBar, instantiate it and set the min/max values.
        If bar is an instance of ProgressBar, return it.
        """
        print(f"updating or creating event index for {self.bank_path}")
        # conditions to bail out early
        if bar is False:  # False indicates no bar is to be used
            return None
        elif isinstance(bar, ProgressBar):  # bar is already instantiated
            return bar
        # next, count number of files
        num_files = sum([1 for _ in self._unindexed_iterator()])
        if num_files < self._min_files_for_bar:  # not enough files to use bar
            return None
        # instantiate bar and return
        kwargs = {"min_value": self._min_files_for_bar, "max_value": num_files}
        # an instance should be init'ed
        if isinstance(bar, type) and issubclass(bar, ProgressBar):
            return bar(**kwargs)
        elif bar is None:
            return get_progressbar(**kwargs)
        else:
            msg = f"{bar} is not a valid input for get_progress_bar"
            raise ValueError(msg)

    def clear_cache(self):
        """
        Clear the index cache if the bank is using one.
        """
        if self._index_cache is not None:
            self._index_cache.clear_cache()

    @property
    def _max_workers(self):
        """
        Return the max number of workers allowed by the executor.

        If the executor has no `_max_workers` attribute, use the number of
        CPUs instead. If there is no executor assigned to the bank instance,
        return 1.
        """
        executor = getattr(self, "executor", None)
        if executor is not None:
            return getattr(executor, "_max_workers", CPU_COUNT)
        return 1

    def _map(self, func, args, chunksize=None):
        """
        Map args onto func, using the executor if one is defined, else run
        in serial.
        """
        if self.executor is not None:
            return self.executor.map(func, args, chunksize=chunksize)
        else:
            return (func(x) for x in args)
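
The _map helper above gives callers the same lazy-iterable interface whether or not an executor is set. A hedged usage sketch with a stand-in class (not the actual _Bank) and a standard-library executor:

from concurrent.futures import ThreadPoolExecutor

class DummyBank:
    executor = None

    def _map(self, func, args, chunksize=None):
        if self.executor is not None:
            return self.executor.map(func, args, chunksize=chunksize or 1)
        return (func(x) for x in args)

bank = DummyBank()
print(list(bank._map(str.upper, ["a", "b"])))   # serial generator
bank.executor = ThreadPoolExecutor(max_workers=2)
print(list(bank._map(str.upper, ["a", "b"])))   # mapped through the executor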
Example 7
"""
Constants, and their explanations, used by sflow
"""
from types import MappingProxyType as MapProxy
from typing import Callable, Sequence, Union, Tuple, Dict, Any, Hashable

# ---------------------- Fixtures/callbacks

# supported generic task fixtures
TASK_FIXTURES = MapProxy(
    dict(
        signature="The signature of the task's run method",
        task="A reference to the current task object",
        self="A reference to the current task object",
        e="The exception object if one was raised, else None",
        inputs="A tuple of (args, kwargs) passed as input to current task",
        args="A tuple of arguments passed to current task",
        kwargs="A dict of keywork arguments passed to current task",
        outputs="The outputs of calling a task, or None",
    ))

# supported wrap fixtures
WRAP_FIXTURES = MapProxy(
    dict(wrap="A refence to the wrap object around the task"))

# supported pype fixtures
PYPE_FIXTURES = MapProxy(
    dict(
        pype="A reference to the pype object running the task, or None",
        meta="The controld dict running the task que (advanced fixture)",
        print_flow="If True print the inputs/outputs of each task to screen",
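
These fixture mappings also double as the key sets from which default (empty) fixture dicts are built elsewhere in spype (see the EMPTY_FIXTURES line in Example 9). A small sketch of that idiom, with abbreviated descriptions standing in for the real ones:

from types import MappingProxyType as MapProxy

PYPE_FIXTURES = MapProxy(dict(pype="...", meta="...", print_flow="..."))
WRAP_FIXTURES = MapProxy(dict(wrap="..."))

# dict.fromkeys keeps only the fixture names, defaulting every value to None
EMPTY_FIXTURES = MapProxy({**dict.fromkeys(PYPE_FIXTURES),
                           **dict.fromkeys(WRAP_FIXTURES)})
print(EMPTY_FIXTURES["wrap"])  # -> None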
Example 8
class DataSet(abc.ABC):
    """
    Abstract Base Class for downloading and serving datasets.

    This is not intended to be used directly, but rather through subclassing.

    Parameters
    ----------
    base_path
        The path to which the dataset will be saved.

    Attributes
    ----------
    data_path
        The path containing the data. By default it is base_path / name.
    source_path
        The path which contains the original files included in the dataset
        before download. By default this is found in the same directory as
        the dataset's code (.py) file in a folder with the same name as the
        dataset.

    Notes
    -----
        Importantly, each dataset references *two* directories, the source_path
        and data_path. The source_path contains all data included within the
        dataset and should not be altered. The data_path has a copy of
        everything in the source_path, plus the files created during the
        downloading process.

        The base_path (the parent of data_path) is resolved for each
        dataset using the following priorities:

            1. The `base_path` provided to `Dataset`'s __init__ method.
            2. .data_path.txt file stored in the data source
            3. An environment variable named OPSDATA_PATH
            4. The opsdata_path variable from obsplus.constants

        By default the data will be downloaded to the user's home directory
        in a folder called "opsdata", but again, this is easily changed
        by setting the OPSDATA_PATH environmental variable.
    """

    _entry_points = {}
    _datasets = {}
    data_loaded = False
    # variables for hashing datafiles and versioning
    _version_filename = "dataset_version.txt"
    _hash_filename = "dataset_hash.json"
    # the name of the file that saves where the data files were downloaded
    _saved_dataset_path_filename = ".dataset_data_path.txt"
    _hash_excludes = (
        "readme.txt",
        _version_filename,
        _hash_filename,
        _saved_dataset_path_filename,
    )
    # generic functions for loading data (WaveBank, event, stations)
    _load_funcs = MapProxy(
        dict(
            waveform=get_waveform_client,
            event=get_event_client,
            station=get_station_client,
        ))
    # flags to determine if data should be loaded into memory
    _load_waveforms = False
    _load_stations = True
    _load_events = True
    # cache for instantiated datasets
    _loaded_datasets = {}
    _verbose = True

    def __init_subclass__(cls, **kwargs):
        """Register subclasses of datasets."""
        assert isinstance(cls.name, str), "name must be a string"
        validate_version_str(cls.version)
        # Register the subclass as a dataset.
        DataSet._datasets[cls.name.lower()] = cls

    # --- logic for loading and caching data

    def __init__(self, base_path=None):
        """download and load data into memory."""
        self.base_path = self._get_opsdata_path(base_path)
        # create the dataset's base directory
        self.data_path.mkdir(exist_ok=True, parents=True)
        # run the download logic if needed
        self._run_downloads()
        # cache loaded dataset
        self.data_loaded = True
        if not base_path and self.name not in self._loaded_datasets:
            self._loaded_datasets[self.name] = self.copy(deep=True)

    def _get_opsdata_path(self, opsdata_path: Optional[Path] = None) -> Path:
        """
        Get the location where datasets are stored.

        Returns
        -------
        A path to the opsdata directory.
        """
        if opsdata_path is None:
            opsdata_path = getattr(self._saved_data_path, "parent", None)
            if opsdata_path is None:
                # next look for env variable
                opsdata_path_default = obsplus.constants.OPSDATA_PATH
                opsdata_path = os.getenv("OPSDATA_PATH", opsdata_path_default)
        # ensure the data path exists
        _create_opsdata(opsdata_path)
        return Path(opsdata_path)

    def _run_downloads(self) -> None:
        """Iterate each kind of data and download if needed."""
        # Make sure the version of the dataset is okay
        version_ok = self.check_version()
        downloaded = False
        for what in DATA_TYPES:
            needs_str = f"{what}s_need_downloading"
            if getattr(self, needs_str) or (not version_ok):
                # this is the first type of data to be downloaded, run hook
                # and copy data from data source.
                if not downloaded and self.source_path.exists():
                    copy_tree(str(self.source_path), str(self.data_path))
                    self.pre_download_hook()
                downloaded = True
                # download data, test termination criteria
                self._log(
                    f"downloading {what} data for {self.name} dataset ...")
                getattr(self, "download_" + what + "s")()
                assert not getattr(self, needs_str), f"Download {what} failed"
                self._log(f"finished downloading {what} data for {self.name}")
                self._write_readme()  # make sure readme has been written
        # some data were downloaded, call post download hook
        if downloaded:
            self.check_hashes()
            self.post_download_hook()
            # write a new version file
            self.write_version()
            # write out a new saved datafile path
            self._save_data_path()

    def _load(self, what, path):
        """Load the client-like objects from disk."""
        try:
            client = self._load_funcs[what](path)
        except TypeError:
            warn(f"failed to load {what} from {path}, returning None")
            return None
        # load data into memory (eg load event bank contents into catalog)
        if getattr(self, f"_load_{what}s"):
            return getattr(client, f"get_{what}s")()
        else:
            return client

    def copy(self: DataSetType, deep=True) -> DataSetType:
        """
        Return a copy of the dataset.

        Parameters
        ----------
        deep
            If True deep copy the objects attached to the dataset.

        Notes
        -----
        This only copies data in memory, not on disk. If you plan to make
        any changes to the dataset's on disk resources please use
        :meth:`~obsplus.Dataset.copy_to`.
        """
        return copy.deepcopy(self) if deep else copy.copy(self)

    def copy_to(self: DataSetType,
                destination: Optional[Union[str, Path]] = None) -> DataSetType:
        """
        Copy the dataset to a destination.

        If the destination already exists simply do nothing.

        Parameters
        ----------
        destination
            The destination to copy the dataset. It will be created if it
            doesn't exist. If None is provided use tmpfile to create a temporary
            directory.

        Returns
        -------
        A new dataset object which refers to the copied files.
        """
        return copy_dataset(self, destination)

    def get_fetcher(self, **kwargs) -> "obsplus.Fetcher":
        """
        Return a Fetcher from the data.

        kwargs are passed to :class:`~obsplus.structures.Fetcher`'s constructor.
        See its documentation for acceptable kwargs.
        """
        assert self.data_loaded, "data have not been loaded into memory"
        # get events/waveforms/stations and put into dict for the Fetcher
        fetch_kwargs = {
            "waveforms": self.waveform_client,
            "events": self.event_client,
            "stations": self.station_client,
        }
        fetch_kwargs.update(kwargs)
        return obsplus.Fetcher(**fetch_kwargs)

    __call__ = get_fetcher

    def _write_readme(self, filename="readme.txt"):
        """Writes the classes docstring to a file."""
        path = self.data_path / filename
        if not path.exists():
            with path.open("w") as fi:
                fi.write(str(self.__doc__))

    def _save_data_path(self, path=None):
        """Save the path to where the data where downloaded in source folder."""
        path = Path(path or self._path_to_saved_path_file)
        path.parent.mkdir(exist_ok=True, parents=True)
        with path.open("w") as fi:
            fi.write(str(self.data_path))

    @classmethod
    def load_dataset(cls: DataSetType,
                     name: Union[str, "DataSet"],
                     silent=False) -> DataSetType:
        """
        Get a loaded dataset.

        Will ensure all files are downloaded and the appropriate data are
        loaded into memory.

        Parameters
        ----------
        name
            The name of the dataset to load or a DataSet object. If a DataSet
            object is passed a copy of it will be returned.

        Examples
        --------
        >>> # --- Load an example dataset for testing
        >>> import obsplus
        >>> ds = obsplus.load_dataset('default_test')
        >>> # If you plan to make changes to the dataset be sure to copy it first
        >>> # The following will copy all files in the dataset to a tmpdir
        >>> ds2 = obsplus.copy_dataset('default_test')

        >>> # --- Use dataset clients to load waveforms, stations, and events
        >>> cat = ds.event_client.get_events()
        >>> st = ds.waveform_client.get_waveforms()
        >>> inv = ds.station_client.get_stations()

        >>> # --- get a fetcher for more "dataset aware" querying
        >>> fetcher = ds.get_fetcher()
        """
        # Just copy and return if a dataset is passed.
        if isinstance(name, DataSet):
            return name.copy()
        name = name.lower()
        cls._load_dataset_entry_point(name)
        if name not in cls._datasets:
            # The dataset was not found, even after loading entry points
            msg = f"{name} is not in the known datasets {list(cls._datasets)}"
            raise ValueError(msg)
        if name in cls._loaded_datasets:
            # The dataset has already been loaded, simply return a copy
            return cls._loaded_datasets[name].copy()
        else:  # The dataset has been discovered but not loaded; just load it
            return cls._datasets[name]()

    def delete_data_directory(self):
        """
        Delete the datafiles of a dataset.

        This will force the data to be re-copied from the source files and
        download logic to be run.
        """
        dataset = DataSet.load_dataset(self)
        shutil.rmtree(dataset.data_path)

    @classmethod
    def _load_dataset_entry_point(cls, name=None, load=True):
        """
        Load and cache the dataset entry points.

        Parameters
        ----------
        name
            A string id of the dataset
        load
            If True, load the code associated with the entry point.
        """
        def _load_ep(ep):
            """Load the entry point, ignore removed datasets."""
            # If a plugin was registered but no longer exists it can raise.
            with suppress(ModuleNotFoundError):
                ep.load()
                assert name in cls._datasets, "dataset should be registered."

        if name in cls._entry_points:  # entry point has been registered
            if name in cls._datasets:  # and loaded, return
                return
            elif load:  # it has not been loaded, try loading it.
                _load_ep(cls._entry_points[name])
        # it has not been found, iterate entry points and update
        eps = {x.name: x for x in iter_entry_points("obsplus.datasets")}
        cls._entry_points.update(eps)
        # stop if we don't need to load
        if not load:
            return
        # now iterate through all names, or just selected name, and load
        for name in set(iterate(name or eps)) & set(eps):
            _load_ep(eps[name])

    # --- prescribed Paths for data

    @property
    def data_path(self) -> Path:
        """
        Return a path to where the dataset's data was/will be downloaded.
        """
        return self.base_path / self.name

    @property
    def source_path(self) -> Path:
        """
        Return a path to the directory where the data files included with
        the dataset live.
        """
        try:
            path = Path(inspect.getfile(self.__class__)).parent
        except (AttributeError, TypeError):
            path = Path(__file__)
        return path / self.name

    @property
    def _saved_data_path(self):
        """Load the saved data source path, else return None."""
        expected_path = self._path_to_saved_path_file
        if expected_path.exists():
            loaded_path = Path(expected_path.open("r").read())
            if loaded_path.exists():
                return loaded_path
        return None

    @property
    def _path_to_saved_path_file(self):
        """
        A path to the file which keeps track of where data are downloaded.
        """
        return self.source_path / self._saved_dataset_path_filename

    @property
    def _version_path(self):
        """A path to the saved version file."""
        return self.data_path / self._version_filename

    @property
    @lru_cache()
    def data_files(self) -> Tuple[Path, ...]:
        """
        Return a list of top-level files associated with the dataset.

        Hidden files are ignored.
        """
        file_iterator = self.source_path.glob("*")
        files = [x for x in file_iterator if not x.is_dir()]
        return tuple([x for x in files if not x.name.startswith(".")])

    @property
    def waveform_path(self) -> Path:
        """Return the path to the waveforms."""
        return self.data_path / "waveforms"

    @property
    def event_path(self) -> Path:
        """Return the path to the events."""
        return self.data_path / "events"

    @property
    def station_path(self) -> Path:
        """Return the path to the stations."""
        return self.data_path / "stations"

    # --- checks for if each type of data is downloaded

    @property
    def waveforms_need_downloading(self) -> bool:
        """
        Returns True if waveform data need to be downloaded.
        """
        return not self.waveform_path.exists()

    @property
    def events_need_downloading(self) -> bool:
        """
        Returns True if event data need to be downloaded.
        """
        return not self.event_path.exists()

    @property
    def stations_need_downloading(self) -> bool:
        """
        Returns True if station data need to be downloaded.
        """
        return not self.station_path.exists()

    @property
    @lru_cache()
    def waveform_client(self) -> Optional[WaveformClient]:
        """A cached property for a waveform client"""
        return self._load("waveform", self.waveform_path)

    @property
    @lru_cache()
    def event_client(self) -> Optional[EventClient]:
        """A cached property for an event client"""
        return self._load("event", self.event_path)

    @property
    @lru_cache()
    def station_client(self) -> Optional[StationClient]:
        """A cached property for a station client"""
        return self._load("station", self.station_path)

    @property
    @lru_cache()
    def _download_client(self):
        """
        Return an instance of the IRIS client, subclasses can override
        to use different clients.
        """
        return Client("IRIS")

    @_download_client.setter
    def _download_client(self, item):
        """just allow this to be overwritten"""
        self.__dict__["client"] = item

    def _log(self, msg):
        """Simple way to customize dataset logging."""
        print(msg)

    def create_sha256_hash(self, path=None, hidden=False) -> dict:
        """
        Create a sha256 hash of the dataset's data files.

        The output is stored in a simple json file. Keys are paths (relative
        to dataset base path) and values are files hashes.

        If you want to update/create the hash file in the dataset's source
        this can be done by passing the dataset's source_path as the path
        argument.

        Parameters
        ----------
        path
            The path to which the hash data is saved. If None use data_path.
        hidden
            If True also include hidden files.
        """
        kwargs = dict(exclude=self._hash_excludes, hidden=hidden)
        out = hash_directory(self.data_path, **kwargs)
        # sort dict to mess less with git
        sort_dict = OrderedDict(sorted(out.items()))
        # get path and dump json
        default_path = Path(self.data_path) / self._hash_filename
        _path = path or default_path
        hash_path = _path / self._hash_filename if _path.is_dir() else _path
        with hash_path.open("w") as fi:
            json.dump(sort_dict, fi, sort_keys=True, indent=2)
        return out

    def check_hashes(self, check_hash=False):
        """
        Check that the files are all there and have the correct Hashes.

        Parameters
        ----------
        check_hash
            If True check the hash of the files.

        Raises
        ------
        FileHashChangedError
            If one of the file hashes is not as expected.
        MissingDataFileError
            If one of the data files was not downloaded.
        """
        # If there is not a pre-existing hash file return
        hash_path = Path(self.data_path / self._hash_filename)
        if not hash_path.exists():
            return
        # get old and new hash, and overlaps
        old_hash = json.load(hash_path.open())
        current_hash = hash_directory(self.data_path,
                                      exclude=self._hash_excludes)
        overlap = set(old_hash) & set(current_hash) - set(self._hash_excludes)
        # get any files with new hashes
        has_changed = {x for x in overlap if old_hash[x] != current_hash[x]}
        missing = (set(old_hash) - set(current_hash)) - set(
            self._hash_excludes)
        if has_changed and check_hash:
            msg = (f"The hash for dataset {self.name} did not match the "
                   f"expected values for the following files:\n{has_changed}")
            raise FileHashChangedError(msg)
        if missing:
            msg = f"Dataset {self.name} is missing files: \n{missing}"
            raise MissingDataFileError(msg)

    def check_version(self) -> bool:
        """
        Check the version of the dataset.

        Verifies the version string in the dataset class definition matches
        the one saved on disk. Returns True if all is well else raises a
        DataVersionError.

        Raises
        ------
        DataVersionError
            If any version problems are discovered.
        """
        redownload_msg = f"Delete the following directory {self.data_path}"
        try:
            version = self.read_data_version()
        except (DataVersionError, ValueError):  # failed to read version
            need_dl = (getattr(self, f"{x}s_need_downloading")
                       for x in DATA_TYPES)
            if not any(need_dl):  # Something is a little weird
                warn(
                    "Version file is missing. Attempting to re-download the dataset."
                )
            return False
        # Check the version number
        if get_version_tuple(version) < get_version_tuple(self.version):
            msg = f"Dataset version is out of date: {version} < {self.version}. "
            raise DataVersionError(msg + redownload_msg)
        elif get_version_tuple(version) > get_version_tuple(self.version):
            msg = f"Dataset version mismatch: {version} > {self.version}."
            msg = msg + " It may be necessary to reload the dataset."
            warn(msg + redownload_msg)
        return True  # All is well. Continue.

    def write_version(self, path: Optional[Union[Path, str]] = None):
        """Write the version string to disk."""
        version_path = path or self._version_path
        with version_path.open("w") as fi:
            fi.write(self.version)

    def read_data_version(self,
                          path: Optional[Union[Path, str]] = None) -> str:
        """
        Read the data version from disk.

        Return the semantic version string (of the form x.y.z). Raise a
        DataVersionError if not found.
        """
        version_path = path or self._version_path
        if not version_path.exists():
            raise DataVersionError(f"{version_path} does not exist!")
        with version_path.open("r") as fi:
            version_str = fi.read()
        validate_version_str(version_str)
        return version_str

    # --- Abstract properties subclasses should implement
    @property
    @abc.abstractmethod
    def name(self) -> str:
        """
        Name of the dataset
        """

    @property
    @abc.abstractmethod
    def version(self) -> str:
        """
        Dataset version. Should be a str of the form x.y.z
        """

    @property
    def version_tuple(self) -> Tuple[int, int, int]:
        """
        Return the version string as a tuple of ints.
        """
        validate_version_str(self.version)
        vsplit = self.version.split(".")
        return int(vsplit[0]), int(vsplit[1]), int(vsplit[2])

    # --- Abstract methods subclasses should implement

    def download_events(self) -> None:
        """
        Method to ensure the events have been downloaded.

        Events should be written in an obspy-readable format to
        self.event_path. If not implemented this method will create an empty
        directory.
        """
        self.event_path.mkdir(exist_ok=True, parents=True)

    def download_waveforms(self) -> None:
        """
        Method to ensure waveforms have been downloaded.

        Waveforms should be written in an obspy-readable format to
        self.waveform_path.
        """
        self.waveform_path.mkdir(exist_ok=True, parents=True)

    def download_stations(self) -> None:
        """
        Method to ensure inventories have been downloaded.

        Station data should be written in an obspy-readable format to
        self.station_path. Since there is not yet a functional StationBank,
        this method must be implemented by subclass.
        """
        self.station_path.mkdir(exist_ok=True, parents=True)

    def pre_download_hook(self):
        """Code to run before any downloads."""

    def post_download_hook(self):
        """Code to run after any downloads."""

    def __str__(self):
        return f"Dataset: {self.name}"

    def __repr__(self):
        return f"{str(self)} with description: {self.__doc__}"
Example 9
)
from spype.core import wrap
from spype.core.sbase import _SpypeBase
from spype.exceptions import UnresolvedDependency, ExitTask
from spype.types import valid_input, compatible_instance
from spype.utils import (
    iterate,
    apply_partial,
    de_args_kwargs,
    copy_func,
    get_default_names,
    function_or_class_name,
)

_fixtures = {**dict.fromkeys(PYPE_FIXTURES), **dict.fromkeys(WRAP_FIXTURES)}
EMPTY_FIXTURES = MapProxy(_fixtures)

# --------------------------- Auxiliary tasks


class _RunControl:
    """ A class to control executing callbacks in task's run method """

    _hard_exit = False

    def __init__(self, task: "Task", _fixtures, _callbacks, _predicates, args,
                 kwargs):
        self.task = task
        # get fixtures passed in from wraps/pypes or use empty dicts
        _fixtures = _fixtures or EMPTY_FIXTURES
        self.meta = _fixtures.get("meta", {}) or {
Example 10
class PlotEventSpectra(VerticalWithSubPlots):
    """" Class for plotting event spectra. """

    colors = {"Noise": "b", "P": "r", "S": "g"}
    _source_funcs = MapProxy({})
    _source_kwargs = MapProxy({})

    def __init__(self, source_group, event_id, limit=None, stations=None):
        super().__init__()
        self.source_group = source_group
        self.freqs = source_group.data.columns
        event_id = self._get_event_id(source_group.data, event_id)
        df = self._get_filtered_df(abs(source_group.data), event_id)
        source_df = source_group.source_df
        # slice meta to only get same selection as df
        meta = source_group.meta.loc[df.index]
        # init a dict of subplots {(phase: seed_id): axis}
        fig, ax_dict = self._get_axis_dict(meta, event_id, limit, stations)

        # init partials of source_models if used
        self._get_source_funcs()

        for (sta, chan), ax in ax_dict.items():
            sub_meta = meta[(meta.station == sta) & (meta.channel == chan)]
            data = df.loc[sub_meta.index]
            self._plot_channel(ax, data, meta)
            # plot fitted dataframe if applicable
            if source_df is not None and not source_df.empty:
                self._plot_fitted_source(ax, source_df.loc[sub_meta.index], sub_meta)

        # iterate all axis and turn on legends
        for ax in ax_dict.values():
            ax.legend(loc=3)

    def _get_source_funcs(self):
        """ set _source_funcs and _source_kwargs """
        # set a few variables; bail out if there is no fitted df
        sg = self.source_group
        data = sg.data
        source_df = sg.source_df
        if source_df is None or source_df.empty:
            return

        from mopy.sourcemodels import source_spectrum, SOURCE_MODEL_PARAMS

        # get frequencies and model function partials
        freqs = data.columns
        #
        used_models = source_df.columns.get_level_values("model").unique()
        # create dict of partial source model functions
        funcs = {}
        wanted_kwargs = {}
        for model in used_models:
            source_params = SOURCE_MODEL_PARAMS[model]
            func = partial(source_spectrum, freqs=freqs, **source_params)
            funcs[model] = func
            wanted_kwargs[model] = set(inspect.signature(func).parameters)
        self._source_funcs = funcs
        self._source_kwargs = wanted_kwargs

    def _plot_fitted_source(self, ax, fit_df, meta):
        """ plot the fitted source spectra """
        used_models = fit_df.columns.get_level_values("model").unique()
        # filter out nulls
        fit_df = fit_df[~fit_df.isnull().any(axis=1)]

        # iterate each row that is not null
        for ind, row in fit_df.iterrows():
            phase = ind[0]
            color = self.colors[phase]
            for model in used_models:
                wanted_kwargs = self._source_kwargs[model]
                func = self._source_funcs[model]
                # get inputs to particular model function
                meta_dict = dict(meta.loc[ind[:2]])
                kwargs = dict(row.loc[model])
                kwargs.update(meta_dict)
                # get desired inputs from signature
                overlap = wanted_kwargs & set(kwargs)
                kwargs = {x: kwargs[x] for x in overlap}
                # calc spectra
                data = func(**kwargs)
                # get label and plot
                kwargs_str = "; ".join([f"{i}: {v:.2E}" for i, v in kwargs.items()])
                label = f"{model}_{phase}: {kwargs_str}"
                ax.plot(
                    self.freqs, data, color=color, linestyle="--", label=label
                )

    def _plot_channel(self, ax, data, meta):
        """ plot the channel data. """
        for ind, row in data.iterrows():
            meta_row = meta.loc[ind]
            phase = ind[0]
            color = self.colors.get(phase, "k")
            ax.loglog(row.index, row.values, label=phase, color=color)
            ax.set_title(ind[-1])
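
The class-level MapProxy({}) defaults above (_source_funcs, _source_kwargs) give every instance an empty, read-only mapping until _get_source_funcs replaces them, sidestepping the shared-mutable-default pitfall. A minimal sketch of the pattern (Plotter is a made-up stand-in):

from types import MappingProxyType as MapProxy

class Plotter:
    # shared class-level default; read-only, so no instance can mutate it
    _source_funcs = MapProxy({})

    def configure(self, funcs):
        # replace the attribute on the instance instead of mutating the default
        self._source_funcs = dict(funcs)

p = Plotter()
p.configure({"brune": lambda freqs: freqs})
print(len(Plotter._source_funcs), len(p._source_funcs))  # -> 0 1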
Example 11
ARRIVAL_COLUMNS = tuple(ARRIVAL_DTYPES)

# Waveform datatypes
WAVEFORM_DTYPES = OrderedDict(
    network=str,
    station=str,
    location=str,
    channel=str,
    starttime="datetime64[ns]",
    endtime="datetime64[ns]",
    sampling_period="timedelta64[ns]",
)

# The datatypes needed for putting waveform info into HDF5
WAVEFORM_DTYPES_INPUT = MapProxy(
    {i: _DATETIME_TYPE_MAP.get(v, v)
     for i, v in WAVEFORM_DTYPES.items()})

# keys used to identify UTC objects
UTC_KEYS = ("creation_time", "time", "reference")

# keys to pop out of a json object
JSON_KEYS_TO_POP = {"_id", "_summary"}

# seed id components
NSLC = ("network", "station", "location", "channel")

# the expected dimensions of the standard waveform array
DIMS = ("stream_id", "seed_id", "time")

# Small and BIG UTCDateTimes
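
A hedged usage sketch of what a dtype mapping like WAVEFORM_DTYPES is typically for: casting the columns of a waveform index DataFrame in one call. The example row is made up and sampling_period is omitted for brevity:

import pandas as pd

WAVEFORM_DTYPES = dict(
    network=str, station=str, location=str, channel=str,
    starttime="datetime64[ns]", endtime="datetime64[ns]",
)

df = pd.DataFrame([dict(network=1, station="BOB", location="",
                        channel="HHZ", starttime="2020-01-01",
                        endtime="2020-01-02")])
df = df.astype(WAVEFORM_DTYPES)
print(df.dtypes)  # object (str) for the NSLC columns, datetime64[ns] for times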
Example 12
class DataSet(abc.ABC):
    """
    Class for downloading and serving datasets.

    By default the data will be downloaded to obsplus' datasets module
    but this can be changed using the base_path argument. All data will
    be saved in base_path / name.

    Parameters
    ----------
    base_path
        The path to which the dataset will be saved as base_path / name.

    Attributes
    ----------
    source_path
        The path to the directory containing the source of DataSet code.
    """

    _entry_points = {}
    datasets = {}
    data_loaded = False
    # variables for hashing datafiles and versioning
    _version_filename = ".dataset_version.txt"
    _hash_filename = ".dataset_md5_hash.json"
    # the name of the file that saves where the data files were downloaded
    _saved_dataset_path_filename = ".dataset_data_path.txt"
    _hash_excludes = (
        "readme.txt",
        _version_filename,
        _hash_filename,
        _saved_dataset_path_filename,
    )

    # generic functions for loading data (WaveBank, event, stations)
    _load_funcs = MapProxy(
        dict(
            waveform=get_waveform_client,
            event=get_event_client,
            station=get_station_client,
        ))
    # flags to determine if data should be loaded into memory
    _load_waveforms = False
    _load_stations = True
    _load_events = True
    # cache for instantiated datasets
    _loaded_datasets = {}

    def __init_subclass__(cls, **kwargs):
        """ Register subclasses of datasets. """
        assert isinstance(cls.name, str), "name must be a string"
        cls._validate_version_str(cls.version)
        # Register the subclass as a dataset.
        DataSet.datasets[cls.name.lower()] = cls

    # --- logic for loading and caching data

    def __init__(self, base_path=None):
        """ download and load data into memory. """
        self.base_path = self._get_opsdata_path(base_path)
        # create the dataset's base directory
        self.data_path.mkdir(exist_ok=True, parents=True)
        # run the download logic if needed
        self._run_downloads()
        # cache loaded dataset
        self.data_loaded = True
        if not base_path and self.name not in self._loaded_datasets:
            self._loaded_datasets[self.name] = self.copy(deep=True)

    def _get_opsdata_path(self, opsdata_path: Optional[Path] = None) -> Path:
        """
        Get the location where datasets are stored.

        Uses the following priorities:

        1. Provided Path via opsdata_path
        2. .data_path.txt file stored in the data source
        3. An environment variable named OPSDATA_PATH
        4. The opsdata_path variable from obsplus.constants

        Returns
        -------
        A path to the opsdata directory.
        """
        if opsdata_path is None:
            opsdata_path = getattr(self._saved_data_path, "parent", None)
            if opsdata_path is None:
                # next look for env variable
                opsdata_path_default = obsplus.constants.OPSDATA_PATH
                opsdata_path = os.getenv("OPSDATA_PATH", opsdata_path_default)
        # ensure the data path exists
        _create_opsdata(opsdata_path)
        return Path(opsdata_path)

    def _run_downloads(self):
        """ Iterate each kind of data and download if needed. """
        # Make sure the version of the dataset is okay
        version_ok = self.check_version()
        downloaded = False
        for what in DATA_TYPES:
            needs_str = f"{what}s_need_downloading"
            if getattr(self, needs_str) or (not version_ok):
                # this is the first type of data to be downloaded, run hook
                # and copy data from data source.
                if not downloaded and self.source_path.exists():
                    copy_tree(str(self.source_path), str(self.data_path))
                    self.pre_download_hook()
                downloaded = True
                # download data, test termination criteria
                print(f"downloading {what} data for {self.name} dataset ...")
                getattr(self, "download_" + what + "s")()
                assert not getattr(self, needs_str), f"Download {what} failed"
                print(f"finished downloading {what} data for {self.name}")
                self._write_readme()  # make sure readme has been written
        # some data were downloaded, call post download hook
        if downloaded:
            self.check_hashes()
            self.post_download_hook()
            # write a new version file
            self.write_version()
            # write out a new saved datafile path
            self._save_data_path()

    def _load(self, what, path):
        """ Load the client-like objects from disk. """
        try:
            client = self._load_funcs[what](path)
        except TypeError:
            warn(f"failed to load {what} from {path}, returning None")
            return None
        # load data into memory (eg load event bank contents into catalog)
        if getattr(self, f"_load_{what}s"):
            return getattr(client, f"get_{what}s")()
        else:
            return client

    def copy(self, deep=True):
        """
        Return a copy of the dataset.
        """
        return copy.deepcopy(self) if deep else copy.copy(self)

    def copy_to(self, destination: Optional[Union[str, Path]] = None):
        """
        Copy the dataset to a destination.

        Parameters
        ----------
        destination
            The destination to copy the dataset. It will be created if it
            doesn't exist. If None is provided use tmpfile to create a temporary
            directory.

        Returns
        -------
        A new dataset object which refers to the copied files.
        """
        return copy_dataset(self, destination)

    def get_fetcher(self) -> "obsplus.Fetcher":
        """
        Return a Fetcher from the data.
        """
        assert self.data_loaded, "data have not been loaded into memory"
        # get events/waveforms/stations and put into dict for the Fetcher
        fetch_kwargs = {
            "waveforms": self.waveform_client,
            "events": self.event_client,
            "stations": self.station_client,
        }
        return obsplus.Fetcher(**fetch_kwargs)

    __call__ = get_fetcher

    def _write_readme(self, filename="readme.txt"):
        """ Writes the classes docstring to a file. """
        path = self.data_path / filename
        if not path.exists():
            with path.open("w") as fi:
                fi.write(str(self.__doc__))

    def _save_data_path(self, path=None):
        """ Save the path to where the data where downloaded in source folder. """
        path = Path(path or self._path_to_saved_path_file)
        with path.open("w") as fi:
            fi.write(str(self.data_path))

    @classmethod
    def load_dataset(cls, name: Union[str, "DataSet"]) -> "DataSet":
        """
        Get a loaded dataset.

        Will ensure all files are downloaded and the appropriate data are
        loaded into memory.

        Parameters
        ----------
        name
            The name of the dataset to load or a DataSet object. If a DataSet
            object is passed a copy of it will be returned.
        """
        if isinstance(name, DataSet):
            return name.copy()
        name = name.lower()
        if name not in cls.datasets:
            # The dataset has not been discovered; try to load entry points
            cls._load_dataset_entry_points(name)
            if name in cls._entry_points:
                cls._entry_points[name].load()
                return load_dataset(name)
            msg = f"{name} is not in the known datasets {list(cls.datasets)}"
            raise ValueError(msg)
        if name in cls._loaded_datasets:
            # The dataset has already been loaded, simply return a copy
            return cls._loaded_datasets[name].copy()
        else:  # The dataset has been discovered but not loaded; just load it
            return cls.datasets[name]()

    def delete_data_directory(self):
        """ Delete the datafiles of a dataset. """
        dataset = DataSet.load_dataset(self)
        shutil.rmtree(dataset.data_path)

    @classmethod
    def _load_dataset_entry_points(cls, name=None):
        """ load and cache the dataset entry points. """
        look_for_name = name is not None and name not in cls._entry_points
        if not cls._entry_points or look_for_name:
            for ep in pkg_resources.iter_entry_points("obsplus.datasets"):
                cls._entry_points[ep.name] = ep

    # --- prescribed Paths for data

    @property
    def data_path(self) -> Path:
        """
        Return a path to where the dataset's data was/will be downloaded.
        """
        return self.base_path / self.name

    @property
    def source_path(self) -> Path:
        """
        Return a path to the directory where the data files included with
        the dataset live.
        """
        try:
            path = Path(inspect.getfile(self.__class__)).parent
        except (AttributeError, TypeError):
            path = Path(__file__)
        return path / self.name

    @property
    def _saved_data_path(self):
        """ Load the saved data source path, else return None """
        expected_path = self._path_to_saved_path_file
        if expected_path.exists():
            loaded_path = Path(expected_path.open("r").read())
            if loaded_path.exists():
                return loaded_path
        return None

    @property
    def _path_to_saved_path_file(self):
        """
        A path to the file which keeps track of where data are downloaded.
        """
        return self.source_path / self._saved_dataset_path_filename

    @property
    def _version_path(self):
        """ A path to the saved version file. """
        return self.data_path / self._version_filename

    @property
    @lru_cache()
    def data_files(self) -> Tuple[Path, ...]:
        """
        Return a list of top-level files associated with the dataset.

        Hidden files are ignored.
        """
        file_iterator = self.source_path.glob("*")
        files = [x for x in file_iterator if not x.is_dir()]
        return tuple([x for x in files if not x.name.startswith(".")])

    @property
    def waveform_path(self) -> Path:
        return self.data_path / "waveforms"

    @property
    def event_path(self) -> Path:
        return self.data_path / "events"

    @property
    def station_path(self) -> Path:
        return self.data_path / "stations"

    # --- checks for if each type of data is downloaded

    @property
    def waveforms_need_downloading(self):
        """
        Returns True if waveform data need to be downloaded.
        """
        return not self.waveform_path.exists()

    @property
    def events_need_downloading(self):
        """
        Returns True if event data need to be downloaded.
        """
        return not self.event_path.exists()

    @property
    def stations_need_downloading(self):
        """
        Returns True if station data need to be downloaded.
        """
        return not self.station_path.exists()

    @property
    @lru_cache()
    def waveform_client(self) -> Optional[WaveBank]:
        """ A cached property for a waveform client """
        return self._load("waveform", self.waveform_path)

    @property
    @lru_cache()
    def event_client(self) -> Optional[EventBank]:
        """ A cached property for an event client """
        return self._load("event", self.event_path)

    @property
    @lru_cache()
    def station_client(self) -> Optional[obspy.Inventory]:
        """ A cached property for a station client """
        return self._load("station", self.station_path)

    @property
    @lru_cache()
    def _download_client(self):
        """
        Return an instance of the IRIS client, subclasses can override
        to use different clients.
        """
        return Client("IRIS")

    @_download_client.setter
    def _download_client(self, item):
        """ just allow this to be overwritten """
        self.__dict__["client"] = item

    def create_md5_hash(self, path=_hash_filename, hidden=False) -> dict:
        """
        Create an md5 hash of all dataset's files to ensure dataset integrity.

        Keys are paths (relative to dataset base path) and values are md5
        hashes.

        Parameters
        ----------
        path
            The path to which the hash data is saved. If None don't save.
        hidden
            If True also include hidden files.
        """
        out = md5_directory(self.data_path,
                            exclude="readme.txt",
                            hidden=hidden)
        if path is not None:
            # sort dict to mess less with git
            sort_dict = OrderedDict(sorted(out.items()))
            with (self.data_path / Path(path)).open("w") as fi:
                json.dump(sort_dict, fi)
        return out

    def check_hashes(self, check_hash=False):
        """
        Check that the files are all there and have the correct Hashes.

        Parameters
        ----------
        check_hash
            If True check the hash of the files.

        Raises
        ------
        FileHashChangedError
            If check_hash is True and one of the file hashes is not as expected.
        MissingDataFileError
            If one of the data files was not downloaded.
        """
        # TODO figure this out (data seem to have changed on IRIS' end)
        # If there is not a pre-existing hash file return
        hash_path = Path(self.data_path / self._hash_filename)
        if not hash_path.exists():
            return
        # get old and new hash, and overlaps
        old_hash = json.load(hash_path.open())
        current_hash = md5_directory(self.data_path,
                                     exclude=self._hash_excludes)
        overlap = set(old_hash) & set(current_hash) - set(self._hash_excludes)
        # get any files with new hashes
        has_changed = {x for x in overlap if old_hash[x] != current_hash[x]}
        missing = (set(old_hash) - set(current_hash)) - set(
            self._hash_excludes)
        if has_changed and check_hash:
            msg = (f"The md5 hash for dataset {self.name} did not match the "
                   f"expected values for the following files:\n{has_changed}")
            raise FileHashChangedError(msg)
        if missing:
            msg = f"The following files are missing: \n{missing}"
            raise MissingDataFileError(msg)

    def check_version(self):
        """
        Check the version of the dataset.

        Verifies the version string in the dataset class definition matches
        the one saved on disk.

        Raises
        ------
        DataVersionError
            If any version problems are discovered.

        Returns
        -------
        version_ok : bool
            True if the version matches what is expected.
        """
        redownload_msg = f"Delete the following directory: {self.data_path}"
        try:
            version = self.read_data_version()
        except DataVersionError:  # The data version cannot be read from disk
            need_dl = (getattr(self, f"{x}s_need_downloading")
                       for x in DATA_TYPES)
            if not any(need_dl):  # Something is a little weird
                warn(
                    "Version file is missing. Attempting to re-download the dataset."
                )
            return False
        # Check the version number
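        # (note: this is a lexicographic string comparison; it is assumed the
        # version components stay single-digit so the ordering is correct)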
        if version < self.version:
            msg = f"Dataset version is out of date: {version} < {self.version}. "
            raise DataVersionError(msg + redownload_msg)
        elif version > self.version:
            msg = f"Dataset version mismatch: {version} > {self.version}."
            msg = msg + " It may be necessary to reload the dataset."
            warn(msg + redownload_msg)
        return True  # All is well. Continue.

    def write_version(self):
        """ Write the version string to disk. """
        version_path = self._version_path
        with version_path.open("w") as fi:
            fi.write(self.version)

    def read_data_version(self):
        """
        Read the data version from disk.

        Raise a DataVersionError if not found.
        """
        version_path = self._version_path
        if not version_path.exists():
            raise DataVersionError(f"{version_path} does not exist!")
        with version_path.open("r") as fi:
            version_str = fi.read()
        self._validate_version_str(version_str)
        return version_str
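
    # Round-trip sketch (descriptive only): write_version() stores self.version
    # at self._version_path, and read_data_version() reads it back, validates
    # the "x.y.z" form, and raises DataVersionError if the file is missing or
    # the string is malformed.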

    @staticmethod
    def _validate_version_str(version_str):
        """
        Check the version string is of the form x.y.z.

        If the version string is not valid raise DataVersionError.
        """
        is_str = isinstance(version_str, str)
        has_3 = len(version_str.split(".")) == 3
        if not (is_str and has_3):
            msg = f"version must be a string of the form x.y.z, not {version_str}"
            raise DataVersionError(msg)

    # --- Abstract properties subclasses should implement
    @property
    @abc.abstractmethod
    def name(self) -> str:
        """
        Name of the dataset
        """

    @property
    @abc.abstractmethod
    def version(self) -> str:
        """
        Dataset version. Should be a str of the form x.y.z
        """

    # --- Abstract methods subclasses should implement

    def download_events(self) -> None:
        """
        Method to ensure the events have been downloaded.

        Events should be written in an obspy-readable format to
        self.event_path. If not overridden, this method simply creates an
        empty directory.
        """
        self.event_path.mkdir(exist_ok=True, parents=True)

    def download_waveforms(self) -> None:
        """
        Method to ensure waveforms have been downloaded.

        Waveforms should be written in an obspy-readable format to
        self.waveform_path.
        """
        self.waveform_path.mkdir(exist_ok=True, parents=True)

    @abc.abstractmethod
    def download_stations(self) -> None:
        """
        Method to ensure inventories have been downloaded.

        Station data should be written in an obspy-readable format to
        self.station_path. Since there is not yet a functional StationBank,
        this method must be implemented by each subclass.
        """

    def pre_download_hook(self):
        """ Code to run before any downloads. """

    def post_download_hook(self):
        """ code to run after any downloads. """

    def __str__(self):
        return f"Dataset: {self.name}"

    def __repr__(self):
        return f"{str(self)} with description: {self.__doc__}"
Esempio n. 13
0
class _Bank(ABC):
    """
    The abstract base class for ObsPlus' banks.

    Used to access local archives in a client-like fashion.
    """

    # hdf5 compression defaults
    _complib = "blosc"
    _complevel = 9
    # attributes subclasses need to define
    ext = ""
    bank_path: Path = ""
    namespace = ""
    index_name = ".index.h5"  # name of index file
    executor = None  # an executor for using parallelism
    # optional str defining the directory structure and file name schemes
    path_structure = None
    name_structure = None
    # the minimum obsplus version. If not met, delete the index and re-index.
    # Bump this when the database schema changes.
    _min_version = "0.0.3"
    # status bar attributes
    _bar_update_interval = 50  # number of files before updating bar
    _min_files_for_bar = 100  # min number of files before the bar is enabled
    _read_func: callable  # function for reading datatype
    # required dtypes for input to storage layer
    _dtypes_input: Mapping = MapProxy({})
    # required dtypes for output from bank
    _dtypes_output: Mapping = MapProxy({})
    # the index cache (can greatly reduce IO efforts)
    _index_cache: Optional[_IndexCache] = None

    @abstractmethod
    def read_index(self, **kwargs) -> pd.DataFrame:
        """Read the index filtering on various params."""

    @abstractmethod
    def update_index(self: BankType) -> BankType:
        """Update the index."""

    @property
    @abstractmethod
    def last_updated_timestamp(self) -> Optional[float]:
        """
        Get the last modified time stored in the index.

        If not available return None.
        """

    @property
    def last_updated(self) -> Optional[np.datetime64]:
        """
        Get the last time (UTC) that the bank was updated.
        """
        return to_datetime64(self.last_updated_timestamp)

    @abstractmethod
    def _read_metadata(self) -> pd.DataFrame:
        """Return a dictionary of metadata."""

    # --- path/node related objects

    @property
    def index_path(self):
        """Return the expected path to the index file."""
        return Path(self.bank_path) / self.index_name

    @property
    def _index_node(self):
        """Return the node/table where the index information is stored."""
        return "/".join([self.namespace, "index"])

    @property
    def _index_version(self) -> str:
        """Get the version of obsplus used to create the index."""
        return self._read_metadata()["obsplus_version"].iloc[0]

    @property
    def _time_node(self):
        """The node/table where the update time information is stored."""
        return "/".join([self.namespace, "last_updated"])

    @property
    def _meta_node(self):
        """The node/table where the update metadata is stored."""
        return "/".join([self.namespace, "metadata"])

    @property
    def _version_or_none(self) -> Optional[str]:
        """Return the version string or None if it doesn't yet exist."""
        try:
            version = self._index_version
        except (FileNotFoundError, DatabaseError):
            return
        return version

    def _enforce_min_version(self):
        """
        Check version of obsplus used to create index and delete index if the
        minimum version requirement is not met.
        """
        version = self._version_or_none
        if version is not None:
            min_version_tuple = get_version_tuple(self._min_version)
            version_tuple = get_version_tuple(version)
            if min_version_tuple > version_tuple:
                msg = (
                    f"The indexing schema has changed since {self._min_version}; "
                    f"the index will be recreated."
                )
                warnings.warn(msg)
                os.remove(self.index_path)

    def _warn_on_newer_version(self):
        """
        Issue a warning if the bank was created by a newer version of obsplus.

        If this is the case, there is no guarantee it will work.
        """
        version = self._version_or_none
        if version is not None:
            obsplus_version = get_version_tuple(obsplus.__last_version__)
            bank_version = get_version_tuple(version)
            if bank_version > obsplus_version:
                msg = (
                    f"The bank was created with a newer version of ObsPlus "
                    f"({version}) than the one you are running "
                    f"({obsplus.__last_version__}). You may encounter problems; "
                    f"consider updating ObsPlus."
                )
                warnings.warn(msg)

    def _unindexed_iterator(self, paths: Optional[bank_subpaths_type] = None):
        """Return an iterator of potential unindexed files."""
        # get mtime, subtract a bit to avoid odd bugs
        mtime = None
        last_updated = self.last_updated_timestamp  # this needs db so only call once
        if last_updated is not None:
            mtime = last_updated - 0.001
        # get paths to iterate
        bank_path = self.bank_path
        if paths is None:
            paths = self.bank_path
        else:
            paths = [
                f"{self.bank_path}/{x}" if str(bank_path) not in str(x) else str(x)
                for x in iterate(paths)
            ]
        # return file iterator
        return iter_files(paths, ext=self.ext, mtime=mtime)

    def _measure_iterator(self, iterable: Iterable, bar: Optional[ProgressBar] = None):
        """
        A generator to yield un-indexed files and update the progress bar.

        Parameters
        ----------
        iterable
            Any iterable to yield.
        bar
            Any object which has an 'update' method.
        """
        # get progress bar
        bar = self.get_progress_bar(bar)
        # get the iterator
        for num, obj in enumerate(iterable):
            # update bar if count is in update interval
            if bar is not None and num % self._bar_update_interval == 0:
                bar.update(num)
            yield obj
        # finish progress bar
        getattr(bar, "finish", lambda: None)()  # call finish if bar exists

    def _make_meta_table(self):
        """ get a dataframe of meta info """
        meta = dict(
            path_structure=self.path_structure,
            name_structure=self.name_structure,
            obsplus_version=obsplus.__last_version__,
        )
        return pd.DataFrame(meta, index=[0])

    def get_service_version(self):
        """ Return the version of obsplus used to create index. """
        return self._index_version

    def ensure_bank_path_exists(self, create=False):
        """
        Ensure the bank_path exists, else raise a BankDoesNotExistError.

        If create is True, simply create the bank.
        """
        path = Path(self.bank_path)
        if create:
            path.mkdir(parents=True, exist_ok=True)
        if not path.is_dir():
            msg = f"{path} is not a directory, cant read bank"
            raise BankDoesNotExistError(msg)

    def get_progress_bar(self, bar=None) -> Optional[ProgressBar]:
        """
        Return a progress bar instance based on the bar parameter.

        If bar is False, return None.
        If bar is None, return the default progress bar.
        If bar is a ProgressBar subclass, instantiate it with min/max values.
        If bar is a ProgressBar instance, return it as-is.
        """
        # conditions to bail out early
        if bar is False:  # False indicates no bar is to be used
            return None
        elif isinstance(bar, ProgressBar):  # bar is already instantiated
            return bar
        # next, count number of files
        num_files = sum(1 for _ in self._unindexed_iterator())
        if num_files < self._min_files_for_bar:  # not enough files to use bar
            return None
        # instantiate bar and return
        kwargs = {"min_value": self._min_files_for_bar, "max_value": num_files}
        # a ProgressBar subclass was passed; instantiate it
        if isinstance(bar, type) and issubclass(bar, ProgressBar):
            return bar(**kwargs)
        elif bar is None:
            return get_progressbar(**kwargs)
        else:
            msg = f"{bar} is not a valid input for get_progress_bar"
            raise ValueError(msg)
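
    # Illustrative dispatch of get_progress_bar (``bank`` stands in for any
    # concrete bank instance; only the cases coded above are shown):
    #
    #   bank.get_progress_bar(False)         # -> None, bar disabled
    #   bank.get_progress_bar(None)          # -> default bar
    #   bank.get_progress_bar(ProgressBar)   # -> ProgressBar(min_value=..., max_value=...)
    #   bank.get_progress_bar(bar_instance)  # -> returned unchanged
    #
    # (the None and class cases return None instead when there are fewer than
    # _min_files_for_bar unindexed files)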

    def clear_cache(self):
        """
        Clear the index cache if the bank is using one.
        """
        if self._index_cache is not None:
            self._index_cache.clear_cache()

    @property
    def _max_workers(self):
        """
        Return the max number of workers allowed by the executor.

        If the executor has no `_max_workers` attribute, use the number of
        CPUs instead. If no executor is assigned to the bank instance,
        return 1.
        """
        executor = getattr(self, "executor", None)
        if executor is not None:
            return getattr(executor, "_max_workers", CPU_COUNT)
        return 1

    def _map(self, func, args, chunksize=1):
        """
        Map args onto func, using the executor if defined, else run
        in serial.
        """
        if self.executor is not None:
            return self.executor.map(func, args, chunksize=chunksize)
        else:
            return (func(x) for x in args)
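
    # Sketch of enabling parallelism (an assumption; ``bank`` and ``func`` are
    # hypothetical): any concurrent.futures executor assigned to the instance
    # makes _map fan the work out, otherwise it falls back to a serial generator.
    #
    #   from concurrent.futures import ProcessPoolExecutor
    #   bank.executor = ProcessPoolExecutor()
    #   results = list(bank._map(func, paths, chunksize=10))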

    @classmethod
    def load_example_bank(
        cls: BankType,
        dataset: str = "default_test",
        path: Optional[Union[str, Path]] = None,
    ) -> BankType:
        """
        Create an example bank which is safe to modify.

        Copies relevant files from a dataset to a specified path, or a
        temporary directory if None is specified.

        Parameters
        ----------
        dataset
            The name of the dataset.
        path
            The path to which the dataset files will be copied. If None,
            a temporary directory is used.
        """
        # determine which directory in the dataset this bank needs
        data_types = {
            obsplus.EventBank: "event_path",
            obsplus.StationBank: "station_path",
            obsplus.WaveBank: "waveform_path",
        }
        ds = obsplus.load_dataset(dataset)
        destination = Path(tempfile.mkdtemp() if path is None else path) / "temp"
        assert cls in data_types, f"{cls} Bank type not supported."
        path_to_copy = getattr(ds, data_types[cls])
        shutil.copytree(path_to_copy, destination)
        return cls(destination)
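
    # Hedged usage sketch: copy the default test dataset's waveform directory
    # into a temporary location and index it (assumes obsplus.WaveBank is one
    # of the concrete subclasses mapped above):
    #
    #   bank = obsplus.WaveBank.load_example_bank()
    #   bank.update_index()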

    def __repr__(self):
        """Return the class name with bank path."""
        name = type(self).__name__
        return f"{name}(base_path={self.bank_path})"

    __str__ = __repr__