Esempio n. 1
0
 def test_union_with_empty_sub_dict(self):
     """in-place union with a dict whose sub-dict is empty leaves target unchanged"""
     # the right-hand operand only carries an empty "title" sub-dict
     empty_sub = UnionDict({"title": {}})
     target = UnionDict({"title": {"text": "Alignment Position"}})
     # snapshot of the target taken before the union is applied
     expected = UnionDict(target.copy())
     target |= empty_sub
     self.assertEqual(target, expected)
Esempio n. 2
0
 def test_union_value_dict(self):
     """union of a plain value with a dict under the same key: the dict is expected"""
     nested = UnionDict({"A": {"B": "Blah"}})
     flat = UnionDict({"A": "Blah"})
     # start from a copy of the nested form, then union the flat form into it
     merged = UnionDict(nested.copy())
     merged |= flat
     self.assertNotEqual(nested, merged)
     # union in the other direction: flat form receives the nested form
     flat |= nested
     self.assertEqual(nested, flat)
Esempio n. 3
0
class ReadOnlyDataStoreBase:
    """a read only data store

    Subclasses must override the ``members`` property and the ``open``
    method; both raise NotImplementedError here.
    """

    store_suffix = None

    def __init__(self,
                 source,
                 suffix=None,
                 limit=None,
                 verbose=False,
                 md5=True):
        """
        Parameters
        ----------
        source
            path to directory / zip file. Forced to end with store_suffix.
        suffix
            only members whose name matches the suffix are considered included
        limit
            the maximum number of members to consider
        verbose
            displays files that don't match search (applies only to the Zipped
            variant)
        md5 : bool
            record md5 hexadecimal checksum of read data when possible
        """
        # assuming delimiter is /

        # todo this approach to caching persistent arguments for reconstruction
        # is fragile. Need an inspect module based approach
        # The constructor arguments are recorded, as given, so that
        # __getstate__ / __setstate__ can rebuild the instance.
        self._persistent = UnionDict(
            dict(source=source,
                 suffix=suffix,
                 limit=limit,
                 verbose=verbose,
                 md5=md5))

        source = str(source)
        suffix = suffix or ""
        if suffix != "*":  # wild card search for all
            suffix = re.sub(r"^[\s.*]+", "", suffix)  # tidy the suffix
        source = re.sub(r"/+$", "", source)  # tidy the source

        self.suffix = suffix
        if self.store_suffix and not source.endswith(self.store_suffix):
            source = ".".join([source, self.store_suffix])
        self.source = str(pathlib.Path(source).expanduser())
        self.mode = "r"
        self._members = []
        self.limit = limit
        self._verbose = verbose
        self._md5 = md5
        self._checksums = {}

    def __getstate__(self):
        """returns a copy of the recorded constructor arguments"""
        return self._persistent.copy()

    def __setstate__(self, data):
        """rebuilds state by constructing a new instance from data"""
        new = self.__class__(**data)
        self.__dict__.update(new.__dict__)
        return self

    def __repr__(self):
        num = len(self)
        if num > 3:
            # show only the first three members, replacing the closing
            # bracket of the list display with an ellipsis
            sample = str(list(self[:3]))
            sample = f"{sample[:-1]}..."
        else:
            sample = list(self)

        name = self.__class__.__name__
        return f"{num}x member {name}(source='{self.source}', members={sample})"

    def __str__(self):
        return str(list(self))

    def head(self, n=5):
        """displays top n members"""
        pprint(self[:n])

    def tail(self, n=5):
        """displays last n members"""
        pprint(self[-n:])

    def __iter__(self):
        """yields members, converting entries to DataStoreMember on demand"""
        for i, member in enumerate(self.members):
            if not isinstance(member, DataStoreMember):
                member = DataStoreMember(self.get_absolute_identifier(member),
                                         self)
                # store the converted member back so it is only built once
                self.members[i] = member
            yield member

    def __getitem__(self, index):
        return self.members[index]

    def __len__(self):
        return len(self.members)

    def __contains__(self, identifier):
        """whether relative identifier has been stored"""
        if isinstance(identifier, DataStoreMember):
            return identifier.parent is self

        if not identifier.endswith(self.suffix):
            suffix = pathlib.Path(identifier).suffix
            # possible an "added" file
            if self.store_suffix == "zip":
                klass = ReadOnlyZippedDataStore
            else:
                klass = ReadOnlyDirectoryDataStore
            # look the identifier up in a store over the same source that is
            # restricted to the identifier's own suffix
            new = klass(self.source, suffix=suffix)
            return identifier in new

        identifier = self.get_relative_identifier(identifier)
        return any(identifier in member for member in self.members)

    def get_member(self, identifier):
        """returns DataStoreMember

        Returns the first entry of members that contains the relative form
        of identifier, or None if there is no such entry.
        """
        identifier = self.get_relative_identifier(identifier)
        for member in self.members:
            if identifier in member:
                return member
        return None

    @staticmethod
    def _strip_zip_suffix(path):
        """returns path without a trailing '.zip'

        Only the suffix is removed, '.zip' occurring elsewhere in the path
        is left untouched.
        """
        zip_suffix = ".zip"
        if path.endswith(zip_suffix):
            return path[:-len(zip_suffix)]
        return path

    def get_relative_identifier(self, identifier):
        """returns the identifier relative to store root path"""
        if isinstance(identifier,
                      DataStoreMember) and identifier.parent is self:
            return identifier

        identifier = os.path.basename(identifier)
        source = self.source
        if source.endswith(".zip"):
            # we insert the source path into identifier name
            # for zip members to ensure inflation creates a directory
            # containing them
            source = os.path.basename(self._strip_zip_suffix(source))
            return f"{source}{os.sep}{identifier}"

        return pathlib.Path(identifier).name

    def get_absolute_identifier(self, identifier, from_relative=False):
        """returns the identifier prefixed by the store source path

        Parameters
        ----------
        identifier
            member identifier
        from_relative : bool
            if True, identifier is taken to already be in relative form and
            get_relative_identifier is not applied
        """
        if not from_relative:
            identifier = self.get_relative_identifier(identifier)
        source = self._strip_zip_suffix(self.source)
        if isinstance(identifier, DataStoreMember):
            identifier = identifier.name
        elif not identifier.startswith(source):
            identifier = f"{source}{os.sep}{identifier}"
        return identifier

    def read(self, identifier):
        """reads data corresponding to identifier

        If md5 recording is enabled, the checksum of the data is stored
        under identifier.
        """
        if isinstance(identifier,
                      DataStoreMember) and identifier.parent is self:
            identifier = identifier.name
        source = self.open(identifier)
        try:
            data = source.read()
        finally:
            # always release the handle, even when reading fails
            source.close()

        if self._md5:
            self._checksums[identifier] = get_text_hexdigest(data)
        return data

    @property
    def members(self):
        raise NotImplementedError  # override in subclasses

    def open(self, identifier):
        raise NotImplementedError  # override in subclasses

    def filtered(self, pattern=None, callback=None):
        """returns list of members for which callback returns True

        If pattern is provided it takes precedence, and members matching
        it (via fnmatch) are returned instead.
        """
        assert any([callback, pattern]), "Must provide a pattern or a callback"
        if pattern:
            result = [m for m in self if fnmatch(m, pattern)]
        else:
            result = [m for m in self if callback(m)]
        return result

    def md5(self, identifier, force=True):
        """
        Parameters
        ----------
        identifier
            name of data store member
        force : bool
            forces reading of data if not already done

        Returns
        -------
        md5 checksum for the member, if available, None otherwise
        """
        md5_setting = self._md5  # for restoring automatic md5 calc setting
        absoluteid = self.get_absolute_identifier(identifier)
        if force and absoluteid not in self._checksums:
            self._md5 = True
            try:
                self.read(absoluteid)
            finally:
                # restore the setting even if the read fails
                self._md5 = md5_setting

        return self._checksums.get(absoluteid, None)