Example #1
0
    def _get_traj_info(self, filename):
        """Build trajectory info for all datasets in an HDF5 file matching the selection.

        The selection pattern is looked up per file (``self.selection[filename]``);
        if that lookup raises ``KeyError`` or ``TypeError``, ``self.selection``
        itself is used as the pattern. Every ``h5py.Dataset`` whose name matches
        the pattern (``re.match``, i.e. anchored at the start of the name) is
        registered in ``self._itraj_dataset_mapping`` under a running counter.

        Parameters
        ----------
        filename : str
            Path of the HDF5 file to inspect.

        Returns
        -------
        TrajInfo
            Info of the first matching dataset, with the infos of the remaining
            matches attached as ``children``. If nothing matched, a warning is
            logged and ``TrajInfo(-1, 0)`` is returned.
        """
        # NOTE(review): 'tables' is not used directly in this method; presumably
        # it is imported for its side effects - confirm before removing.
        # noinspection PyUnresolvedReferences
        import tables
        import h5py
        import re

        with h5py.File(filename, mode='r') as f:
            try:
                sel = self.selection[filename]
            except (KeyError, TypeError):
                sel = self.selection

            matched_paths = []

            def collect_matching_dataset(_, obj):
                # Only datasets are of interest; groups are skipped.
                if not isinstance(obj, h5py.Dataset):
                    return
                if re.match(sel, obj.name) is not None:
                    matched_paths.append(obj.name)

            # The callback always returns None, so the whole file is visited.
            f.visititems(collect_matching_dataset)

            if not matched_paths:
                self.logger.warning(
                    'selection "%s" did not match any group/dataset in file "%s"',
                    sel, filename)

            children = []
            for path in matched_paths:
                h5_item = f[path]
                # dry=True: only the resulting 2d shape is needed here.
                _, shape_2d = self._reshape(h5_item, dry=True)
                lengths, ndim = shape_2d
                self._itraj_dataset_mapping[self._itraj_counter] = (filename,
                                                                    path)
                self._itraj_counter += 1
                children.append(TrajInfo(ndim, lengths))

        if children:
            t = children[0]
            t.children = children[1:]
        else:
            t = TrajInfo(-1, 0)
        return t
    def _create_traj_info(row):
        """Convert a database row to a TrajInfo object.

        The row is read positionally: hash, length, ndim, offsets, absolute
        path and version (indices 0 to 5). Only version 2 rows are understood.

        Parameters
        ----------
        row : sequence
            Indexable database row with at least six entries; entry 3 (the
            offsets) has to be a ``numpy.ndarray``.

        Returns
        -------
        TrajInfo
            Object populated from the row.

        Raises
        ------
        UnknownDBFormatException
            If anything goes wrong while interpreting the row (too short,
            offsets of the wrong type, unknown version, ...). The original
            error is logged and passed on as the exception argument.
        """
        try:
            hash_value = row[0]
            length = row[1]
            ndim = row[2]
            offsets = row[3]
            # Explicit check instead of ``assert`` so the validation survives
            # running the interpreter with -O.
            if not isinstance(offsets, np.ndarray):
                raise TypeError("offsets have to be a numpy.ndarray, got %s"
                                % type(offsets).__name__)
            abs_path = row[4]
            version = row[5]

            info = TrajInfo()
            info._version = version
            if version != 2:
                raise ValueError("unknown version %s" % version)

            info._hash = hash_value
            info._ndim = ndim
            info._length = length
            info._offsets = offsets
            info._abs_path = abs_path
            return info
        except Exception as ex:
            logger.exception(ex)
            raise UnknownDBFormatException(ex)
Example #3
0
    def _get_traj_info(self, filename):
        """Open a trajectory with mdtraj and report dimension, length and offsets.

        The dimension is taken from the second axis of the first frame read;
        the offsets come from the file handle if it exposes an ``offsets``
        attribute, otherwise an empty list is used.
        """
        with mdtraj.open(filename, mode='r') as traj_file:
            n_frames = len(traj_file)
            first_frame = traj_file.read(1)[0]
            n_dims = np.shape(first_frame)[1]

            if hasattr(traj_file, 'offsets'):
                frame_offsets = traj_file.offsets
            else:
                frame_offsets = []

        return TrajInfo(n_dims, n_frames, frame_offsets)
Example #4
0
    def _get_traj_info(self, filename):
        """Determine offsets, csv dialect and dimension of a csv file.

        The file is opened once with ``self.DEFAULT_OPEN_MODE`` and the same
        handle is passed to the three helpers in turn. The length returned by
        the dialect detection replaces the one from the offset calculation.
        """
        with open(filename, self.DEFAULT_OPEN_MODE) as csv_file:
            raw_length, byte_offsets = PyCSVReader._calc_offsets(csv_file)
            detected = self._determine_dialect(csv_file, raw_length)
            csv_dialect, n_rows, n_skip = detected
            n_columns = PyCSVReader._get_dimension(csv_file, csv_dialect,
                                                   n_skip)

        return TrajInfo(n_columns, n_rows, byte_offsets)
Example #5
0
    def _get_traj_info(self, filename):
        """Report dimension, length and offsets of a trajectory opened with mdtraj.

        If the file handle does not support ``len()`` the remainder of the file
        is read after the first frame and the length is taken from ``tell()``.
        """
        with mdtraj.open(filename, mode='r') as traj_file:
            # certain formats like txt based ones (.gro, .lammpstrj) do not
            # implement len()
            try:
                n_frames = len(traj_file)
                length_known = True
            except (NotImplementedError, TypeError):
                length_known = False

            # the first frame is needed in both cases to get the dimension
            first_frame = traj_file.read(1)[0]
            n_dims = np.shape(first_frame)[1]

            if not length_known:
                # consume the rest of the file, then ask for the position
                traj_file.read()
                n_frames = traj_file.tell()

            if hasattr(traj_file, 'offsets'):
                frame_offsets = traj_file.offsets
            else:
                frame_offsets = ()

        return TrajInfo(n_dims, n_frames, frame_offsets)
Example #6
0
    def _get_traj_info(self, filename):
        """Report dimension, length and offsets of a trajectory opened with mdtraj.

        ``XYZTrajectoryFile.__len__`` is temporarily patched with a function
        that determines the number of frames by iterating over the whole file
        (workaround for NotImplementedError in __len__ for xyz files,
        Github issue: markovmodel/pyemma#621).

        Parameters
        ----------
        filename : str
            Path of the trajectory file.

        Returns
        -------
        TrajInfo
            Built from the second-axis size of the first frame, the number of
            frames, and the handle's ``offsets`` (empty list if absent).
        """
        if six.PY2:
            from mock import patch
        else:
            from unittest.mock import patch
        from mdtraj.formats import XYZTrajectoryFile

        def _make_len_func(top):
            # Bind the topology so the replacement __len__ can call iterload.
            def _len_xyz(self):
                assert isinstance(self, XYZTrajectoryFile)
                assert hasattr(
                    self, '_filename'), "structual change in xyzfile class!"
                import warnings
                from pyemma.util.exceptions import EfficiencyWarning
                # Build the message step by step: a conditional expression
                # binds weaker than '+', so appending the hint inline would
                # replace the whole message by '' when caching is disabled.
                message = ("reading all of your data,"
                           " just to determine number of frames.")
                if config['use_trajectory_lengths_cache']:
                    message += " Happens only once, because this is cached."
                warnings.warn(message, EfficiencyWarning)
                # obtain len by reading whole file!
                mditer = mdtraj.iterload(self._filename, top=top)
                return sum(t.n_frames for t in mditer)

            return _len_xyz

        f = _make_len_func(self.topfile)

        # the patch is only active while the file is being inspected
        with patch.object(XYZTrajectoryFile, '__len__', f):
            with mdtraj.open(filename, mode='r') as fh:
                length = len(fh)
                frame = fh.read(1)[0]
                ndim = np.shape(frame)[1]
                offsets = fh.offsets if hasattr(fh, 'offsets') else []

        return TrajInfo(ndim, length, offsets)
Example #7
0
    def _get_traj_info(self, filename):
        """Scan a delimited text file for line offsets, csv dialect and dimension.

        The file is read several times through the same handle: once to record
        the position after every line, once to determine the dialect (sniffed,
        or built from the configured delimiter) and the number of header lines,
        and once to parse the first data row.

        Side effects: updates ``self._dialects[idx]`` and ``self._skip[idx]``
        for the index of ``filename`` in ``self.filenames``.

        Returns
        -------
        TrajInfo
            Built from the number of columns of the first data row, the number
            of lines minus detected header lines, and the array of offsets.

        Raises
        ------
        RuntimeError
            If ``csv.Sniffer`` cannot determine the dialect.
        ValueError
            If the first data row cannot be converted to floats.
        """
        idx = self.filenames.index(filename)

        def new_size(x):
            # grow by 20 percent, rounded up
            return int(ceil(x * 1.2))
        # how to handle mode?
        """
        On Windows, tell() can return illegal values (after an fgets()) when
        reading files with Unix-style line-endings. Use binary mode ('rb') to
        circumvent this problem.
        """
        with open(filename, self.DEFAULT_OPEN_MODE) as fh:
            # approx by filesize / (first line + 20%)
            # NOTE(review): an empty file yields an empty first line, so this
            # division raises ZeroDivisionError - confirm whether callers
            # guarantee non-empty files.
            size = new_size(os.stat(filename).st_size / len(fh.readline()))
            assert size > 0
            fh.seek(0)
            offsets = np.empty(size, dtype=np.int64)
            offsets[0] = 0
            i = 1
            # record the file position after each line that was read
            while fh.readline():
                offsets[i] = fh.tell()
                i += 1
                # estimate was too small: enlarge the buffer by another 20%
                if i >= len(offsets):
                    offsets = np.resize(offsets, new_size(len(offsets)))
            # drop the unused tail of the pre-allocated buffer
            offsets = offsets[:i]
            # i - 1 lines were read (offsets[0] is the start of the file)
            length = len(offsets) - 1
            fh.seek(0)

            # auto detect delimiter with csv.Sniffer
            if self._delimiters[idx] is None:
                # determine delimiter
                sample = fh.read(2048)
                sniffer = csv.Sniffer()
                try:
                    self._dialects[idx] = sniffer.sniff(sample)
                except csv.Error as e:
                    s = ('During handling of file "%s" follwing error occured:'
                         ' "%s". Sample was "%s"' % (filename, e, sample))
                    raise RuntimeError(s)
                # a sniffed header counts as one line to skip
                if sniffer.has_header(sample):
                    self._skip[idx] += 1
                    length -= 1
            else:
                # delimiter was given: build a dialect around it
                class custom_dialect(csv.Dialect):
                    delimiter = self._delimiters[idx]
                    quotechar = '"'
                    # lets enforce \n because we use text mode with 'U' (unified newline)
                    lineterminator = '\n'
                    quoting = csv.QUOTE_MINIMAL
                d = custom_dialect()
                d.delimiter = self._delimiters[idx]

                # determine header
                # NOTE(review): hdr starts as a bool but is used as a counter,
                # and the loop counts every line in the file that starts with
                # the comment marker, not only leading ones - confirm intent.
                hdr = False
                for line in fh:
                    if line.startswith(self._comments[idx]):
                        hdr += 1
                        continue

                self._skip[idx] += hdr
                length -= hdr

                self._dialects[idx] = d
            # rewind and skip the header lines; afterwards 'line' holds the
            # first data row
            fh.seek(0)
            r = csv.reader(fh, dialect=self._dialects[idx])
            for _ in range(self._skip[idx]+1):
                line = next(r)

            try:
                arr = np.array(line).astype(float)
            except ValueError as ve:
                s = 'could not parse first line of data in file "%s"' % filename
                raise ValueError(s, ve)
            # number of columns of the first data row; a row that squeezes to
            # something other than 1d is treated as one-dimensional
            s = arr.squeeze().shape
            if len(s) == 1:
                ndim = s[0]
            else:
                ndim = 1

        return TrajInfo(ndim, length, offsets)
Example #8
0
    def _get_traj_info(self, filename):
        """Load the array belonging to ``filename`` and report its shape.

        The array is expected to be two-dimensional: the first axis gives the
        length, the second the dimension.
        """
        file_index = self.filenames.index(filename)
        data = self._load_file(file_index)
        shape = np.shape(data)
        n_rows, n_cols = shape
        return TrajInfo(n_cols, n_rows)