def _get_traj_info(self, filename):
    """Collect a TrajInfo for every dataset of an HDF5 file matching the selection.

    The selection is looked up per file (``self.selection[filename]``); if that
    lookup fails with ``KeyError`` or ``TypeError``, ``self.selection`` itself is
    used. It is applied with ``re.match`` to the name of every ``h5py.Dataset``
    found while visiting the file.

    For each matching dataset, ``(filename, dataset_path)`` is stored in
    ``self._itraj_dataset_mapping`` under the current ``self._itraj_counter``,
    which is then incremented.

    :param filename: path of the HDF5 file to inspect.
    :return: the TrajInfo of the first matching dataset, with the remaining
        ones attached as its ``children`` list; ``TrajInfo(-1, 0)`` if nothing
        matched (a warning is logged in that case).
    """
    import re
    # noinspection PyUnresolvedReferences
    import tables  # unused on purpose; NOTE(review): presumably needed for its import side effects - confirm
    import h5py

    with h5py.File(filename, mode='r') as f:
        try:
            sel = self.selection[filename]
        except (KeyError, TypeError):
            sel = self.selection

        paths = []

        def collect_matching_dataset(_, obj):
            # Must implicitly return None: h5py aborts the traversal as soon
            # as the visitor returns anything else.
            if isinstance(obj, h5py.Dataset) and re.match(sel, obj.name) is not None:
                paths.append(obj.name)

        f.visititems(collect_matching_dataset)

        if not paths:
            self.logger.warning(
                'selection "%s" did not match any group/dataset in file "%s"',
                sel, filename)

        children = []
        for path in paths:
            # dry=True: only the resulting 2d shape is of interest here.
            _, shape_2d = self._reshape(f[path], dry=True)
            lengths, ndim = shape_2d
            self._itraj_dataset_mapping[self._itraj_counter] = (filename, path)
            self._itraj_counter += 1
            children.append(TrajInfo(ndim, lengths))

    if not children:
        return TrajInfo(-1, 0)

    first = children[0]
    first.children = children[1:]
    return first
def _create_traj_info(row):
    """Convert a database row to a TrajInfo object.

    The row is read positionally: ``row[0]`` hash, ``row[1]`` length,
    ``row[2]`` ndim, ``row[3]`` offsets (must be a ``numpy.ndarray``),
    ``row[4]`` absolute path, ``row[5]`` format version. Only version 2 is
    understood.

    :param row: indexable database row with at least six fields.
    :return: a populated TrajInfo instance.
    :raises UnknownDBFormatException: on any failure (row too short, offsets
        of the wrong type, unknown version, ...). The underlying error is
        logged and passed as the exception argument.
    """
    try:
        hash_value = row[0]  # not named `hash`, which would shadow the builtin
        length = row[1]
        ndim = row[2]
        offsets = row[3]
        # Explicit check instead of `assert`, which is stripped under `python -O`
        # and would let malformed rows through unnoticed.
        if not isinstance(offsets, np.ndarray):
            raise TypeError("offsets have to be a numpy.ndarray, but got %s"
                            % type(offsets).__name__)
        abs_path = row[4]
        version = row[5]

        info = TrajInfo()
        info._version = version
        if version != 2:
            raise ValueError("unknown version %s" % version)

        info._hash = hash_value
        info._ndim = ndim
        info._length = length
        info._offsets = offsets
        info._abs_path = abs_path
        return info
    except Exception as ex:
        logger.exception(ex)
        raise UnknownDBFormatException(ex)
def _get_traj_info(self, filename):
    """Open *filename* with mdtraj and describe it as a TrajInfo.

    The length is ``len()`` of the opened file, the dimension is the size of
    axis 1 of the first element returned by ``read(1)``, and the offsets are
    the handle's ``offsets`` attribute if it has one (an empty list otherwise).
    """
    with mdtraj.open(filename, mode='r') as traj_file:
        n_frames = len(traj_file)
        first_chunk = traj_file.read(1)
        ndim = np.shape(first_chunk[0])[1]
        if hasattr(traj_file, 'offsets'):
            offsets = traj_file.offsets
        else:
            offsets = []
        return TrajInfo(ndim, n_frames, offsets)
def _get_traj_info(self, filename):
    """Describe a CSV file as a TrajInfo.

    With the file opened in ``self.DEFAULT_OPEN_MODE``, three helpers run in
    order on the same handle: ``PyCSVReader._calc_offsets`` (line count and
    byte offsets), ``self._determine_dialect`` (dialect, corrected length and
    number of rows to skip) and ``PyCSVReader._get_dimension`` (dimension).
    """
    with open(filename, self.DEFAULT_OPEN_MODE) as csv_file:
        n_lines, offsets = PyCSVReader._calc_offsets(csv_file)
        dialect, n_rows, skip = self._determine_dialect(csv_file, n_lines)
        ndim = PyCSVReader._get_dimension(csv_file, dialect, skip)
        info = TrajInfo(ndim, n_rows, offsets)
    return info
def _get_traj_info(self, filename):
    """Open *filename* with mdtraj and describe it as a TrajInfo.

    If ``len()`` of the opened file raises ``NotImplementedError`` or
    ``TypeError`` (certain formats like txt based ones, e.g. .gro and
    .lammpstrj, do not implement it), the remainder of the file is read after
    the first frame and the length is taken from ``tell()``.
    """
    with mdtraj.open(filename, mode='r') as traj_file:
        try:
            n_frames = len(traj_file)
        except (NotImplementedError, TypeError):
            # sentinel: length has to be determined by reading everything
            n_frames = None

        # the first frame is needed in either case to obtain the dimension
        first_frame = traj_file.read(1)[0]
        ndim = np.shape(first_frame)[1]

        if n_frames is None:
            traj_file.read()
            n_frames = traj_file.tell()

        if hasattr(traj_file, 'offsets'):
            offsets = traj_file.offsets
        else:
            offsets = ()

        return TrajInfo(ndim, n_frames, offsets)
def _get_traj_info(self, filename):
    """Open *filename* with mdtraj and describe it as a TrajInfo.

    While the file is inspected, ``XYZTrajectoryFile.__len__`` is temporarily
    patched with an implementation that iterates over the whole file (using
    ``self.topfile`` as topology) and sums up the frames of all chunks.
    This is the workaround for NotImplementedError in ``__len__`` for xyz
    files, Github issue: markovmodel/pyemma#621

    :param filename: trajectory file to inspect.
    :return: TrajInfo(ndim, length, offsets); offsets is the handle's
        ``offsets`` attribute if present, else an empty list.
    """
    if six.PY2:
        from mock import patch
    else:
        from unittest.mock import patch
    from mdtraj.formats import XYZTrajectoryFile

    def _make_len_func(top):
        # `top` is bound here so that the patched method needs no extra arguments.
        def _len_xyz(traj_file):
            assert isinstance(traj_file, XYZTrajectoryFile)
            assert hasattr(
                traj_file, '_filename'), "structual change in xyzfile class!"
            import warnings
            from pyemma.util.exceptions import EfficiencyWarning

            # Only the hint about caching depends on the config switch. The
            # former single expression applied the conditional to the whole
            # message, which produced an empty warning with the cache disabled.
            message = ("reading all of your data,"
                       " just to determine number of frames.")
            if config['use_trajectory_lengths_cache']:
                message += " Happens only once, because this is cached."
            warnings.warn(message, EfficiencyWarning)

            # obtain len by reading whole file!
            mditer = mdtraj.iterload(traj_file._filename, top=top)
            return sum(t.n_frames for t in mditer)
        return _len_xyz

    f = _make_len_func(self.topfile)

    # lookups pre-computed lengths, or compute it on the fly and store it in db.
    with patch.object(XYZTrajectoryFile, '__len__', f):
        with mdtraj.open(filename, mode='r') as fh:
            length = len(fh)
            frame = fh.read(1)[0]
            ndim = np.shape(frame)[1]
            offsets = fh.offsets if hasattr(fh, 'offsets') else []

    return TrajInfo(ndim, length, offsets)
def _get_traj_info(self, filename):
    # Scan a CSV file once to obtain line byte offsets, the number of data
    # rows, the csv dialect and the number of columns of the first data row.
    #
    # Side effects: stores the dialect in self._dialects[idx] and increases
    # self._skip[idx] by the number of detected header/comment lines.
    # Returns TrajInfo(ndim, length, offsets).
    # Raises RuntimeError if csv.Sniffer cannot determine the dialect and
    # ValueError if the first data row cannot be converted to float.

    # position of this file in self.filenames; indexes the per-file settings
    idx = self.filenames.index(filename)

    # growth policy of the offsets buffer: 20% larger, rounded up
    def new_size(x):
        return int(ceil(x * 1.2))
    # how to handle mode?
    """ On Windows, tell() can return illegal values (after an fgets()) when reading files with Unix-style line-endings. Use binary mode ('rb') to circumvent this problem. """
    with open(filename, self.DEFAULT_OPEN_MODE) as fh:
        # approx by filesize / (first line + 20%)
        # NOTE(review): if the first line is empty (e.g. an empty file),
        # len(...) is 0 and this division fails - confirm callers never
        # pass such files.
        size = new_size(os.stat(filename).st_size / len(fh.readline()))
        assert size > 0
        fh.seek(0)
        offsets = np.empty(size, dtype=np.int64)
        offsets[0] = 0
        i = 1
        # record the file position after every line that could be read
        while fh.readline():
            offsets[i] = fh.tell()
            i += 1
            # buffer exhausted: enlarge it before the next write
            if i >= len(offsets):
                offsets = np.resize(offsets, new_size(len(offsets)))
        # drop the unused tail of the buffer
        offsets = offsets[:i]
        # offsets holds one entry more than lines were read (the leading 0)
        length = len(offsets) - 1
        fh.seek(0)

        # auto detect delimiter with csv.Sniffer
        if self._delimiters[idx] is None:
            # determine delimiter
            sample = fh.read(2048)
            sniffer = csv.Sniffer()
            try:
                self._dialects[idx] = sniffer.sniff(sample)
            except csv.Error as e:
                s = ('During handling of file "%s" follwing error occured:'
                     ' "%s". \nSample was "%s"' % (filename, e, sample))
                raise RuntimeError(s)
            # a sniffed header line is skipped and not counted as data
            if sniffer.has_header(sample):
                self._skip[idx] += 1
                length -= 1
        else:
            # delimiter given by the user: build a dialect around it
            class custom_dialect(csv.Dialect):
                delimiter = self._delimiters[idx]
                quotechar = '"'
                # lets enforce \n because we use text mode with 'U' (unified newline)
                lineterminator = '\n'
                quoting = csv.QUOTE_MINIMAL
            d = custom_dialect()
            d.delimiter = self._delimiters[idx]

            # determine header
            # hdr starts as False but is used as a counter (bool counts as int)
            # NOTE(review): the loop has no break, so every line of the file
            # starting with the comment prefix is counted, not only leading
            # ones - confirm this is intended.
            hdr = False
            for line in fh:
                if line.startswith(self._comments[idx]):
                    hdr += 1
                    continue

            self._skip[idx] += hdr
            length -= hdr

            self._dialects[idx] = d
            # if we have a header subtract it from total length
        # rewind and advance to the first row after the skipped ones
        fh.seek(0)
        r = csv.reader(fh, dialect=self._dialects[idx])
        for _ in range(self._skip[idx]+1):
            line = next(r)

        # the dimension is derived from the first data row
        try:
            arr = np.array(line).astype(float)
        except ValueError as ve:
            s = 'could not parse first line of data in file "%s"' % filename
            raise ValueError(s, ve)
        # a one-dimensional squeezed row gives the column count, else 1
        s = arr.squeeze().shape
        if len(s) == 1:
            ndim = s[0]
        else:
            ndim = 1

    return TrajInfo(ndim, length, offsets)
def _get_traj_info(self, filename):
    """Describe the array loaded for *filename* as a TrajInfo.

    The file is located by its position in ``self.filenames`` and loaded via
    ``self._load_file``; the shape of the result has to unpack into exactly
    two values, which are taken as (length, ndim).
    """
    file_index = self.filenames.index(filename)
    shape = np.shape(self._load_file(file_index))
    n_frames, n_dims = shape
    return TrajInfo(n_dims, n_frames)