def test_usecols(self):
    """Iterating with a column selection must yield only the requested columns."""
    selected = (0, 2)
    reader = NumPyFileReader(self.f4)
    iterator = reader.iterator(chunk=0, return_trajindex=False, cols=selected)
    with iterator:
        for chunk in iterator:
            np.testing.assert_equal(chunk, self.d2[:, selected])
def test_stridden_access(self):
    """Strided output must equal slicing the raw on-disk array with the same stride."""
    reader = NumPyFileReader(self.f1)
    reader.chunksize = 10
    reference = np.load(self.f1)
    for s in (2, 3, 5, 7, 15):
        strided = reader.get_output(stride=s)[0]
        np.testing.assert_equal(strided, reference[::s],
                                "did not match for stride %i" % s)
def test_lagged_stridden_access(self):
    """Lagged + strided iteration must reproduce d[lag::stride] for every combination."""
    reader = NumPyFileReader(self.f1)
    for s in (2, 3, 5, 7, 15):
        for lag in (1, 3, 7, 10, 30):
            # collect the lagged chunks (third element of each iterator item)
            collected = [Y for _, _, Y in reader.iterator(s, lag)]
            np.testing.assert_equal(np.vstack(collected), self.d[lag::s])
def test_npz(self):
    """Every array stored in the .npz archive must come back as one trajectory."""
    reader = NumPyFileReader(self.npz)
    all_data = reader.get_output()
    # Use the NpzFile as a context manager so the file handle is closed even if
    # an assertion below fails (the original closed it manually and leaked on
    # failure); also drop the redundant list() wrapper around fh.items().
    with np.load(self.npz) as fh:
        data = [arr for _, arr in fh.items()]
    self.assertEqual(reader.number_of_trajectories(), len(data))
    for outp, inp in zip(all_data, data):
        np.testing.assert_equal(outp, inp)
def test_only_npy(self):
    """A list of .npy files must yield one trajectory per file with correct totals."""
    reader = NumPyFileReader(self.npy_files)
    loaded = [np.load(fn) for fn in self.npy_files]
    stacked = np.vstack(loaded)
    produced = reader.get_output()
    self.assertEqual(reader.number_of_trajectories(), len(self.npy_files))
    self.assertEqual(reader.n_frames_total(), stacked.shape[0])
    for got, expected in zip(produced, loaded):
        np.testing.assert_array_almost_equal(got, expected)
def test_npz(self):
    """Each array of the archive must be returned as its own trajectory."""
    reader = NumPyFileReader(self.npz)
    produced = reader.get_output()
    fh = np.load(self.npz)
    expected = [value for _, value in fh.items()]
    fh.close()
    self.assertEqual(reader.number_of_trajectories(), len(expected))
    for got, want in zip(produced, expected):
        np.testing.assert_equal(got, want)
def load_from_files(cls, files):
    """Construct an instance by eagerly loading every given file into memory.

    Parameters
    ----------
    files: str or list of str
        filenames to read from
    """
    # deferred import to break the cyclic dependency with the reader module
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    loaded = NumPyFileReader(files).get_output()
    return cls(loaded)
def test_numpy_reader(self):
    """Round-trip check: reader built from params must compare equal via self.compare."""
    values = np.random.random(10)
    from pyemma.util.files import TemporaryDirectory
    with TemporaryDirectory() as tmp:
        paths = [os.path.join(tmp, name) for name in ('1.npy', '2.npy')]
        for p in paths:
            np.save(p, values)
        params = {'filenames': paths, 'chunksize': 23}
        self.compare(NumPyFileReader(**params), params)
def test_different_shapes_value_error(self):
    """Mixing files of different dimensionality must raise a ValueError."""
    with tempfile.NamedTemporaryFile(delete=False, suffix='.npy') as f:
        x = np.zeros((3, 42))
        np.save(f.name, x)
    # delete=False lets np.save reopen the file by name on all platforms, but
    # the original version never removed the file afterwards -- register a
    # cleanup so the temp file does not leak.
    self.addCleanup(os.unlink, f.name)
    myfiles = self.files2d[:]
    myfiles.insert(1, f.name)
    with self.assertRaises(ValueError) as cm:
        NumPyFileReader(myfiles)
    self.assertIn("different dimensions", cm.exception.args[0])
def test_lagged_stridden_access_multiple_files(self):
    """Per-trajectory lagged/strided chunks must match slicing each source file."""
    reader = NumPyFileReader(self.files2d)
    for stride in (2, 3, 5, 7, 15):
        for lag in (1, 3, 7, 10, 30):
            # gather the lagged chunks per trajectory index
            collected = {i: [] for i in range(reader.number_of_trajectories())}
            for itraj, _, Y in reader.iterator(stride, lag):
                collected[itraj].append(Y)
            for idx, pieces in enumerate(collected.values()):
                expected = np.load(self.files2d[idx])
                np.testing.assert_equal(
                    np.vstack(pieces), expected[lag::stride],
                    "not equal for stride=%i"
                    " and lag=%i" % (stride, lag))
def test_skip(self):
    """get_output(skip=n) must equal the unskipped output sliced by [n::]."""
    for skip in (0, 3, 13):
        skipped = NumPyFileReader(self.npy_files[0]).get_output(skip=skip)[0]
        full = NumPyFileReader(self.npy_files[0]).get_output()[0]
        np.testing.assert_almost_equal(
            skipped, full[skip::],
            err_msg="The first %s rows were skipped, but that did not "
                    "match the rows with skip=0 and sliced by [%s::]"
                    % (skip, skip))
def test_npy_reader(self):
    """Traj-info cache entries for .npy files must report length, ndim and offsets."""
    shapes = [(7, 3), (23, 3), (27, 3)]
    arrays = [np.empty(shape) for shape in shapes]
    files = []
    with TemporaryDirectory() as td:
        for i, arr in enumerate(arrays):
            path = os.path.join(td, "%i.npy" % i)
            np.save(path, arr)
            files.append(path)
        reader = NumPyFileReader(files)
        # populate the cache and read back the stored info per file
        results = {}
        for f in files:
            info = self.db[f, reader]
            results[f] = (info.length, info.ndim, info.offsets)
        expected = {f: (len(arrays[i]), arrays[i].shape[1], [])
                    for i, f in enumerate(files)}
        np.testing.assert_equal(results, expected)
def test_skip_input_list(self):
    """skip must be applied to every trajectory when reading a list of files."""
    for skip in (0, 3, 13):
        out_with_skip = NumPyFileReader(self.npy_files).get_output(skip=skip)
        out = NumPyFileReader(self.npy_files).get_output()
        # idiomatic pairing via enumerate/zip instead of range(0, len(...))
        # with repeated indexing
        for i, (skipped, full) in enumerate(zip(out_with_skip, out)):
            np.testing.assert_almost_equal(
                skipped, full[skip::],
                err_msg="The first %s rows of the %s'th file were skipped, but that did not "
                        "match the rows with skip=0 and sliced by [%s::]" % (skip, i, skip))
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r""" Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunksize:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.
    """
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: a list that itself contains lists/tuples
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories
    if (isinstance(input_files, (Path, str))
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, (Path, str)) for item in input_files)
                     or len(input_files) == 0))):
        # check: if single string create a one-element list
        if isinstance(input_files, (Path, str)):
            input_list = [input_files]
        elif len(input_files) > 0 and all(isinstance(item, (Path, str)) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError("The passed list did not exclusively contain strings or was a list of lists "
                                 "(fragmented trajectory).")

        # convert to list of paths
        input_list = [Path(f) for f in input_list]

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        suffix = input_list[0].suffix

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.suffix == suffix for item in input_list):
            # do all the files exist? If not: Raise value error
            all_exist = True
            # FIX: use the stdlib io.StringIO instead of the third-party six
            # compatibility shim -- this function already uses Python-3-only
            # pathlib, so six is an unnecessary dependency here.
            from io import StringIO
            err_msg = StringIO()
            for item in input_list:
                if not item.is_file():
                    err_msg.write('\n' if err_msg.tell() > 0 else "")
                    err_msg.write('File %s did not exist or was no file' % item)
                    all_exist = False
            if not all_exist:
                raise ValueError('Some of the given input files were directories'
                                 ' or did not exist:\n%s' % err_msg.getvalue())
            featurizer_or_top_provided = featurizer is not None or topology is not None
            # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
            if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
                # This check is potentially expensive for lots of files, we also re-open the file twice (causing atime updates etc.)
                # So we simply require that no featurizer option is given.
                # and not all((_is_mdtraj_hdf5_file(f) for f in input_files)):
                from pyemma.coordinates.data.h5_reader import H5Reader
                reader = H5Reader(filenames=input_files, chunk_size=chunksize, **kw)
            # CASE 1.1: file types are MD files
            elif FeatureReader.supports_format(suffix):
                # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                # create a MD reader with file names and topology
                if not featurizer_or_top_provided:
                    raise ValueError('The input files were MD files which makes it mandatory to have either a '
                                     'Featurizer or a topology file.')
                if suffix in ('.pdb', '.pdb.gz'):
                    raise ValueError('PyEMMA can not read PDB-fake-trajectories. '
                                     'Please consider using a sane trajectory format (e.g. xtc, dcd).')
                reader = FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                                       chunksize=chunksize)
            elif suffix in ('.npy', '.npz'):
                reader = NumPyFileReader(input_list, chunksize=chunksize)
            # otherwise we assume that given files are ascii tabulated data
            else:
                reader = PyCSVReader(input_list, chunksize=chunksize, **kw)
        else:
            raise ValueError('Not all elements in the input list were of the type %s!' % suffix)
    else:
        raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))

    return reader
def create_file_reader(input_files, topology, featurizer, chunk_size=1000, **kw):
    r""" Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunk_size:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.
    """
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: a list that itself contains lists/tuples
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunk_size, featurizer)

    # normal trajectories
    # FIX: the original compared ``len(input_files) is 0`` -- identity
    # comparison against an int literal is implementation-defined and raises a
    # SyntaxWarning on modern CPython; use == instead.
    if (isinstance(input_files, string_types)
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, string_types) for item in input_files)
                     or len(input_files) == 0))):
        reader = None
        # check: if single string create a one-element list
        if isinstance(input_files, string_types):
            input_list = [input_files]
        elif len(input_files) > 0 and all(isinstance(item, string_types) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError(
                    "The passed list did not exclusively contain strings or was a list of lists "
                    "(fragmented trajectory).")

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        _, suffix = os.path.splitext(input_list[0])

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.endswith(suffix) for item in input_list):
            # do all the files exist? If not: Raise value error
            all_exist = True
            err_msg = ""
            for item in input_list:
                if not os.path.isfile(item):
                    err_msg += "\n" if len(err_msg) > 0 else ""
                    err_msg += "File %s did not exist or was no file" % item
                    all_exist = False
            if not all_exist:
                raise ValueError(
                    "Some of the given input files were directories"
                    " or did not exist:\n%s" % err_msg)

            if all_exist:
                from mdtraj.formats.registry import FormatRegistry
                # CASE 1.1: file types are MD files
                if suffix in list(FormatRegistry.loaders.keys()):
                    # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                    # create a MD reader with file names and topology
                    if not featurizer and not topology:
                        raise ValueError(
                            "The input files were MD files which makes it mandatory to have either a "
                            "featurizer or a topology file.")
                    reader = FeatureReader(input_list,
                                           featurizer=featurizer,
                                           topologyfile=topology,
                                           chunksize=chunk_size)
                else:
                    if suffix in ['.npy', '.npz']:
                        reader = NumPyFileReader(input_list, chunksize=chunk_size)
                    # otherwise we assume that given files are ascii tabulated data
                    else:
                        reader = PyCSVReader(input_list, chunksize=chunk_size, **kw)
        else:
            raise ValueError(
                "Not all elements in the input list were of the type %s!" % suffix)
    else:
        # BUG FIX: the original interpolated the *builtin* ``input`` function
        # into the message instead of the ``input_files`` argument.
        raise ValueError("Input \"%s\" was no string or list of strings." % input_files)
    return reader
def testSingleFile(self):
    """A reader over one .npy file must report that file's full frame count."""
    single = NumPyFileReader(self.npy_files[0])
    self.assertEqual(single.n_frames_total(), self.d.shape[0])
def test_describe(self):
    """describe() must run without raising on a multi-file reader (smoke test)."""
    multi = NumPyFileReader(self.files2d)
    multi.describe()