def test_read_1file_oneline(self):
    """Read a one-column file; output must be the (3, 1) column vector."""
    tiny = np.array([1, 2, 3])
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.dat', delete=False) as f:
        np.savetxt(f, tiny)
    try:
        reader = CSVReader(f.name, delimiters=" ")
        # one value per row -> the reader should produce a 2D column vector
        np.testing.assert_equal(reader.get_output()[0], np.atleast_2d(tiny).T)
    finally:
        # delete=False, so the file must be removed explicitly (was leaked before)
        os.unlink(f.name)
def test_with_kwargs(self):
    """Extra keyword arguments (here: header=27) must be forwarded to the reader."""
    kwargs = {'header': 27}
    output = CSVReader(self.filename1, **kwargs).get_output()
    np.testing.assert_almost_equal(output[0], self.data)
def test_newline_at_eof_with_header(self):
    """A trailing blank line after the data must be ignored (file has a header)."""
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write("#x y z\n1 2 3\n4 5 6\n\n")
    try:
        # genfromtxt serves as the reference parser for the same file
        desired = np.genfromtxt(f.name, dtype=np.float32).reshape(-1, 3)
        result = CSVReader(f.name).get_output()[0]
        np.testing.assert_allclose(result, desired)
    finally:
        # delete=False, so the file must be removed explicitly (was leaked before)
        os.unlink(f.name)
def test_reset(self):
    """After reset() an iterator must reproduce exactly the same chunks."""
    reader = CSVReader((self.filename1, self.filename2))
    it = reader.iterator()
    first_pass = list(it)
    it.reset()
    second_pass = list(it)
    np.testing.assert_equal(first_pass, second_pass)
def test_holes_in_file(self):
    """A file whose last line lacks a trailing newline must still be read fully."""
    x = "1 2 3\n4 5 6\n7 8 9"
    # np.fromstring(text, sep=...) is deprecated; parse via split() instead,
    # which yields the same whitespace-separated values.
    desired = np.array(x.split(), dtype=np.float32).reshape(-1, 3)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(x)
    try:
        result = CSVReader(f.name).get_output()[0]
        np.testing.assert_allclose(result, desired)
    finally:
        # delete=False, so the file must be removed explicitly (was leaked before)
        os.unlink(f.name)
def test_read_1file_with_header(self):
    """Header lines must be skipped; counts and data must match the fixture."""
    reader = CSVReader(self.file_with_header)
    self.assertEqual(reader.number_of_trajectories(), 1)
    self.assertEqual(reader.dimension(), self.nd)
    self.assertEqual(reader.n_frames_total(), self.nt)
    np.testing.assert_almost_equal(reader.get_output()[0], self.data)
def test_newline_at_eof_carriage_return(self):
    """Windows-style \\r\\n line endings plus a trailing newline must parse cleanly."""
    x = "1 2 3\r\n4 5 6\r\n"
    # np.fromstring(text, sep=...) is deprecated; split() on any whitespace
    # (including \r\n) yields the same values.
    desired = np.array(x.split(), dtype=np.float32).reshape(-1, 3)
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write(x)
    try:
        result = CSVReader(f.name).get_output()[0]
        np.testing.assert_allclose(result, desired)
    finally:
        # delete=False, so the file must be removed explicitly (was leaked before)
        os.unlink(f.name)
def test_with_lag(self):
    """Lagged iteration must yield the data shifted by the lag time."""
    reader = CSVReader(self.filename1)
    for lag in [23, 7, 59]:
        with reader.iterator(stride=1, lag=lag) as it:
            pieces = [Y for _, _, Y in it]
        np.testing.assert_almost_equal(np.vstack(pieces), self.data[lag:])
def test_read_with_skipping_first_few_couple_lines_multiple_trajectoryfiles(self):
    """skip=n applied to a multi-file reader must drop n rows from every file."""
    template = ("The first %s rows of the %s file were skipped, but that did not"
                " match the rows with skip=0 and sliced by [%s::]")
    for skip in [0, 3, 13]:
        skipped = CSVReader([self.filename1, self.filename2]).get_output(skip=skip)
        full = CSVReader([self.filename1, self.filename2]).get_output()
        for idx, which in enumerate(("first", "second")):
            np.testing.assert_almost_equal(skipped[idx], full[idx][skip::],
                                           err_msg=template % (skip, which, skip))
def test_read_with_skipping_first_few_couple_lines(self):
    """get_output(skip=n) must match the full output sliced with [n::]."""
    for n_skip in [0, 3, 13]:
        # FIXME: opening the same file twice is not being liked by py27
        skipped = CSVReader(self.filename1, chunksize=30).get_output(skip=n_skip)[0]
        assert len(skipped) == len(self.data[n_skip:])
        self.maxDiff = None
        full = CSVReader(self.filename1, chunksize=30).get_output()[0]
        err = ("The first %s rows were skipped, but that did not "
               "match the rows with skip=0 and sliced by [%s::]" % (n_skip, n_skip))
        np.testing.assert_almost_equal(skipped, full[n_skip::], err_msg=err)
def test_read_lagged_small_chunks(self):
    """Lagged iteration in small chunks must concatenate to data[lag:]."""
    lag = 200
    reader = CSVReader(self.filename1, chunksize=30)
    collected = []
    with reader.iterator(lag=lag) as it:
        for _, _, lagged in it:
            assert len(lagged) > 0
            collected.append(lagged)
    np.testing.assert_almost_equal(np.vstack(collected), self.data[lag:])
def test_csvreader(self):
    """Offsets stored in the trajectory info must match per-line readline()/tell() offsets."""
    data = np.random.random((101, 3))
    # tempfile.mktemp() is deprecated and race-prone; create the file atomically
    # and keep it around (delete=False) until the finally-clause removes it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        fn = tmp.name
    try:
        np.savetxt(fn, data)
        # collect the byte offset of every line start
        offsets = [0]
        with open(fn, PyCSVReader.DEFAULT_OPEN_MODE) as new_fh:
            while new_fh.readline():
                offsets.append(new_fh.tell())
        reader = PyCSVReader(fn)
        assert reader.dimension() == 3
        trajinfo = reader._get_traj_info(fn)
        np.testing.assert_equal(offsets, trajinfo.offsets)
    finally:
        os.unlink(fn)
def test_with_stride_and_lag_with_header(self):
    """Strided + lagged iteration over a headered file must match plain slicing."""
    reader = CSVReader(self.file_with_header)
    for stride in [2, 3, 7, 10]:
        for lag in [1, 23, 7, 59]:
            xs, ys = [], []
            with reader.iterator(stride=stride, lag=lag) as it:
                for _, X, Y in it:
                    xs.append(X)
                    ys.append(Y)
            expected_lagged = self.data[lag::stride]
            # the un-lagged stream is truncated to the length of the lagged one
            np.testing.assert_almost_equal(np.vstack(xs),
                                           self.data[::stride][0:len(expected_lagged)])
            np.testing.assert_almost_equal(np.vstack(ys), expected_lagged,
                                           err_msg="output is not equal for"
                                                   " lag %i and stride %i" % (lag, stride))
def test_compare_readline(self):
    """Reader offsets must agree with offsets obtained via readline()/tell(),
    and seeking to either offset must yield the same line."""
    data = np.arange(99*3).reshape(-1, 3)
    with tempfile.NamedTemporaryFile(delete=False) as f:
        fn = f.name
    try:
        np.savetxt(fn, data)
        reader = CSVReader(fn)
        assert reader.dimension() == 3
        trajinfo = reader._get_traj_info(fn)
        # calc offsets independently with readline/tell
        offset = [0]
        with open(fn, CSVReader.DEFAULT_OPEN_MODE) as fh2:
            while fh2.readline():
                offset.append(fh2.tell())
            fh2.seek(0)
            np.testing.assert_equal(trajinfo.offsets, offset)
            for ii, off in enumerate(trajinfo.offsets):
                fh2.seek(off)
                line = fh2.readline()
                fh2.seek(offset[ii])
                line2 = fh2.readline()
                self.assertEqual(line, line2,
                                 "differs at offset %i (%s != %s)" % (ii, off, offset[ii]))
    finally:
        # delete=False, so the file must be removed explicitly (was leaked before)
        os.unlink(fn)
def test_use_cols(self):
    """Column selection via cols= must restrict the output to those columns."""
    reader = CSVReader(self.filename1)
    cols = (0, 2)
    with reader.iterator(chunk=0, cols=cols, return_trajindex=False) as it:
        for chunk in it:
            np.testing.assert_equal(chunk, self.data[:, cols])
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r""" Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunksize:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.
    """
    from pyerna.coordinates.data.numpy_filereader import NumPyFileReader
    from pyerna.coordinates.data.py_csv_reader import PyCSVReader
    from pyerna.coordinates.data import FeatureReader
    from pyerna.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: a non-empty list that itself contains lists/tuples
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories
    if (isinstance(input_files, str)
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, str) for item in input_files)
                     or len(input_files) == 0))):
        reader = None
        # check: if single string create a one-element list
        if isinstance(input_files, str):
            input_list = [input_files]
        elif len(input_files) > 0 and all(isinstance(item, str) for item in input_files):
            input_list = input_files
        else:
            # NOTE: `is 0` replaced with `== 0` — identity comparison with an int
            # literal is implementation-dependent and a SyntaxWarning in py3.8+.
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError(
                    "The passed list did not exclusively contain strings or was a list of lists "
                    "(fragmented trajectory).")

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        _, suffix = os.path.splitext(input_list[0])
        suffix = str(suffix)

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.endswith(suffix) for item in input_list):
            # do all the files exist? If not: raise ValueError
            all_exist = True
            from six import StringIO
            err_msg = StringIO()
            for item in input_list:
                if not os.path.isfile(item):
                    err_msg.write('\n' if err_msg.tell() > 0 else "")
                    err_msg.write('File %s did not exist or was no file' % item)
                    all_exist = False
            if not all_exist:
                raise ValueError('Some of the given input files were directories'
                                 ' or did not exist:\n%s' % err_msg.getvalue())

            featurizer_or_top_provided = featurizer is not None or topology is not None
            # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
            if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
                # This check is potentially expensive for lots of files, we also re-open the file twice
                # (causing atime updates etc.). So we simply require that no featurizer option is given.
                # and not all((_is_mdtraj_hdf5_file(f) for f in input_files)):
                from pyerna.coordinates.data.h5_reader import H5Reader
                reader = H5Reader(filenames=input_files, chunk_size=chunksize, **kw)
            # CASE 1.1: file types are MD files
            elif FeatureReader.supports_format(suffix):
                # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                # create a MD reader with file names and topology
                if not featurizer_or_top_provided:
                    raise ValueError(
                        'The input files were MD files which makes it mandatory to have either a '
                        'Featurizer or a topology file.')
                # NOTE(review): splitext never yields '.pdb.gz', so that branch looks unreachable — confirm
                if suffix in ('.pdb', '.pdb.gz'):
                    raise ValueError(
                        'PyEMMA can not read PDB-fake-trajectories. '
                        'Please consider using a sane trajectory format (e.g. xtc, dcd).'
                    )
                reader = FeatureReader(input_list, featurizer=featurizer,
                                       topologyfile=topology, chunksize=chunksize)
            elif suffix in ('.npy', '.npz'):
                reader = NumPyFileReader(input_list, chunksize=chunksize)
            # otherwise we assume that given files are ascii tabulated data
            else:
                reader = PyCSVReader(input_list, chunksize=chunksize, **kw)
        else:
            raise ValueError('Not all elements in the input list were of the type %s!'
                             % suffix)
    else:
        raise ValueError(
            'Input "{}" was no string or list of strings.'.format(input_files))
    return reader
def test_with_multiple_files(self):
    """A reader built from several files reports one trajectory per file."""
    files = [self.filename1, self.file_with_header]
    self.assertEqual(CSVReader(files).number_of_trajectories(), len(files))
def test_with_stride(self):
    """get_output(stride=s) must equal the data sliced by [::s]."""
    reader = CSVReader(self.filename1)
    for stride in [2, 3, 7, 10]:
        strided = reader.get_output(stride=stride)[0]
        np.testing.assert_almost_equal(strided, self.data[::stride],
                                       err_msg="stride=%s" % stride)