Esempio n. 1
0
def test_dtype(comm):
    """Check that CSVFile reports the dtypes requested via ``dtype``.

    The ``dtype`` keyword is exercised twice: once as a per-column dict and
    once as a single string. Only the ``dtype`` attribute of the resulting
    file object is inspected; no column data is compared.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # three empty lines followed by a 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        tmp.write(b"\n\n\n")
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        # dtype given as a dict covering only some of the columns
        csv = CSVFile(path=tmp.name,
                      names=columns,
                      blocksize=100,
                      dtype={'a': 'f4', 'b': 'i8', 'c': 'f16'})

        # each listed column must report the requested dtype
        assert csv.dtype['a'] == 'f4'
        assert csv.dtype['b'] == 'i8'
        assert csv.dtype['c'] == 'f16'

        # dtype given as a single string
        csv = CSVFile(path=tmp.name, names=columns, blocksize=100, dtype="f4")

        # every column must now report 'f4'
        for col in columns:
            assert csv.dtype[col] == 'f4'
Esempio n. 2
0
def test_comma_sep(comm):
    """Read a comma-separated file through an explicit ``sep=','``.

    Opening the same file with ``delim_whitespace=True`` is expected to
    raise ``ValueError``.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 random table written with ',' between fields
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e', delimiter=',')
        tmp.seek(0)

        # whitespace-delimited parsing must be rejected
        with pytest.raises(ValueError):
            CSVFile(path=tmp.name,
                    names=columns,
                    blocksize=100,
                    delim_whitespace=True)

        # comma-delimited parsing must reproduce the written values
        comma_options = dict(delim_whitespace=False, sep=',')
        csv = CSVFile(path=tmp.name,
                      names=columns,
                      blocksize=100,
                      **comma_options)
        loaded = csv.asarray()
        numpy.testing.assert_almost_equal(table, loaded[:], decimal=7)
Esempio n. 3
0
def test_skiprows(comm):
    """Combine ``skiprows=25`` with ``nrows=50`` and check the rows read.

    The file holds 100 rows; the test expects rows 25 through 74 of the
    written table to come back.
    """
    columns = ['a', 'b', 'c', 'd', 'e']
    n_skip, n_read = 25, 50

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        csv = CSVFile(path=tmp.name,
                      names=columns,
                      blocksize=1000,
                      nrows=n_read,
                      skiprows=n_skip)

        # reported size must match both the data read and the request
        n_loaded = len(csv[:])
        assert csv.size == n_loaded, "mismatch between 'size' and data read from file"
        assert csv.size == 50, "error combining 'skiprows' and 'nrows'"

        # the rows returned must be the ones after the skipped prefix
        loaded = csv.asarray()
        numpy.testing.assert_almost_equal(table[25:75], loaded[:], decimal=7)
Esempio n. 4
0
def test_asarray(comm):
    """Check the two ``asarray()`` calls that are expected to fail.

    ``asarray()`` is expected to raise ``ValueError`` both on a
    single-column selection and on a file whose columns were given
    different dtypes.
    """
    columns = ['a', 'b', 'c', 'd', 'e']
    mixed_dtypes = {'a': 'f4', 'b': 'f8'}

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        csv = CSVFile(path=tmp.name,
                      names=columns,
                      dtype=mixed_dtypes,
                      blocksize=100)

        # asarray() on the result of selecting one column by name
        single = csv['a']
        with pytest.raises(ValueError):
            single.asarray()

        # asarray() on the whole file, whose columns differ in dtype
        with pytest.raises(ValueError):
            csv.asarray()
Esempio n. 5
0
def test_dask(comm):
    """Check that ``get_dask('a')`` computes to the first written column."""
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        csv = CSVFile(path=tmp.name, names=columns, blocksize=100)

        # evaluate the returned object and compare with column 0
        lazy_a = csv.get_dask('a')
        computed = lazy_a.compute()
        numpy.testing.assert_almost_equal(computed, table[:, 0])
Esempio n. 6
0
def test_getitem(comm):
    """Exercise the indexing forms accepted and rejected by CSVFile.

    Covers column selection by name or list of names, array-style slicing
    of the ``asarray()`` result, integer-list and boolean-mask indexing,
    and the index expressions that are expected to raise ``IndexError``.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        csv = CSVFile(path=tmp.name, names=columns, blocksize=100)

        # an empty list is not a valid index
        with pytest.raises(IndexError):
            csv[[]]

        # a column selected by name cannot be indexed by name again
        col_a = csv['a']
        with pytest.raises(IndexError):
            col_a['a']

        # unknown column names
        with pytest.raises(IndexError):
            csv[['BAD1', 'BAD2']]

        # a list of names narrows the columns, and can be narrowed further
        two_cols = csv[['a', 'b']]
        assert two_cols.columns == ['a', 'b']
        one_col = two_cols[['a']]
        assert one_col.columns == ['a']

        # array-style slicing of the full table
        arr = csv.asarray()
        numpy.testing.assert_almost_equal(arr[:, 0:2], table[:, 0:2])
        numpy.testing.assert_almost_equal(arr[:50], table[:50])
        numpy.testing.assert_almost_equal(arr[0].squeeze(), table[0])

        # list of integer row indices
        rows = [0, 1, 2]
        numpy.testing.assert_almost_equal(arr[rows], table[rows])

        # random boolean row mask
        mask = numpy.random.choice([True, False], replace=True, size=len(csv))
        masked = csv[mask]
        numpy.testing.assert_almost_equal(masked['a'][:], table[mask, 0])

        # three slice axes are expected to be rejected
        with pytest.raises(IndexError):
            arr[:, :, :]
Esempio n. 7
0
def test_dask(comm):
    """Compare the computed result of ``get_dask('a')`` with the input.

    The column requested by name ``'a'`` must match the first column of
    the table written to the temporary file.
    """
    field_names = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as handle:

        # write a 100 x 5 random table and rewind
        written = numpy.random.random(size=(100, 5))
        numpy.savetxt(handle, written, fmt='%.7e')
        handle.seek(0)

        reader = CSVFile(path=handle.name, names=field_names, blocksize=100)

        first_column = reader.get_dask('a').compute()
        numpy.testing.assert_almost_equal(first_column, written[:, 0])
Esempio n. 8
0
def test_getitem(comm):
    """Check valid and invalid ``[]`` indexing on CSVFile and its array view.

    Invalid forms (empty list, re-selecting a name on a single column,
    unknown names, a three-axis slice) are expected to raise
    ``IndexError``; valid forms are compared against the written table.
    """
    field_names = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as handle:

        # write a 100 x 5 random table and rewind
        written = numpy.random.random(size=(100, 5))
        numpy.savetxt(handle, written, fmt='%.7e')
        handle.seek(0)

        reader = CSVFile(path=handle.name, names=field_names, blocksize=100)

        # --- selections that must be rejected ---
        with pytest.raises(IndexError):
            reader[[]]

        only_a = reader['a']
        with pytest.raises(IndexError):
            only_a['a']

        with pytest.raises(IndexError):
            reader[['BAD1', 'BAD2']]

        # --- column subsets ---
        subset_ab = reader[['a', 'b']]
        assert subset_ab.columns == ['a', 'b']
        subset_a = subset_ab[['a']]
        assert subset_a.columns == ['a']

        # --- array-style access ---
        view = reader.asarray()
        numpy.testing.assert_almost_equal(view[:, 0:2], written[:, 0:2])
        numpy.testing.assert_almost_equal(view[:50], written[:50])
        numpy.testing.assert_almost_equal(view[0].squeeze(), written[0])

        # integer list
        numpy.testing.assert_almost_equal(view[[0, 1, 2]], written[[0, 1, 2]])

        # boolean mask, one entry per row
        keep = numpy.random.choice([True, False], replace=True, size=len(reader))
        kept = reader[keep]
        numpy.testing.assert_almost_equal(kept['a'][:], written[keep, 0])

        # too many slice axes
        with pytest.raises(IndexError):
            view[:, :, :]
Esempio n. 9
0
def test_no_trailing_newline(comm):
    """Read a two-row file whose last line has no terminating newline.

    Both rows must be counted and their values returned.
    """
    columns = ['a', 'b', 'c', 'd']
    expected = numpy.array([[1, 1, 1, 1], [2, 2, 2, 2]], dtype='i4')

    with tempfile.NamedTemporaryFile() as tmp:

        # two rows; note there is no '\n' after the second one
        tmp.write(b"1 1 1 1\n2 2 2 2")
        tmp.seek(0)

        csv = CSVFile(path=tmp.name, names=columns, blocksize=1000)

        # the unterminated last row must still be counted and read
        assert csv.size == 2
        loaded = csv.asarray()
        numpy.testing.assert_almost_equal(expected, loaded[:], decimal=7)
Esempio n. 10
0
def test_no_trailing_newline(comm):
    """Check that a final row without a newline terminator is still read."""
    field_names = ['a', 'b', 'c', 'd']

    with tempfile.NamedTemporaryFile() as handle:

        # reference values for the two rows written below
        reference = numpy.array([[1, 1, 1, 1], [2, 2, 2, 2]], dtype='i4')

        # the second row deliberately ends at end-of-file, with no '\n'
        raw = "1 1 1 1\n2 2 2 2"
        handle.write(raw.encode())
        handle.seek(0)

        reader = CSVFile(path=handle.name, names=field_names, blocksize=1000)

        assert reader.size == 2
        contents = reader.asarray()
        numpy.testing.assert_almost_equal(reference, contents[:], decimal=7)
Esempio n. 11
0
def test_invalid_keywords(comm):
    """Check that each unsupported keyword makes CSVFile raise ValueError.

    The keywords are tried one at a time, in the order listed, against the
    same valid data file.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    # keyword -> value pairs that the constructor is expected to reject
    rejected = {
        'index_col': True,
        'header': True,
        'skiprows': [0, 1, 2],
        'skipfooter': True,
        'comment': '#'
    }

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats, default savetxt format
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table)
        tmp.seek(0)

        # try each bad keyword on its own
        for keyword in rejected:
            extra = {keyword: rejected[keyword]}
            with pytest.raises(ValueError):
                CSVFile(path=tmp.name, names=columns, blocksize=1000, **extra)
Esempio n. 12
0
def test_comma_sep(comm):
    """Check that comma-delimited data needs ``sep=','`` to be read.

    With ``delim_whitespace=True`` the constructor is expected to raise
    ``ValueError``; with ``delim_whitespace=False, sep=','`` the values
    must match what was written.
    """
    field_names = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as handle:

        # write a 100 x 5 random table using ',' as the field delimiter
        written = numpy.random.random(size=(100, 5))
        numpy.savetxt(handle, written, fmt='%.7e', delimiter=',')
        handle.seek(0)

        shared = dict(path=handle.name, names=field_names, blocksize=100)

        # wrong delimiter setting
        with pytest.raises(ValueError):
            CSVFile(delim_whitespace=True, **shared)

        # correct delimiter setting
        reader = CSVFile(delim_whitespace=False, sep=',', **shared)
        contents = reader.asarray()
        numpy.testing.assert_almost_equal(written, contents[:], decimal=7)
Esempio n. 13
0
def test_data(comm):
    """Round-trip a random table through CSVFile and check the columns.

    The array view must match the written values, and every requested
    column name must be reported as contained in the file object.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        # read the whole file
        csv = CSVFile(path=tmp.name, names=columns, blocksize=100)

        # values must match what was written
        loaded = csv.asarray()
        numpy.testing.assert_almost_equal(table, loaded[:], decimal=7)

        # membership test for every column name
        for col in columns:
            assert col in csv
Esempio n. 14
0
def test_data(comm):
    """Check that CSVFile returns the written values and all column names."""
    field_names = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as handle:

        # write a 100 x 5 random table and rewind
        written = numpy.random.random(size=(100, 5))
        numpy.savetxt(handle, written, fmt='%.7e')
        handle.seek(0)

        reader = CSVFile(path=handle.name, names=field_names, blocksize=100)

        # compare the full array view against the input
        contents = reader.asarray()
        numpy.testing.assert_almost_equal(written, contents[:], decimal=7)

        # every name passed in must be found via ``in``
        missing = [name for name in field_names if name not in reader]
        assert not missing
Esempio n. 15
0
def test_nrows(comm):
    """Check that ``nrows=50`` limits the read to the first 50 rows."""
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats, default savetxt format
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table)
        tmp.seek(0)

        csv = CSVFile(path=tmp.name, names=columns, blocksize=1000, nrows=50)

        # reported size must match both the data read and the request
        n_loaded = len(csv[:])
        assert csv.size == n_loaded, "mismatch between 'size' and data read from file"
        assert csv.size == 50, "error reading 'nrows'"

        # the rows returned must be the leading 50 rows of the table
        loaded = csv.asarray()
        numpy.testing.assert_almost_equal(table[:50], loaded[:], decimal=7)
Esempio n. 16
0
def test_asarray(comm):
    """Check that ``asarray()`` raises ValueError in two unsupported cases.

    The cases are a single-column selection and a file whose columns were
    opened with different dtypes ('f4' for 'a', 'f8' for 'b').
    """
    field_names = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as handle:

        # write a 100 x 5 random table and rewind
        written = numpy.random.random(size=(100, 5))
        numpy.savetxt(handle, written, fmt='%.7e')
        handle.seek(0)

        reader = CSVFile(path=handle.name,
                         names=field_names,
                         dtype={'a': 'f4', 'b': 'f8'},
                         blocksize=100)

        # case 1: selection of one column by name
        column_a = reader['a']
        with pytest.raises(ValueError):
            column_a.asarray()

        # case 2: whole file with non-uniform dtypes
        with pytest.raises(ValueError):
            reader.asarray()
Esempio n. 17
0
def test_wrong_names(comm):
    """Check that too few column names make CSVFile raise ValueError.

    The file has five columns but only three names are supplied.
    """
    too_few = ['a', 'b', 'c']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats, default savetxt format
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table)
        tmp.seek(0)

        # three names for five columns
        with pytest.raises(ValueError):
            CSVFile(path=tmp.name, names=too_few, blocksize=1000)
Esempio n. 18
0
def test_header_fail(comm):
    """Check that a file starting with a header line is rejected.

    A ``# a b c d e`` line is written before the data, and constructing
    the CSVFile is expected to raise ``ValueError``.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # header line first, then a 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        tmp.write(b"# a b c d e\n")
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        # the header line in the file must trigger the error
        with pytest.raises(ValueError):
            CSVFile(path=tmp.name, names=columns, blocksize=1000)
Esempio n. 19
0
def test_comments(comm):
    """Check that a file starting with ``#`` comment lines is rejected.

    Two comment lines are written before the data, and constructing the
    CSVFile is expected to raise ``ValueError``.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # two comment lines, then a 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        tmp.write(b"# comment line 1\n# comment line 2\n")
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        # construction must fail on this file
        with pytest.raises(ValueError):
            CSVFile(path=tmp.name, names=columns, blocksize=1000)
Esempio n. 20
0
def test_blank_lines(comm):
    """Check that blank lines in the file are not counted as data rows.

    Three empty lines are written ahead of a 100-row table. The file must
    still report a size of 100, and that size must agree with the number
    of rows actually read.

    The test previously wrote no blank lines at all, so it never exercised
    the case named in its title and assertion message.
    """
    with tempfile.NamedTemporaryFile() as ff:

        # generate data preceded by blank lines
        data = numpy.random.random(size=(100, 5))
        ff.write(("\n\n\n").encode())
        numpy.savetxt(ff, data, fmt='%.7e')
        ff.seek(0)

        # read
        names = ['a', 'b', 'c', 'd', 'e']
        f = CSVFile(path=ff.name, names=names, blocksize=100)

        # the right size: blank lines must not contribute rows
        assert f.size == len(
            f[:]), "mismatch between 'size' and data read from file"
        assert f.size == 100, "error reading with blank lines'"
Esempio n. 21
0
def test_pickle(comm):
    """Check that a CSVFile survives a pickle round trip.

    Column 'a' read from the restored object must match column 'a' read
    from the original object.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        original = CSVFile(path=tmp.name, names=columns, blocksize=100)

        # serialize and immediately restore (the payload is produced here,
        # so loading it is safe)
        restored = pickle.loads(pickle.dumps(original))

        numpy.testing.assert_almost_equal(original['a'][:], restored['a'][:])
Esempio n. 22
0
def test_usecols(comm):
    """Check that ``usecols`` restricts the columns reported by CSVFile."""
    columns = ['a', 'b', 'c', 'd', 'e']
    wanted = ['a', 'c', 'e']

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats, default savetxt format
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table)
        tmp.seek(0)

        # name all five columns but request only three of them
        csv = CSVFile(path=tmp.name,
                      names=columns,
                      blocksize=1000,
                      usecols=wanted)

        assert csv.columns == ['a', 'c', 'e'], "error using 'usecols'"
Esempio n. 23
0
def test_slicing(comm):
    """Check row slicing of CSVFile against numpy slicing of the input.

    Several slices (positive, negative, stepped, open-ended) are applied
    to both the file object and the written table; only column 'a' is
    compared.
    """
    columns = ['a', 'b', 'c', 'd', 'e']

    # row slices to try, in this order
    row_slices = (
        slice(0, 10),
        slice(-10, -5),
        slice(0, 50, 2),
        slice(-50, None, 3),
    )

    with tempfile.NamedTemporaryFile() as tmp:

        # 100 x 5 table of random floats
        table = numpy.random.random(size=(100, 5))
        numpy.savetxt(tmp, table, fmt='%.7e')
        tmp.seek(0)

        csv = CSVFile(path=tmp.name, names=columns, blocksize=1000)

        # make sure data is the same (check only the first column here)
        for rows in row_slices:
            expected = table[rows][:, 0]
            actual = csv[rows]['a']
            numpy.testing.assert_almost_equal(expected, actual, decimal=7)