def test_json_date_parse(self):
    a = nd.parse_json('var * date', '["2012-03-17", "1922-12-30"]')
    self.assertEqual(nd.as_py(a), [date(2012, 3, 17), date(1922, 12, 30)])
    self.assertRaises(ValueError, nd.parse_json, 'var * date',
                      '["2012-03-17T17:00:15-0600", "1922-12-30 Thursday"]')
    a = nd.parse_json('var * date',
                      '["2012-06-17T17:00:15-0600", "1921-12-30 Thursday"]',
                      ectx=nd.eval_context(errmode='nocheck'))
    self.assertEqual(nd.as_py(a), [date(2012, 6, 17), date(1921, 12, 30)])
def test_struct(self):
    a = nd.parse_json('{x:int32, y:string, z:float32}',
                      '{"x":20, "y":"testing one two three", "z":-3.25}')
    self.assertEqual(nd.type_of(a), ndt.type('{x:int32, y:string, z:float32}'))
    self.assertEqual(nd.type_of(a[...]), ndt.type('{x:int32, y:string, z:float32}'))
    self.assertEqual(nd.type_of(a[0]), ndt.int32)
    self.assertEqual(nd.type_of(a[1]), ndt.string)
    self.assertEqual(nd.type_of(a[2]), ndt.float32)
    self.assertEqual(nd.type_of(a[-3]), ndt.int32)
    self.assertEqual(nd.type_of(a[-2]), ndt.string)
    self.assertEqual(nd.type_of(a[-1]), ndt.float32)
    self.assertEqual(nd.type_of(a[1:]),
                     ndt.make_struct([ndt.string, ndt.float32], ['y', 'z']))
    self.assertEqual(nd.type_of(a[::-2]),
                     ndt.make_struct([ndt.float32, ndt.int32], ['z', 'x']))
    self.assertEqual(nd.as_py(a[0]), 20)
    self.assertEqual(nd.as_py(a[1]), "testing one two three")
    self.assertEqual(nd.as_py(a[2]), -3.25)
    self.assertEqual(nd.as_py(a[1:]),
                     {'y': 'testing one two three', 'z': -3.25})
    self.assertEqual(nd.as_py(a[::-2]), {'x': 20, 'z': -3.25})
def dynd_arr(self):
    """Downloads the data and returns a local in-memory nd.array"""
    from ..io.client import requests
    # TODO: Need binary serialization
    j = requests.get_remote_json(self.url)
    tp = ndt.type(str(self.dshape))
    return nd.parse_json(tp, j)
def _arr_cache(self):
    if self._cache_arr is not None:
        return self._cache_arr
    with open(self.path, mode=self.mode) as jsonfile:
        # This will read everything in-memory (but a memmap approach
        # is in the works)
        self._cache_arr = nd.parse_json(self.schema, jsonfile.read())
    return self._cache_arr
def _arr_cache(self):
    if self._cache_arr is not None:
        return self._cache_arr
    with open(self.filename) as jsonfile:
        # This will read everything in-memory (but a memmap approach
        # is in the works)
        self._cache_arr = nd.parse_json(self.schema, jsonfile.read())
    return self._cache_arr
def _iterchunks(self, blen=100):
    f = self.open(self.path)
    for chunk in partition_all(blen, f):
        text = "[" + ",\r\n".join(chunk) + "]"
        dshape = str(len(chunk)) + " * " + self.schema
        yield nd.parse_json(dshape, text)
    try:
        f.close()
    except AttributeError:
        pass
def _chunks(self, blen=100):
    f = self.open(self.path)
    for chunk in partition_all(blen, f):
        text = '[' + ',\r\n'.join(chunk) + ']'
        dshape = str(len(chunk) * self.schema)
        yield nd.parse_json(dshape, text)
    try:
        f.close()
    except AttributeError:
        pass
def load_json_directory_array(root, array_name):
    # Load the datashape
    dsfile = root + ".datashape"
    if not path.isfile(dsfile):
        raise Exception("No datashape file found for array %s" % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())
    # Scan for JSON files, assuming they're just #.json
    # Sort them numerically
    files = sorted([(int(path.splitext(path.basename(x))[0]), x)
                    for x in glob.glob(path.join(root, "*.json"))])
    files = [x[1] for x in files]
    # Make an array with an extra fixed dimension, then
    # read a JSON file into each element of that array
    dt = ndt.make_fixed_dim(len(files), dt)
    arr = nd.empty(dt)
    for i, fname in enumerate(files):
        nd.parse_json(arr[i], nd.memmap(fname))
    arr.flag_as_immutable()
    return array(arr)
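# Usage sketch for load_json_directory_array, assuming it is importable from
# this catalog module. The layout below mirrors what the loader scans for (a
# sibling "<root>.datashape" file plus numerically named "#.json" files inside
# the directory); all file names and contents here are hypothetical.
#
#   example.datashape   -> "2 * {x: int32, y: string}"
#   example/0.json      -> [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}]
#   example/1.json      -> [{"x": 3, "y": "c"}, {"x": 4, "y": "d"}]
#
# arr = load_json_directory_array("example", "example")
# print(arr.dshape)     # an extra fixed dimension (one per file) is prepended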
def _arr_cache(self):
    if self._cache_arr is not None:
        return self._cache_arr
    jsonfile = self.open(self.path)
    # This will read everything in-memory (but a memmap approach
    # is in the works)
    self._cache_arr = nd.parse_json(str(self.dshape), jsonfile.read())
    try:
        jsonfile.close()
    except AttributeError:
        pass
    return self._cache_arr
def _arr_cache(self):
    if self._cache_arr is not None:
        return self._cache_arr
    jsonfile = self.open(self.path)
    # This will read everything in-memory (but a memmap approach
    # is in the works)
    text = '[' + ', '.join(jsonfile) + ']'
    try:
        jsonfile.close()
    except AttributeError:
        pass
    self._cache_arr = nd.parse_json(str(self.dshape), text)
    return self._cache_arr
def load_json_file_array(root, array_name):
    # Load the datashape
    dsfile = root + ".datashape"
    if not path.isfile(dsfile):
        dsfile = path.dirname(root) + ".datashape"
        if not path.isfile(dsfile):
            raise Exception("No datashape file found for array %s" % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())
    # Load the JSON
    # TODO: Add stream support to parse_json for compressed JSON, etc.
    arr = nd.parse_json(dt, nd.memmap(root + ".json"))
    return array(arr)
def load_json_file_array(root, array_name):
    # Load the datashape
    dsfile = root + '.datashape'
    if not path.isfile(dsfile):
        dsfile = path.dirname(root) + '.datashape'
        if not path.isfile(dsfile):
            raise Exception('No datashape file found for array %s' % array_name)
    with open(dsfile) as f:
        dt = nd.dtype(f.read())
    # Load the JSON
    with open(root + '.json') as f:
        # TODO: Add stream support to parse_json for compressed JSON, etc.
        arr = nd.parse_json(dt, f.read())
    return arr
def load_json_file_list_array(root, array_name):
    # Load the datashape
    dsfile = root + '.datashape'
    if not path.isfile(dsfile):
        raise Exception('No datashape file found for array %s' % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())
    # Scan for JSON files -- no assumption on file suffix
    # Open the list of files and load it into a Python list
    files = root + '.files'
    with open(files) as f:
        l_files = [fs.strip() for fs in f]
    # Make an array with an extra fixed dimension, then
    # read a JSON file into each element of that array
    dt = ndt.make_fixed_dim(len(l_files), dt)
    arr = nd.empty(dt)
    for i, fname in enumerate(l_files):
        with open(fname) as f:
            nd.parse_json(arr[i], f.read())
    arr.flag_as_immutable()
    return array(arr)
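# Usage sketch for load_json_file_list_array (hypothetical file names, assuming
# the conventions read above): "<root>.datashape" holds the per-file datashape
# and "<root>.files" lists one JSON file path per line.
#
#   series.datashape    -> "var * {t: float64}"
#   series.files        -> part0.json
#                          part1.json
#
# arr = load_json_file_list_array("series", "series")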
def test_simple_computed_column(self):
    def computed_col(dst, src):
        for d, s in zip(dst, src):
            d.fullname = nd.as_py(s.firstname) + ' ' + nd.as_py(s.lastname)
            d.firstname = s.firstname
            d.lastname = s.lastname
            d.country = s.country
    a = nd.parse_json('2 * {firstname: string, lastname: string, country: string}',
                      """[{"firstname":"Mike", "lastname":"Myers", "country":"Canada"},
                          {"firstname":"Seth", "lastname":"Green", "country":"USA"}]""")
    b = nd.elwise_map([a], computed_col, ndt.type(
        '{fullname: string, firstname: string, lastname: string, country: string}'))
    self.assertEqual(nd.as_py(b.fullname), ['Mike Myers', 'Seth Green'])
    self.assertEqual(nd.as_py(b.firstname), ['Mike', 'Seth'])
    self.assertEqual(nd.as_py(b.lastname), ['Myers', 'Green'])
    self.assertEqual(nd.as_py(b.country), ['Canada', 'USA'])
def test_struct(self):
    a = nd.parse_json('{x:int32, y:string, z:float32}',
                      '{"x":20, "y":"testing one two three", "z":-3.25}')
    self.assertEqual(nd.type_of(a), ndt.type('{x:int32, y:string, z:float32}'))
    self.assertEqual(nd.type_of(a[...]), ndt.type('{x:int32, y:string, z:float32}'))
    self.assertEqual(nd.type_of(a[0]), ndt.int32)
    self.assertEqual(nd.type_of(a[1]), ndt.string)
    self.assertEqual(nd.type_of(a[2]), ndt.float32)
    self.assertEqual(nd.type_of(a[-3]), ndt.int32)
    self.assertEqual(nd.type_of(a[-2]), ndt.string)
    self.assertEqual(nd.type_of(a[-1]), ndt.float32)
    self.assertEqual(nd.type_of(a[1:]),
                     ndt.make_struct([ndt.string, ndt.float32], ['y', 'z']))
    self.assertEqual(nd.type_of(a[::-2]),
                     ndt.make_struct([ndt.float32, ndt.int32], ['z', 'x']))
    self.assertEqual(nd.as_py(a[0]), 20)
    self.assertEqual(nd.as_py(a[1]), "testing one two three")
    self.assertEqual(nd.as_py(a[2]), -3.25)
    self.assertEqual(nd.as_py(a[1:]), {'y': 'testing one two three', 'z': -3.25})
    self.assertEqual(nd.as_py(a[::-2]), {'x': 20, 'z': -3.25})
def load_blaze_array(conf, dir):
    """Loads a blaze array from the catalog configuration and catalog path"""
    # This is a temporary hack, need to transition to using the
    # deferred data descriptors for various formats.
    fsdir = conf.get_fsdir(dir)
    if not path.isfile(fsdir + '.array'):
        raise RuntimeError('Could not find blaze array description file %r'
                           % (fsdir + '.array'))
    with open(fsdir + '.array') as f:
        arrmeta = yaml.load(f)
    tp = arrmeta['type']
    imp = arrmeta['import']
    ds_str = arrmeta.get('datashape')  # optional. HDF5 does not need that.
    if tp == 'csv':
        with open(fsdir + '.csv', 'r') as f:
            rd = csv.reader(f)
            if imp.get('headers', False):
                # Skip the header line
                next(rd)
            dat = list(rd)
        arr = nd.array(dat, ndt.type(ds_str))[:]
        return blaze.array(arr)
    elif tp == 'json':
        arr = nd.parse_json(ds_str, nd.memmap(fsdir + '.json'))
        return blaze.array(arr)
    elif tp == 'hdf5':
        import tables as tb
        from blaze.datadescriptor import HDF5DataDescriptor
        fname = fsdir + '.h5'   # XXX .h5 assumed for HDF5
        with tb.open_file(fname, 'r') as f:
            dp = imp.get('datapath')  # specifies a path in HDF5
            try:
                dparr = f.get_node(f.root, dp, 'Leaf')
            except tb.NoSuchNodeError:
                raise RuntimeError('HDF5 file does not have a dataset in %r'
                                   % dp)
            dd = HDF5DataDescriptor(fname, dp)
        return blaze.array(dd)
    elif tp == 'npy':
        import numpy as np
        use_memmap = imp.get('memmap', False)
        if use_memmap:
            arr = np.load(fsdir + '.npy', 'r')
        else:
            arr = np.load(fsdir + '.npy')
        arr = nd.array(arr)
        arr = blaze.array(arr)
        ds = datashape.dshape(ds_str)
        if not compatible_array_dshape(arr, ds):
            raise RuntimeError(('NPY file for blaze catalog path %r '
                                'has the wrong datashape (%r instead of '
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    elif tp == 'py':
        ds = datashape.dshape(ds_str)
        # The script is run with the following globals,
        # and should put the loaded array in a global
        # called 'result'.
        gbl = {'catconf': conf,   # Catalog configuration object
               'impdata': imp,    # Import data from the .array file
               'catpath': dir,    # Catalog path
               'fspath': fsdir,   # Equivalent filesystem path
               'dshape': ds       # Datashape the result should have
               }
        if py2help.PY2:
            execfile(fsdir + '.py', gbl, gbl)
        else:
            with open(fsdir + '.py') as f:
                code = compile(f.read(), fsdir + '.py', 'exec')
                exec(code, gbl, gbl)
        arr = gbl.get('result', None)
        if arr is None:
            raise RuntimeError(('Script for blaze catalog path %r did not '
                                'return anything in "result" variable')
                               % (dir,))
        elif not isinstance(arr, blaze.Array):
            raise RuntimeError(('Script for blaze catalog path %r returned '
                                'wrong type of object (%r instead of '
                                'blaze.Array)') % (dir, type(arr)))
        if not compatible_array_dshape(arr, ds):
            raise RuntimeError(('Script for blaze catalog path %r returned '
                                'array with wrong datashape (%r instead of '
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    else:
        raise ValueError(('Unsupported array type %r from '
                          'blaze catalog entry %r') % (tp, dir))
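# For reference, a hypothetical "<fsdir>.array" description file matching the
# YAML keys this loader reads ('type', 'import', 'datashape'); the exact
# contents are an illustrative assumption, not taken from a real catalog.
#
#   type: json
#   import: {}
#   datashape: "var * {x: int32, y: string}"
#
# blaze_arr = load_blaze_array(conf, '/mydata')   # 'conf' is a catalog config object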
def load_blaze_array(conf, dir):
    """Loads a blaze array from the catalog configuration and catalog path"""
    # This is a temporary hack, need to transition to using the
    # deferred data descriptors for various formats.
    fsdir = conf.get_fsdir(dir)
    if not path.isfile(fsdir + '.array'):
        raise RuntimeError('Could not find blaze array description file %r'
                           % (fsdir + '.array'))
    with open(fsdir + '.array') as f:
        arrmeta = yaml.load(f)
    tp = arrmeta['type']
    imp = arrmeta['import']
    ds_str = arrmeta.get('datashape')  # optional. HDF5 does not need that.
    if tp == 'csv':
        with open(fsdir + '.csv', 'r') as f:
            rd = csv.reader(f)
            if imp.get('headers', False):
                # Skip the header line
                next(rd)
            dat = list(rd)
        arr = nd.array(dat, ndt.type(ds_str))[:]
        return blaze.array(arr)
    elif tp == 'json':
        arr = nd.parse_json(ds_str, nd.memmap(fsdir + '.json'))
        return blaze.array(arr)
    elif tp == 'hdf5':
        import tables as tb
        from blaze.datadescriptor import HDF5_DDesc
        fname = fsdir + '.h5'   # XXX .h5 assumed for HDF5
        with tb.open_file(fname, 'r') as f:
            dp = imp.get('datapath')  # specifies a path in HDF5
            try:
                dparr = f.get_node(f.root, dp, 'Leaf')
            except tb.NoSuchNodeError:
                raise RuntimeError(
                    'HDF5 file does not have a dataset in %r' % dp)
            dd = HDF5_DDesc(fname, dp)
        return blaze.array(dd)
    elif tp == 'npy':
        import numpy as np
        use_memmap = imp.get('memmap', False)
        if use_memmap:
            arr = np.load(fsdir + '.npy', 'r')
        else:
            arr = np.load(fsdir + '.npy')
        arr = nd.array(arr)
        arr = blaze.array(arr)
        ds = datashape.dshape(ds_str)
        if not matches_datashape_pattern(arr.dshape, ds):
            raise RuntimeError(('NPY file for blaze catalog path %r '
                                'has the wrong datashape (%r instead of '
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    elif tp == 'py':
        ds = datashape.dshape(ds_str)
        # The script is run with the following globals,
        # and should put the loaded array in a global
        # called 'result'.
        gbl = {'catconf': conf,   # Catalog configuration object
               'impdata': imp,    # Import data from the .array file
               'catpath': dir,    # Catalog path
               'fspath': fsdir,   # Equivalent filesystem path
               'dshape': ds       # Datashape the result should have
               }
        if py2help.PY2:
            execfile(fsdir + '.py', gbl, gbl)
        else:
            with open(fsdir + '.py') as f:
                code = compile(f.read(), fsdir + '.py', 'exec')
                exec(code, gbl, gbl)
        arr = gbl.get('result', None)
        if arr is None:
            raise RuntimeError(('Script for blaze catalog path %r did not '
                                'return anything in "result" variable')
                               % (dir,))
        elif not isinstance(arr, blaze.Array):
            raise RuntimeError(('Script for blaze catalog path %r returned '
                                'wrong type of object (%r instead of '
                                'blaze.Array)') % (dir, type(arr)))
        if not matches_datashape_pattern(arr.dshape, ds):
            raise RuntimeError(('Script for blaze catalog path %r returned '
                                'array with wrong datashape (%r instead of '
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    else:
        raise ValueError(('Unsupported array type %r from '
                          'blaze catalog entry %r') % (tp, dir))
def get_dynd(self):
    """Downloads the data and returns a local in-memory nd.array"""
    j = requests.get_remote_json(self.url)
    tp = ndt.type(str(self.dshape))
    return nd.parse_json(tp, j)
def test_basic(self):
    dd = JSON(self.filename, 'r', dshape=self.dshape)
    self.assertEqual(list(dd),
                     [nd.as_py(nd.parse_json(self.dshape, json.dumps(self.data)))])
def _chunks(self, blen=100):
    with self.open(self.path) as f:
        for chunk in partition_all(blen, f):
            text = '[' + ',\r\n'.join(chunk) + ']'
            dshape = str(len(chunk) * self.schema)
            yield nd.parse_json(dshape, text)
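# Illustration of the chunked dshape construction used above, assuming 'schema'
# is a datashape object: multiplying an integer by a datashape prepends a fixed
# dimension, so a chunk of N input lines is parsed as "N * <schema>".
#
#   from datashape import dshape
#   schema = dshape('{x: int32, y: string}')
#   str(3 * schema)     # -> roughly '3 * {x: int32, y: string}'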
def get_dynd(self):
    """Downloads the data and returns a local in-memory nd.array"""
    j = requests.get_remote_json(self.url)
    return nd.parse_json(self.dtype, j)