def test05(self):
    """Testing `where()` iterator using `skip`"""
    src = np.arange(1, 11)
    carr = blz.barray(src)
    expected = [x for x in src if x <= 5][2:]
    result = list(carr.where(blz.barray(src <= 5), skip=2))
    self.assert_(expected == result, "where() does not work correctly")
def test06(self):
    """Testing `where()` iterator (using array bool in fancy indexing)"""
    src = np.arange(1, 110)
    carr = blz.barray(src, chunklen=10)
    mask = (src < 5) | (src > 9)
    assert_array_equal(src[mask], carr[blz.barray(mask)],
                       "where() does not work correctly")
def test03(self):
    """Testing `where()` iterator (using a boolean array)"""
    src = np.arange(1, 11)
    carr = blz.barray(src)
    expected = [x for x in src if x <= 5]
    result = list(carr.where(blz.barray(src <= 5)))
    self.assert_(expected == result, "where() does not work correctly")
def test06(self):
    """Testing `where()` iterator using `limit` and `skip`"""
    src = np.arange(1, 11)
    carr = blz.barray(src)
    expected = [x for x in src if x <= 5][1:4]
    result = list(carr.where(blz.barray(src <= 5), limit=3, skip=1))
    self.assert_(expected == result, "where() does not work correctly")
def test04(self):
    """Testing fancy indexing with __setitem__ (bool barray)"""
    src = np.arange(1, 1e2)
    carr = blz.barray(src, chunklen=10)
    mask = (src > 5) & (src < 40)
    # assign through a boolean barray selector and mirror it on the ndarray
    carr[blz.barray(mask)] = 3.
    src[mask] = 3.
    assert_array_equal(carr[:], src, "fancy indexing does not work correctly")
def test07(self):
    """Testing `where()` iterator using `limit` and `skip` (zeros)"""
    src = np.arange(10000)
    carr = blz.barray(src,)
    expected = [x for x in src if x <= 5000][1010:2020]
    result = list(carr.where(blz.barray(src <= 5000, chunklen=100),
                             limit=1010, skip=1010))
    self.assert_(expected == result, "where() does not work correctly")
def test01d(self):
    """Testing `__getitem()__` method with only a (large) start"""
    src = np.arange(1e4)
    carr = blz.barray(src, rootdir=self.rootdir)
    idx = -2  # second last element
    assert_array_equal(src[idx], carr[idx], "Arrays are not equal")
def test04(self):
    """Testing `iter()` method with large zero arrays"""
    src = np.zeros(1e4, dtype='f8')
    carr = blz.barray(src, chunklen=100, rootdir=self.rootdir)
    # rebuild from the barray iterator and compare against the source
    rebuilt = blz.fromiter((x for x in carr), dtype='f8', count=len(src))
    assert_array_equal(src, rebuilt[:], "iterator fails on zeros")
def test03a(self):
    """Testing `iter()` method with only step"""
    src = np.arange(101)
    carr = blz.barray(src, chunklen=2, rootdir=self.rootdir)
    self.assert_(sum(src[::4]) == sum(carr.iter(step=4)),
                 "Sums are not equal")
def test03d(self):
    """Testing `__getitem()__` method with ranges and steps (IV)"""
    src = np.arange(1e3)
    carr = blz.barray(src, chunklen=10, rootdir=self.rootdir)
    key = slice(4, 80, 3000)  # step larger than the range
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def test02b(self):
    """Testing `__getitem()__` method with ranges (negative start)"""
    src = np.arange(1e2)
    carr = blz.barray(src, chunklen=10, rootdir=self.rootdir)
    key = slice(-3)  # NOTE: slice(-3) is a negative *stop*, i.e. a[:-3]
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def open(persist, **kwargs):
    """Open an existing persistent array.

    Parameters
    ----------
    persist : a Storage instance
        The Storage instance specifies, among other things, URI of where
        the array is stored.
    kwargs : a dictionary
        Put here different parameters depending on the format.

    Returns
    -------
    out: a concrete blaze array.

    Raises
    ------
    ValueError
        If `persist.format` is not one of the supported formats.

    Notes
    -----
    Only BLZ, HDF5, CSV and JSON formats are supported currently.

    """
    persist = _persist_convert(persist)
    if persist.format == 'blz':
        d = blz.barray(rootdir=persist.path, **kwargs)
        dd = BLZDataDescriptor(d)
    elif persist.format == 'csv':
        dd = CSVDataDescriptor(persist.path, **kwargs)
    elif persist.format == 'json':
        dd = JSONDataDescriptor(persist.path, **kwargs)
    elif persist.format == 'hdf5':
        dd = HDF5DataDescriptor(persist.path, **kwargs)
    else:
        # BUG FIX: an unsupported format previously fell through and raised
        # a confusing NameError on `dd`; fail explicitly instead.
        raise ValueError("unsupported persistence format: %r"
                         % (persist.format,))
    return Array(dd)
def select_and_apply(self,
                     apply_func = lambda xx:np.dot(xx,np.ones(xx.shape[1]) / xx.shape[1]),
                     select_format = (None,1),
                     combine_fun = lambda xx:np.concatenate(xx,1),
                     limit_n_per_slice = 2000000):
    # Apply `apply_func` to successive slices of the stored barrays and
    # collect the per-slice results into one output barray.
    # NOTE(review): the default `apply_func` computes a row-wise mean via a
    # dot product with a uniform vector, which presumes a 2-d input of shape
    # (n, m) — TODO confirm against the output of select_all_barrays.
    selected_slices = self.generate_axis_dividing_slice_selectors(select_format=select_format,
                                                                  limit_n_per_slice=limit_n_per_slice)
    print "selected_slices = ",selected_slices
    # Seed the output barray with the first slice's result so the loop
    # below can grow it with append().
    one_selected_slices = selected_slices[0]
    combined_arr = self.select_all_barrays(one_selected_slices, combine_fun=combine_fun)
    output_barray = blz.barray(apply_func(combined_arr))
    for one_selected_slices in selected_slices[1:]:
        print "applying one_selected_slices = ",one_selected_slices
        combined_arr = self.select_all_barrays(one_selected_slices, combine_fun=combine_fun)
        output_barray.append(apply_func(combined_arr))
    return output_barray
def test03(self):
    """Testing copy() with no shuffle"""
    src = np.linspace(-1., 1., 1e4)
    carr = blz.barray(src, rootdir=self.rootdir)
    dup = carr.copy(bparams=blz.bparams(shuffle=False))
    # disabling shuffle should worsen the compression ratio
    self.assert_(carr.cbytes < dup.cbytes, "shuffle not changed")
def test02c(self):
    """Testing `iter()` method with positive start, negative stop"""
    src = np.arange(101)
    carr = blz.barray(src, chunklen=2, rootdir=self.rootdir)
    self.assert_(sum(src[24:-3]) == sum(carr.iter(24, -3)),
                 "Sums are not equal")
def test04a(self):
    """Testing `__getitem()__` method with long ranges"""
    src = np.arange(1e3)
    carr = blz.barray(src, chunklen=100, rootdir=self.rootdir)
    key = slice(1, 8000)  # stop far beyond the array length
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def test03b(self):
    """Testing `iter()` method with start, stop, step"""
    src = np.arange(101)
    carr = blz.barray(src, chunklen=2, rootdir=self.rootdir)
    self.assert_(sum(src[3:24:4]) == sum(carr.iter(3, 24, 4)),
                 "Sums are not equal")
def test04d(self):
    """Testing `__getitem()__` method with no start and no stop"""
    src = np.arange(1e3)
    carr = blz.barray(src, chunklen=100, rootdir=self.rootdir)
    key = slice(None, None, 2)
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def test05(self):
    """Testing `__getitem()__` method with negative steps"""
    src = np.arange(1e3)
    carr = blz.barray(src, chunklen=10, rootdir=self.rootdir)
    # negative strides are not supported by barray
    self.assertRaises(NotImplementedError, carr.__getitem__,
                      slice(None, None, -3))
def test02(self):
    """Testing copy() with lesser compression"""
    src = np.linspace(-1., 1., 1e4)
    carr = blz.barray(src, rootdir=self.rootdir)
    dup = carr.copy(bparams=blz.bparams(clevel=1))
    # a lower compression level should yield a bigger compressed size
    self.assert_(carr.cbytes < dup.cbytes, "clevel not changed")
def test02(self):
    """Testing fancy indexing (empty list)"""
    src = np.arange(101)
    carr = blz.barray(src)
    assert_array_equal(carr[[]], src[[]],
                       "fancy indexing does not work correctly")
def test00(self):
    """Testing fancy indexing (short list)"""
    src = np.arange(1, 111)
    carr = blz.barray(src)
    idx = [3, 1]  # out-of-order selection
    assert_array_equal(carr[idx], src[idx],
                       "fancy indexing does not work correctly")
def test03(self):
    """Testing fancy indexing (list of floats)"""
    src = np.arange(1, 101)
    carr = blz.barray(src)
    idx = [1.1, 3.3]  # float indices, expected to match numpy's behavior
    assert_array_equal(carr[idx], src[idx],
                       "fancy indexing does not work correctly")
def test01(self):
    """Testing fancy indexing (large list, numpy)"""
    src = np.arange(1, 1e4)
    carr = blz.barray(src)
    idx = np.random.randint(1000, size=1000)
    assert_array_equal(carr[idx], src[idx],
                       "fancy indexing does not work correctly")
def test02c(self):
    """Testing `append()` method (large chunklen III)"""
    src = np.arange(1000 * 1000)
    carr = blz.barray(src, chunklen=100 * 1000 - 1, rootdir=self.rootdir)
    carr.append(src)
    expected = np.concatenate((src, src))
    assert_array_equal(expected, carr[:], "Arrays are not equal")
def test01(self):
    """Testing __sizeof__() (big arrays)"""
    src = np.arange(2e5)
    carr = blz.barray(src, rootdir=self.rootdir)
    # a large monotonic array should compress below its raw size
    self.assert_(sys.getsizeof(carr) < carr.nbytes,
                 "barray does not seem to compress at all")
def test02(self):
    """Testing __sizeof__() (small arrays)"""
    src = np.arange(111)
    carr = blz.barray(src)
    # tiny arrays carry container overhead, so they exceed the raw size
    self.assert_(sys.getsizeof(carr) > carr.nbytes,
                 "barray compressed too much??")
def test07(self):
    """Testing `iter()` method with `limit` and `skip`"""
    src = np.arange(1e4, dtype='f8')
    carr = blz.barray(src, chunklen=100, rootdir=self.rootdir)
    rebuilt = blz.fromiter((x for x in carr.iter(limit=1010, skip=1010)),
                           dtype='f8', count=1010)
    assert_array_equal(src[1010:2020], rebuilt, "iterator fails on zeros")
def test00(self):
    """Testing unicode types (creation)"""
    src = np.array([[u"aŀle", u"eñe"], [u"açò", u"áèâë"]], dtype="U4")
    carr = blz.barray(src)
    # the barray dtype carries the inner dims; compare against its base
    self.assert_(src.dtype == carr.dtype.base)
    assert_array_equal(src, carr[:], "Arrays are not equal")
def test00(self):
    """Testing string types (creation)"""
    src = np.array([["ale", "ene"], ["aco", "ieie"]], dtype="S4")
    carr = blz.barray(src)
    # the barray dtype carries the inner dims; compare against its base
    self.assert_(src.dtype == carr.dtype.base)
    assert_array_equal(src, carr[:], "Arrays are not equal")
def testImplicitDtype(self):
    """Testing barray construction keeping dimensions (implicit dtype)"""
    src = np.eye(6)  # 2d input
    carr = blz.barray(src, rootdir=self.rootdir)
    if self.open:
        carr = blz.open(rootdir=self.rootdir)
    # array equality implies having the same shape
    assert_array_equal(src, carr, "Arrays are not equal")
def test01(self):
    """Testing unicode types (append)"""
    src = np.ones((300, 4), dtype="U4")
    # start from an empty (0, 4) barray and grow it
    carr = blz.barray([], dtype="U4").reshape((0, 4))
    carr.append(src)
    self.assert_(src.dtype == carr.dtype.base)
    assert_array_equal(src, carr[:], "Arrays are not equal")
def test_barray_record_as_object(self):
    """Record-dtype source stored with an explicit object dtype."""
    src_data = np.empty((10,), dtype=np.dtype('u1,O'))
    src_data[:] = [(i, 's' * i) for i in range(10)]
    carr = blz.barray(src_data, dtype=np.dtype('O'))
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    for i, rec in enumerate(src_data):
        self.assertEqual(carr[i][0], rec[0])
        self.assertEqual(carr[i][1], rec[1])
def test_barray_1d_source(self):
    """Testing barray of objects, 1d source"""
    src_data = ['s'*i for i in range(10)]
    carr = blz.barray(src_data, dtype=np.dtype('O'))
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    for i in range(len(carr)):
        # the original asserted this twice per iteration; the duplicate
        # added no coverage and has been removed
        self.assertEqual(carr[i], src_data[i])
def test00(self):
    """Testing sum()."""
    src = np.arange(1e5).reshape(10, 1e4)
    expected = src.sum()
    carr = blz.barray(src)
    result = carr.sum()
    # both the dtype and the value must match numpy's reduction
    self.assert_(expected.dtype == result.dtype,
                 "sum() is not working correctly.")
    self.assert_(expected == result, "sum() is not working correctly.")
def test04c(self):
    """Testing `__getitem()__` method with shape reduction (III)"""
    src = np.arange(6000).reshape((50, 40, 3))
    carr = blz.barray(src, rootdir=self.rootdir)
    if self.open:
        carr = blz.open(rootdir=self.rootdir)
    key = (1, slice(1, 4, 2), 2)  # scalar indexes drop two axes
    self.assert_(src[key].shape == carr[key].shape, "Shape is not equal")
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def testExplicitDtype(self):
    """Testing barray construction keeping dimensions (explicit dtype)"""
    dtype = np.dtype(np.float64)
    src = np.eye(6, dtype=dtype)
    carr = blz.barray(src, dtype=dtype, rootdir=self.rootdir)
    if self.open:
        carr = blz.open(rootdir=self.rootdir)
    # array equality implies having the same shape
    assert_array_equal(src, carr, "Arrays are not equal")
def test03c(self):
    """Testing `__getitem()__` method with several slices (III)"""
    src = np.arange(120 * 1000).reshape((5 * 1000, 4, 3, 2))
    carr = blz.barray(src, rootdir=self.rootdir)
    if self.open:
        carr = blz.open(rootdir=self.rootdir)
    key = (slice(None, None, 3), slice(1, 3, 2), slice(1, 4, 2))
    self.assert_(src[key].shape == carr[key].shape, "Shape is not equal")
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def test04c(self):
    """Testing `__setitem()__` method with shape reduction (III)"""
    src = np.arange(24).reshape((4, 3, 2))
    carr = blz.barray(src, rootdir=self.rootdir)
    key = (1, 2, slice(None, None, None))
    # mirror the assignment on both containers
    src[key] = 2
    carr[key] = 2
    if self.open:
        carr.flush()
        carr = blz.open(rootdir=self.rootdir)
    assert_array_equal(src[key], carr[key], "Arrays are not equal")
def test03d(self):
    """Testing `__setitem()__` method with several slices (IV)"""
    src = np.arange(120).reshape((5, 4, 3, 2))
    carr = blz.barray(src, rootdir=self.rootdir)
    key = (slice(1, 3), slice(1, 3, 1), slice(1, None, 2), slice(1))
    # mirror the assignment on both containers
    src[key] = 2
    carr[key] = 2
    if self.open:
        carr.flush()
        carr = blz.open(rootdir=self.rootdir)
    assert_array_equal(src[:], carr[:], "Arrays are not equal")
def __setitem__(self, key, value):
    """Store `value` under `key`, compressing ndarray values with blz.

    ndarray values are converted to a (possibly disk-backed) barray before
    being handed to the parent cache; other values pass through unchanged.
    """
    # Cache location must be configured as 'disk' or 'memory'.
    # NOTE(review): when `_config()` has no `caching` attribute this assert
    # fires on None — presumably that is the intended "misconfigured" signal.
    location = (_config().caching if hasattr(_config(), 'caching') else None)
    assert location in ('disk', 'memory')
    # Use a temp folder when caching on disk; rootdir=None keeps it in memory.
    save_folder = tempfile.mkdtemp() if location == 'disk' else None
    self.compress = lambda v: blz.barray(v, rootdir=save_folder)
    # BUG FIX: isinstance also catches ndarray subclasses, which the old
    # `type(value) == np.ndarray` comparison silently skipped.
    if isinstance(value, np.ndarray):
        value = self.compress(value)
    return super(CompressedCache, self).__setitem__(key, value)
def modelPredictor(modelsPath_modelIndex_dataPath_colNames_tuple): """ Input: A tuple, with following two attributes (with order): modelsPath: string, the path to the trained models. (pickle file) modelIndex: integer, the index of the model to predict. dataPath: string, the path to the data. colNames: a list of strings, column names of the output table. It should be like ["Id", "V1", ...] Output: A btable, consists of Id column, Predicted column and the data. Notes: modelPredictor will create following directories for you if they do not exist. 1. Model_No{modelIndex}_predicted_array: it will be under the dataPath. """ # Set up necessary constance. divideN = 300000 modelsPath, modelIndex, dataPath, colNames = modelsPath_modelIndex_dataPath_colNames_tuple def data_abspath(colname): return os.path.abspath(os.path.join(dataPath, colname)) with open(modelsPath, "rb") as rf: models = pickle.load(rf) model = models[modelIndex] del models # Read in data with btable. Id = blz.open(os.path.join(dataPath, colNames[0])) totalN = len(Id) if totalN % divideN == 0: nodes_list = [i * divideN for i in range(totalN / divideN + 1)] else: nodes_list = [i * divideN for i in range(totalN / divideN + 1)] + [totalN] nodes_pair_list = zip(nodes_list[:-1], nodes_list[1:]) # Prediction. y_predict = np.zeros(totalN) print "[Model No.{modelIndex}] Prediction process begins.".format(modelIndex = modelIndex) for begin, end in nodes_pair_list: print "[Model No.{modelIndex}] Processing {begin} ~ {end} observations.".format(modelIndex=modelIndex, begin = begin + 1, end = end) columns = [blz.open(os.path.join(dataPath, colname))[begin:end] for colname in colNames[1:]] X = np.column_stack(columns) temp = model.predict(X) y_predict[begin:end] = temp columns = [Id, blz.barray(y_predict)] data_rootdir = os.path.join(dataPath, "Model_No{modelIndex}_predicted_array".format(modelIndex = modelIndex)) if data_rootdir in os.listdir(dataPath): print "Removing Old result_table directory for new btable." 
command = "rm -rf " + data_rootdir os.system(command) final_table = blz.btable(columns = columns, names = ["Id", "Predict"], rootdir = data_rootdir) print "The result_table btable rootdir is under {path}".format(path=data_rootdir)
def test_barray_2d_source(self):
    """Testing barray of objects, 2d source

    Expected result will be a 1d barray whose elements are containers
    holding the inner dimension
    """
    src_data = [(i, 's'*i) for i in range(10)]
    carr = blz.barray(src_data, dtype=np.dtype('O'))
    # barray should always create a 1 dimensional array of objects
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    for i, rec in enumerate(src_data):
        self.assertEqual(carr[i][0], rec[0])
        self.assertEqual(carr[i][1], rec[1])
def test_barray_tuple_source(self):
    """Testing a barray of objects that are tuples

    This uses a numpy container as source. Tuples should be preserved
    """
    src_data = np.empty((10,), dtype=np.dtype('O'))
    src_data[:] = [(i, 's'*i) for i in range(src_data.shape[0])]
    carr = blz.barray(src_data)
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    # element type (tuple) must survive the round-trip
    self.assertEqual(type(carr[0]), tuple)
    self.assertEqual(type(carr[0]), type(src_data[0]))
    for i, rec in enumerate(src_data):
        self.assertEqual(carr[i][0], rec[0])
        self.assertEqual(carr[i][1], rec[1])
def _test_barray_record_inferred_opt2(self):
    """Testing barray handling of inferred record dtypes containing
    objects.

    When there is no explicit dtype in the barray constructor, the
    dtype becomes 'O', and the barrays behaves accordingly (one
    dimensional)
    """
    src_data = np.empty((10,), dtype=np.dtype('u1,O'))
    src_data[:] = [(i, 's'*i) for i in range(10)]
    # no explicit dtype: behaves as if created with dtype='O'
    carr = blz.barray(src_data)
    self.assertEqual(len(carr.shape), 1)
    self.assertEqual(len(src_data), carr.shape[0])
    for i, rec in enumerate(src_data):
        self.assertEqual(carr[i][0], rec[0])
        self.assertEqual(carr[i][1], rec[1])
## Benchmark to check the creation of an array of length > 2**32 (5e9) import blz from time import time t0 = time() #cn = blz.zeros(5e9, dtype="i1") cn = blz.zeros(5e9, dtype="i1", rootdir='ondisk_barray', mode='w') print "Creation time:", round(time() - t0, 3) assert len(cn) == int(5e9) t0 = time() cn = blz.barray(rootdir='ondisk_barray', mode='a') print "Re-open time:", round(time() - t0, 3) print "len(cn)", len(cn) assert len(cn) == int(5e9) # Now check some accesses cn[1] = 1 assert cn[1] == 1 cn[int(2e9)] = 2 assert cn[int(2e9)] == 2 cn[long(3e9)] = 3 assert cn[long(3e9)] == 3 cn[-1] = 4 assert cn[-1] == 4 t0 = time() assert cn.sum() == 10 print "Sum time:", round(time() - t0, 3)
def array(obj, dshape=None, ddesc=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.
    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.
    ddesc : data descriptor instance
        This comes with the necessary info for storing the data.  If
        None, a DyND_DDesc will be used.

    Returns
    -------
    out : a concrete blaze array.

    Raises
    ------
    ValueError
        If neither `obj` nor `ddesc` is given, if `obj` cannot be
        converted, or if incompatible `obj`/`ddesc` combinations are passed.

    """
    dshape = _normalize_dshape(dshape)

    # Coerce concrete (non-generator) data into the requested dshape.
    if ((obj is not None) and
        (not inspect.isgenerator(obj)) and
        (dshape is not None)):
        dt = ndt.type(str(dshape))
        if dt.ndim > 0:
            obj = nd.array(obj, type=dt, access='rw')
        else:
            obj = nd.array(obj, dtype=dt, access='rw')
    if obj is None and ddesc is None:
        raise ValueError('you need to specify at least `obj` or `ddesc`')

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, DDesc):
        if ddesc is None:
            ddesc = obj
            return Array(ddesc)
        else:
            raise ValueError(('you cannot specify `ddesc` when `obj` '
                              'is already a DDesc instance'))

    if ddesc is None:
        # Use a dynd ddesc by default.
        # BUG FIX: the bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception only.  Also renamed the local
        # so it no longer shadows this function's own name.
        try:
            arr = nd.asarray(obj, access='rw')
        except Exception:
            raise ValueError(('failed to construct a dynd array from '
                              'object %r') % obj)
        ddesc = DyND_DDesc(arr)
        return Array(ddesc)

    # The DDesc has been specified
    if isinstance(ddesc, DyND_DDesc):
        if obj is not None:
            raise ValueError(('you cannot specify simultaneously '
                              '`obj` and a DyND `ddesc`'))
        return Array(ddesc)
    elif isinstance(ddesc, BLZ_DDesc):
        if inspect.isgenerator(obj):
            dt = None if dshape is None else to_numpy_dtype(dshape)
            # TODO: Generator logic could go inside barray
            ddesc.blzarr = blz.fromiter(obj, dtype=dt, count=-1,
                                        rootdir=ddesc.path, mode=ddesc.mode,
                                        **ddesc.kwargs)
        else:
            if isinstance(obj, nd.array):
                obj = nd.as_numpy(obj)
            # Record measures become btables; everything else a barray.
            if dshape and isinstance(dshape.measure, datashape.Record):
                ddesc.blzarr = blz.btable(obj, rootdir=ddesc.path,
                                          mode=ddesc.mode, **ddesc.kwargs)
            else:
                ddesc.blzarr = blz.barray(obj, rootdir=ddesc.path,
                                          mode=ddesc.mode, **ddesc.kwargs)
    elif isinstance(ddesc, HDF5_DDesc):
        if isinstance(obj, nd.array):
            obj = nd.as_numpy(obj)
        with tb.open_file(ddesc.path, mode=ddesc.mode) as f:
            where, name = split_path(ddesc.datapath)
            if dshape and isinstance(dshape.measure, datashape.Record):
                # Convert the structured array to unaligned dtype
                # We need that because PyTables only accepts unaligned types,
                # which are the default in NumPy
                obj = np.array(obj, datashape.to_numpy_dtype(dshape.measure))
                f.create_table(where, name, filters=ddesc.filters, obj=obj)
            else:
                f.create_earray(where, name, filters=ddesc.filters, obj=obj)
        ddesc.mode = 'a'  # change into 'a'ppend mode for further operations
    return Array(ddesc)
# Benchmark: evaluate a boolean expression over barray-backed btable columns.
import numpy as np
import blz
from time import time

N = 1e8  # the number of elements in x
clevel = 5  # the compression level
sexpr = "(x-1) < 10."  # the expression to compute
#sexpr = "((x-1) % 1000) == 0."  # the expression to compute
#sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute

bparams = blz.bparams(clevel)

print "Creating inputs with %d elements..." % N
x = np.arange(N)
cx = blz.barray(x, bparams=bparams)
# Only build the extra y/z columns when the expression actually uses them.
if 'y' not in sexpr:
    ct = blz.btable((cx, ), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = blz.barray(y, bparams=bparams)
    cz = blz.barray(z, bparams=bparams)
    ct = blz.btable((cx, cy, cz), names=['x', 'y', 'z'])

print "Evaluating...", sexpr
t0 = time()
cbout = ct.eval(sexpr)
print "Time for evaluation--> %.3f" % (time() - t0, )

# Materialize the compressed result as a plain numpy array.
print "Converting to numy arrays"
bout = cbout[:]
# Benchmark: barray creation in memory vs on disk vs a plain numpy copy,
# plus the time to re-open the on-disk container.
import numpy as np
import blz
from time import time

N = 100 * 1000 * 1000  # number of elements
CLEVEL = 5             # compression level

a = np.linspace(0, 1, N)

# In-memory compressed barray.
t0 = time()
ac = blz.barray(a, bparams=blz.bparams(clevel=CLEVEL))
print "time creation (memory) ->", round(time()-t0, 3)
print "data (memory):", repr(ac)

# On-disk compressed barray (flush to make timing include the write).
t0 = time()
b = blz.barray(a, bparams=blz.bparams(clevel=CLEVEL),
               rootdir='myarray', mode='w')
b.flush()
print "time creation (disk) ->", round(time()-t0, 3)
#print "meta (disk):", b.read_meta()

# Baseline: an uncompressed numpy copy.
t0 = time()
an = np.array(a)
print "time creation (numpy) ->", round(time()-t0, 3)

# Re-open the on-disk container.
t0 = time()
c = blz.barray(rootdir='myarray')
print "time open (disk) ->", round(time()-t0, 3)
#print "meta (disk):", c.read_meta()
print "data (disk):", repr(c)
#print "cout-->", repr(cout) if __name__=="__main__": N = 1e8 # the number of elements in x clevel = 5 # the compression level sexpr = "(x+1)<0" sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0" #sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)" doprofile = True print("Creating inputs...") x = np.arange(N) #x = np.linspace(0,100,N) cx = blz.barray(x, bparams=blz.bparams(clevel)) print("Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N)))) t0 = time() cout = ne.evaluate(sexpr) print "Time for numexpr --> %.3f" % (time()-t0,) if doprofile: import pstats import cProfile as prof prof.run('compute_barray(sexpr, clevel=clevel, kernel="numexpr")', #prof.run('compute_barray(sexpr, clevel=clevel, kernel="python")', 'eval.prof') stats = pstats.Stats('eval.prof') stats.strip_dirs()
def append(data, clevel, cname):
    """Concatenate all chunks in `data` into one compressed barray.

    The first chunk seeds the container; the remaining chunks are appended
    using the given compression level and codec name.
    """
    params = blz.bparams(clevel, cname=cname)
    result = blz.barray(data[0], bparams=params)
    for chunk in data[1:]:
        result.append(chunk)
    return result
# Benchmark: sum() over a boolean array, numpy vs barray.
import numpy as np
import blz
from time import time

N = 1e8
#a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

# numpy baseline
t0 = time()
sa = a.sum()
print "Time sum() numpy --> %.3f" % (time() - t0)

# conversion cost into a max-compression barray
t0 = time()
ac = blz.barray(a, bparams=blz.bparams(9))
print "Time barry conv --> %.3f" % (time() - t0)
print "ac-->", ` ac `

# barray reduction
t0 = time()
sac = ac.sum()
#sac = ac.sum(dtype=np.dtype('i8'))
print "Time sum() barray --> %.3f" % (time() - t0)

# t0 = time()
# sac = sum(i for i in ac)
# print "Time sum() carray (iter) --> %.3f" % (time()-t0)

print "sa, sac-->", sa, sac, type(sa), type(sac)
# both paths must agree on the total count of True elements
assert (sa == sac)
# Benchmark (fragment): element-wise retrieval from an ndarray vs
# barray/btable containers.
import blz
from time import time

N = 1e7  # the number of elements in x
M = 100000  # the elements to get
clevel = 1  # the compression level

print "Creating inputs with %d elements..." % N
bparams = blz.bparams(clevel)
#x = np.arange(N)
# NOTE(review): `np` is not imported in this fragment — presumably imported
# earlier in the file.
x = np.zeros(N, dtype="f8")
y = x.copy()
z = x.copy()
cx = blz.barray(x, bparams=bparams)
cy = cx.copy()
cz = cx.copy()
ct = blz.btable((cx, cy, cz), names=['x', 'y', 'z'])
t = ct[:]

print "Starting benchmark now for getting %d elements..." % M
# Retrieve from a ndarray
t0 = time()
vals = [x[i] for i in xrange(0, M, 3)]
print "Time for array--> %.3f" % (time() - t0, )
print "vals-->", len(vals)

#blz.set_num_threads(blz.ncores//2)

# Retrieve from a barray
def array(obj, dshape=None, caps=None, storage=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.
    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.
    caps : capabilities dictionary
        A dictionary containing the desired capabilities of the array.
        Defaults to ``{'efficient-write': True}``.
    storage : Storage instance
        A Storage object with the necessary info for storing the data.

    Returns
    -------
    out : a concrete blaze array.

    Bugs
    ----
    Right now the explicit dshape is ignored. This needs to be
    corrected. When the data cannot be coerced to an explicit dshape
    an exception should be raised.

    """
    # BUG FIX: `caps` was a mutable default argument; use None + fill-in.
    if caps is None:
        caps = {'efficient-write': True}
    dshape = _normalize_dshape(dshape)
    storage = _storage_convert(storage)

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, IDataDescriptor):
        # TODO: Validate the 'caps', convert to another kind
        # of data descriptor if necessary
        # Note by Francesc: but if it is already an IDataDescriptor I wonder
        # if `caps` should be ignored. Hmm, probably not...
        #
        # Note by Oscar: Maybe we shouldn't accept a datadescriptor at
        # all at this level. If you've got a DataDescriptor you are
        # playing with internal datastructures anyways, go to the
        # Array constructor directly. If you want to transform to
        # another datadescriptor... convert it yourself (you are
        # playing with internal datastructures, remember? you should
        # be able to do it in your own.
        dd = obj
    elif storage is not None:
        dt = None if dshape is None else to_numpy_dtype(dshape)
        if inspect.isgenerator(obj):
            # TODO: Generator logic can go inside barray
            # BUG FIX: generators must go through `fromiter` — `barray` does
            # not take a `count` argument (matches the 'compress' branch).
            dd = BLZDataDescriptor(blz.fromiter(obj, dtype=dt, count=-1,
                                                rootdir=storage.path))
        else:
            dd = BLZDataDescriptor(
                blz.barray(obj, dtype=dt, rootdir=storage.path))
    elif 'efficient-write' in caps and caps['efficient-write'] is True:
        # In-Memory array
        if dshape is None:
            dd = DyNDDataDescriptor(nd.asarray(obj, access='rw'))
        else:
            # Use the uniform/full dtype specification in dynd depending
            # on whether the datashape has a uniform dim
            dt = ndt.type(str(dshape))
            if dt.ndim > 0:
                dd = DyNDDataDescriptor(nd.array(obj, type=dt, access='rw'))
            else:
                dd = DyNDDataDescriptor(nd.array(obj, dtype=dt, access='rw'))
    elif 'compress' in caps and caps['compress'] is True:
        dt = None if dshape is None else to_numpy_dtype(dshape)
        # BLZ provides compression
        if inspect.isgenerator(obj):
            # TODO: Generator logic can go inside barray
            dd = BLZDataDescriptor(blz.fromiter(obj, dtype=dt, count=-1))
        else:
            dd = BLZDataDescriptor(blz.barray(obj, dtype=dt))
    elif isinstance(obj, np.ndarray):
        dd = DyNDDataDescriptor(nd.view(obj))
    elif isinstance(obj, nd.array):
        dd = DyNDDataDescriptor(obj)
    elif isinstance(obj, blz.barray):
        dd = BLZDataDescriptor(obj)
    else:
        raise TypeError(('Failed to construct blaze array from '
                         'object of type %r') % type(obj))
    return Array(dd)
# Benchmark to compare times for iterators in generator contexts by # using barrays vs plain numpy arrays. import numpy as np import blz from time import time N = 1e7 a = np.arange(N) b = blz.barray(a) t0 = time() #sum1 = sum(a) sum1 = sum((v for v in a[2::3] if v < 10)) t1 = time()-t0 print "Summing using numpy iterator: %.3f" % t1 t0 = time() #sum2 = sum(b) sum2 = sum((v for v in b.iter(2, None, 3) if v < 10)) t2 = time()-t0 print "Summing using barray iterator: %.3f speedup: %.2f" % (t2, t1/t2) assert sum1 == sum2, "Summations are not equal!"
kernel, time() - t0, ), print ", cratio (out): %.1f" % (cout.nbytes / float(cout.cbytes)) #print "cout-->", repr(cout) if __name__ == "__main__": print "Creating inputs..." bparams = blz.bparams(clevel) y = x.copy() z = x.copy() cx = blz.barray(x, bparams=bparams) cy = blz.barray(y, bparams=bparams) cz = blz.barray(z, bparams=bparams) for sexpr in sexprs: print "Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N))) compute_ref(sexpr) for kernel in "python", "numexpr": compute_blz(sexpr, clevel=0, kernel=kernel) if doprofile: import pstats import cProfile as prof #prof.run('compute_blz(sexpr, clevel=clevel, kernel="numexpr")', prof.run( 'compute_blz(sexpr, clevel=0, kernel="numexpr")', #prof.run('compute_blz(sexpr, clevel=clevel, kernel="python")',