def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
    # Suppress the warning until the next version
    import warnings
    #from flask.exthook import ExtDeprecationWarning
    #warnings.simplefilter('ignore', ExtDeprecationWarning)
    warnings.simplefilter('ignore', FutureWarning)
    import blaze as blz
    if type is None:
        type = self.type
    if dbname is None:
        dbname = self.name
    if df is None:
        # return the dataframe if it exists
        try:
            df = bcz.open(
                os.path.expanduser(
                    os.path.join(cf.options.basedir, 'databases',
                                 "{}.{}.{}".format(type, dbname, tblname))))
        except IOError:
            return None
        else:
            if len(df) == 0:
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            else:
                if blaze:
                    df = blz.data(df)
                else:
                    df = df.todataframe()
            if not blaze and 'idx' in df.columns.values:
                df.set_index('idx', drop=True, inplace=True)
                df.index.name = None
            return df
    else:
        if not (df.index.dtype_str == 'int64') and not (df.empty):
            df = df.copy()
            df['idx'] = df.index
        if isinstance(df, pd.DataFrame):
            path = os.path.expanduser(
                os.path.join(cf.options.basedir, 'databases',
                             "{}.{}.{}".format(type, dbname, tblname)))
            if df.empty:
                bcz.fromiter((), dtype=np.int32, mode='w', count=0,
                             rootdir=path)
            else:
                bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        if 'idx' in df.columns.values:
            del df
        return
def test06(self):
    """Testing `fetchwhere` method off of a timestamp (pd.datetime64)"""
    N = self.N
    query_idx = np.random.randint(0, self.N)
    t = bcolz.fromiter(((i, np.datetime64('2018-03-01') + i)
                        for i in range(N)),
                       dtype="i4,M8[D]", count=N)
    threshold = t[query_idx][1]
    result = t.fetchwhere('(f1 > threshold)',
                          user_dict={'threshold': threshold})
    t_fin = bcolz.fromiter(((i + query_idx, threshold + i)
                            for i in range(1, N - query_idx)),
                           dtype="i4,M8[D]", count=N)
    np.testing.assert_array_equal(result[:], t_fin[:])
def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
    ''' This is the access point to the bcolz database '''
    if type is None:
        type = self._m80_type
    if dbname is None:
        dbname = self._m80_name
    if df is None:
        # return the dataframe if it exists
        try:
            df = bcz.open(
                os.path.expanduser(
                    os.path.join(cf.options.basedir, 'databases',
                                 "{}.{}.{}".format(type, dbname, tblname))))
        except IOError:
            return None
        else:
            if len(df) == 0:
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            else:
                if blaze:
                    df = blz.data(df)
                else:
                    df = df.todataframe()
            if not blaze and 'idx' in df.columns.values:
                df.set_index('idx', drop=True, inplace=True)
                df.index.name = None
            return df
    else:
        if not (df.index.dtype_str == 'int64') and not (df.empty):
            df = df.copy()
            df['idx'] = df.index
        if isinstance(df, pd.DataFrame):
            path = os.path.expanduser(
                os.path.join(cf.options.basedir, 'databases',
                             "{}.{}.{}".format(type, dbname, tblname)))
            if df.empty:
                bcz.fromiter((), dtype=np.int32, mode='w', count=0,
                             rootdir=path)
            else:
                bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        if 'idx' in df.columns.values:
            del df
        return
def shards(bcolz_dir, taxi_df):
    single_bcolz = str(bcolz_dir.join('yellow_tripdata_2016-01.bcolz'))
    ct = ctable.fromdataframe(taxi_df, rootdir=single_bcolz)
    step, remainder = divmod(len(ct), NR_SHARDS)
    count = 0
    shards = [single_bcolz]
    for idx in range(0, len(ct), step):
        print("Creating shard {}".format(count + 1))
        if idx == len(ct) * (NR_SHARDS - 1):
            step = step + remainder
        shard_file = str(bcolz_dir.join('tripdata_2016-01-%s.bcolzs' % count))
        ct_shard = bcolz.fromiter(ct.iter(idx, idx + step), ct.dtype, step,
                                  rootdir=shard_file, mode='w')
        shards.append(shard_file)
        ct_shard.flush()
        count += 1
    yield shards
def getobject(self):
    if self.flavor == 'carray':
        obj = bcolz.zeros(10, dtype="i1", rootdir=self.rootdir)
        assert type(obj) == bcolz.carray
    elif self.flavor == 'ctable':
        obj = bcolz.fromiter(((i, i * 2) for i in range(10)), dtype='i2,f4',
                             count=10, rootdir=self.rootdir)
        assert type(obj) == bcolz.ctable
    return obj
def on_disk_data_cleaner(generator):
    rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(rootdir)  # folder should be empty
    ct = bz.fromiter(generator, dtype='i4,i4', count=N, rootdir=rootdir)
    ct = bq.open(rootdir)
    # print ct
    ct.flush()
    ct = bq.open(rootdir)
    yield ct
    shutil.rmtree(rootdir)
def test_ctable(clevel):
    enter()
    tc = bcolz.fromiter(
        (mv + np.random.rand(NC) - mv for i in xrange(int(NR))),
        dtype=dt, cparams=bcolz.cparams(clevel, cname=cname), count=int(NR))
    after_create()

    out = np.fromiter((row for row in tc.where(squery, 'f1,f3')),
                      dtype="f8,f8")
    after_query()
    return out
def floats_to_bcolz(input_dir, output_dir, progress=False, **kwargs):
    """Convert MITgcm float data to bcolz format.

    Parameters
    ----------
    input_dir : path
        Where to find the MITgcm output data
    output_dir : path
        Where to put the bcolz data store (equivalent to bcolz rootdir)
    kwargs :
        Extra keyword arguments to pass to
        floater.input_formats.MITgcmFloatData
    """
    import bcolz

    output_dir = _maybe_add_suffix(output_dir, '.bcolz')
    mfd = input.MITgcmFloatData(input_dir, cast_to_dtype='f4', **kwargs)
    # it does NOT WORK to typecast at this point; values get all mangled
    #output_dtype = _convert_dtype(mfd.out_dtype, 'f4')
    ct = bcolz.fromiter(mfd.generator(progress=progress), dtype=mfd.out_dtype,
                        count=int(mfd.nrecs), mode='w', rootdir=output_dir)
    return ct
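# Hedged usage sketch for floats_to_bcolz above, assuming the floater package
# is installed; both paths are hypothetical, and _maybe_add_suffix presumably
# appends '.bcolz' to the output directory when it is missing.
ct = floats_to_bcolz('./mitgcm_run/float_output', './float_trajectories',
                     progress=True)
print(len(ct), ct.dtype)  # number of float records and their compound dtype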
# Benchmark for evaluating the best ways to convert from a pandas dataframe
# (version with a mix of columns of ints and strings)

import bcolz
import pandas as pd
import numpy as np
from time import time

NR = int(1e6)
NC = 100
#bcolz.cparams.setdefaults(clevel=0)

print("Creating inputs...")
a = bcolz.arange(NR, dtype='i4')
s = bcolz.fromiter(("%d" % i for i in xrange(NR)), dtype='S7', count=NR)
df = pd.DataFrame.from_items(
    (('f%d' % i, a[:] if i < (NC // 2) else s[:]) for i in range(NC)))
dsize = (NR * (NC // 2) * (a.dtype.itemsize + s.dtype.itemsize)) / 2. ** 20

print("Performing benchmarks...")

# # Using an iterator (will get objects)
# t0 = time()
# names = list(df.columns.values)
# t = bcolz.ctable([df[key] for key in names], names)
# tt = time() - t0
# print("time with constructor: %.2f (%.2f MB/s)" % (tt, dsize / tt))
# print(repr(t.dtype))

# Using generic implementation
a = bcolz.arange(NR, dtype='i4')
#ra = np.rec.fromarrays([a]*NC, names=['f%d'%i for i in range(NC)])
ra = bcolz.ctable((a,) * NC)[:]

t0 = time()
f = tb.open_file(filepath, "w")
f.create_table(f.root, nodepath[1:], ra)
f.close()
tt = time() - t0
print("time for storing the HDF5 table: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using an iterator
t0 = time()
f = tb.open_file(filepath)
t = f.get_node(nodepath)
t = bcolz.fromiter((r[:] for r in t), dtype=t.dtype, count=len(t))
f.close()
tt = time() - t0
print("time with fromiter: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using blocked read
t0 = time()
f = tb.open_file(filepath)
t = f.get_node(nodepath)
names = t.colnames
dtypes = [dt[0] for dt in t.dtype.fields.values()]
cols = [np.zeros(0, dtype=dt) for dt in dtypes]
ct = bcolz.ctable(cols, names)
bs = t._v_chunkshape[0]
for i in xrange(0, len(t), bs):
    ct.append(t[i:i + bs])
def handle_work(self, msg):
    if msg.isa('execute_code'):
        return self.execute_code(msg)
    tmp_dir = tempfile.mkdtemp(prefix='result_')
    buf_file_fd, buf_file = tempfile.mkstemp(prefix='tar_')
    os.close(buf_file_fd)

    args, kwargs = msg.get_args_kwargs()
    self.logger.info('doing calc %s' % args)
    filename = args[0]
    groupby_col_list = args[1]
    aggregation_list = args[2]
    where_terms_list = args[3]
    expand_filter_column = kwargs.get('expand_filter_column')
    aggregate = kwargs.get('aggregate', True)

    # create rootdir
    rootdir = os.path.join(self.data_dir, filename)
    if not os.path.exists(rootdir):
        raise Exception('Path %s does not exist' % rootdir)

    ct = bquery.ctable(rootdir=rootdir, mode='r', auto_cache=True)

    # prepare filter
    if not where_terms_list:
        bool_arr = None
    else:
        # quickly verify the where_terms_list
        if not ct.where_terms_factorization_check(where_terms_list):
            # return an empty result because the where terms do not give a
            # result for this ctable
            msg['data'] = ''
            return msg
        # else create the boolean array
        bool_arr = ct.where_terms(where_terms_list, cache=True)

    # expand filter column check
    if expand_filter_column:
        bool_arr = ct.is_in_ordered_subgroups(basket_col=expand_filter_column,
                                              bool_arr=bool_arr)

    # retrieve & aggregate if needed
    rm_file_or_dir(tmp_dir)
    if aggregate:
        # aggregate by groupby parameters
        result_ctable = ct.groupby(groupby_col_list, aggregation_list,
                                   bool_arr=bool_arr, rootdir=tmp_dir)
    else:
        # direct result from the ctable
        column_list = groupby_col_list + [x[0] for x in aggregation_list]
        if bool_arr is not None:
            result_ctable = bcolz.fromiter(ct[column_list].where(bool_arr),
                                           ct[column_list].dtype,
                                           sum(bool_arr),
                                           rootdir=tmp_dir, mode='w')
        else:
            result_ctable = bcolz.fromiter(ct[column_list],
                                           ct[column_list].dtype, ct.len,
                                           rootdir=tmp_dir, mode='w')

    # *** clean up temporary files and memory objects
    # filter
    del bool_arr
    # input
    ct.free_cachemem()
    ct.clean_tmp_rootdir()
    del ct

    # save result to archive
    result_ctable.flush()
    result_ctable.free_cachemem()
    with tarfile.open(buf_file, mode='w') as archive:
        archive.add(tmp_dir, arcname=os.path.basename(tmp_dir))
    del result_ctable
    rm_file_or_dir(tmp_dir)

    # create message
    with open(buf_file, 'r') as file:
        # add result to message
        msg['data'] = file.read()
    rm_file_or_dir(buf_file)

    return msg
z = xrange(2, N + 2)

print("Starting benchmark now for creating arrays...")

# Create a ndarray
# x = (i for i in xrange(N))  # true iterable
t0 = time()
out = np.fromiter(x, dtype='f8', count=N)
print("Time for array--> %.3f" % (time() - t0,))
print("out-->", len(out))

#bcolz.set_num_threads(bcolz.ncores//2)

# Create a carray
#x = (i for i in xrange(N))  # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=N, cparams=bcolz.cparams(clevel))
print("Time for carray--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Create a carray (with unknown size)
#x = (i for i in xrange(N))  # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=-1, cparams=bcolz.cparams(clevel))
print("Time for carray (count=-1)--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Retrieve from a structured ndarray
gen = ((i, j, k) for i, j, k in izip(x, y, z))
t0 = time()
from __future__ import print_function

import contextlib
import time

import bcolz
import numpy


@contextlib.contextmanager
def ctime(label=""):
    "Counts the time spent in some context"
    t = time.time()
    yield
    print(label, round(time.time() - t, 3), "sec")


N = 1000 * 1000
ct = bcolz.fromiter(((i, i * i, i * i * i) for i in xrange(N)),
                    dtype='i8,i8,i8', count=N)
b = numpy.array(numpy.arange(N) % 2, dtype="bool")
c = bcolz.carray(b)
sorted_index = range(1, N, 2)

with ctime():
    r0 = (ct['f0'][sorted_index]).tolist()
with ctime():
    r1 = [x.f0 for x in ct.where(b)]
assert r0 == r1
with ctime():
    r2 = [x.f0 for x in ct.where(c)]
assert r0 == r2
import bcolz
import numpy

from .bench_helper import ctime

N = 1000 * 1000
ct = bcolz.fromiter(((i, i * i, i * i * i) for i in xrange(N)),
                    dtype='i8,i8,i8', count=N)
b = numpy.array(numpy.arange(N) % 2, dtype="bool")
c = bcolz.carray(b)
sorted_index = range(1, N, 2)


class Suite:

    def time_tolist(self):
        return (ct['f0'][sorted_index]).tolist()

    def time_where_01(self):
        return [x.f0 for x in ct.where(b)]

    def time_where_02(self):
        return [x.f0 for x in ct.where(c)]

    def time_where_03(self):
        return [x for x in ct['f0'].where(b)]

    def time_where_04(self):
        return [x for x in ct['f0'].where(c)]
def fetchwhere(self, expression, outcols=None, limit=None, skip=0,
               out_flavor=None, user_dict={}, vm=None, **kwargs):
    """Fetch the rows fulfilling the `expression` condition.

    Parameters
    ----------
    expression : string or carray
        A boolean Numexpr expression or a boolean carray.
    outcols : list of strings or string
        The list of column names that you want to get back in results.
        Alternatively, it can be specified as a string such as 'f0 f1' or
        'f0, f1'.  If None, all the columns are returned.  If the special
        name 'nrow__' is present, the row number will be included in the
        output.
    limit : int
        A maximum number of elements to return.  The default is to return
        everything.
    skip : int
        An initial number of elements to skip.  The default is 0.
    out_flavor : string
        The flavor for the `out` object.  It can be 'bcolz' or 'numpy'.
        If None, the value is taken from `bcolz.defaults.out_flavor`.
    user_dict : dict
        A user-provided dictionary where the variables in expression can
        be found by name.
    vm : string
        The virtual machine to be used in computations.  It can be
        'numexpr', 'python' or 'dask'.  The default is to use 'numexpr'
        if it is installed.
    kwargs : list of parameters or dictionary
        Any parameter supported by the carray constructor.

    Returns
    -------
    out : bcolz or numpy object
        The outcome of the expression.  In case out_flavor='bcolz', you
        can adjust the properties of this object by passing any additional
        arguments supported by the carray constructor in `kwargs`.

    See Also
    --------
    whereblocks

    """
    if out_flavor is None:
        out_flavor = bcolz.defaults.out_flavor
    if out_flavor == "numpy":
        it = self.whereblocks(expression, len(self), outcols, limit, skip,
                              user_dict=self._ud(user_dict), vm=vm)
        return next(it)
    elif out_flavor in ("bcolz", "carray"):
        dtype = self._dtype_fromoutcols(outcols)
        it = self.where(expression, outcols, limit, skip, out_flavor=tuple,
                        user_dict=self._ud(user_dict), vm=vm)
        ct = bcolz.fromiter(it, dtype, count=-1, **kwargs)
        ct.flush()
        return ct
    else:
        raise ValueError(
            "`out_flavor` can only take 'bcolz' or 'numpy' values")
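# A minimal, hedged usage sketch for fetchwhere as defined above: the toy
# table and expressions are invented, and cparams is just one example of a
# carray keyword forwarded through **kwargs when out_flavor is 'bcolz'.
import bcolz

N = 1000
t = bcolz.fromiter(((i, i * 2.0) for i in range(N)), dtype='i4,f8', count=N)
# numpy output: a structured array holding the matching rows
rows = t.fetchwhere('f0 < 10', out_flavor='numpy')
# bcolz output: a compressed ctable, tunable via carray keyword arguments
ct = t.fetchwhere('f1 > 100.0', out_flavor='bcolz',
                  cparams=bcolz.cparams(clevel=9))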
def _bcolz(self, tblname, df=None, m80name=None, m80type=None, blaze=False):
    ''' This is the access point to the bcolz database '''
    try:
        import blaze as blz
    except FutureWarning:  # pragma: no cover
        pass
    import warnings
    # from flask.exthook import ExtDeprecationWarning
    # warnings.simplefilter('ignore', ExtDeprecationWarning)
    warnings.simplefilter('ignore', FutureWarning)
    # Fill in the defaults if they were not provided
    if m80type is None:
        m80type = self._m80_dtype
    if m80name is None:
        m80name = self._m80_name
    path = self._get_dbpath('bcz', create=True)
    # The function is a getter if df is not provided (and a setter otherwise)
    if df is None:
        # return the dataframe if it exists
        try:
            df = bcz.open(os.path.join(path, tblname))
        except IOError:
            raise IOError(f'could not open database for {m80type}:{m80name}')
        else:
            if len(df) == 0:
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            else:
                if blaze:
                    df = blz.data(df)
                else:
                    df = df.todataframe()
            if not blaze and f'{tblname}_index' in self._dict:
                df.set_index(self._dict[f'{tblname}_index'], inplace=True)
            return df
    # If df is set, then store the table
    else:
        df = df.copy()
        if df.index.name is not None:
            # We need to remember the index
            self._dict[tblname + '_index'] = df.index.name
            df.reset_index(inplace=True)
        path = os.path.join(path, tblname)
        if df.empty:
            bcz.fromiter((), dtype=np.int32, mode='w', count=0, rootdir=path)
        else:
            bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        return
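# For context, a standalone sketch of the round trip that _bcolz wraps, using
# only public bcolz and pandas calls (no minus80 internals); the path and the
# example frame are hypothetical.
import os
import tempfile

import bcolz as bcz
import pandas as pd

path = os.path.join(tempfile.mkdtemp(), 'mytable')
df = pd.DataFrame({'score': [1.0, 2.0, 3.0], 'count': [1, 2, 3]})

# setter: persist the frame as an on-disk ctable
bcz.ctable.fromdataframe(df, mode='w', rootdir=path)

# getter: reopen the rootdir and convert back to a DataFrame
restored = bcz.open(path).todataframe()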
def sql2bcolz(sql, dsfilename, con, type_hints={}):
    """
    Read an SQL query into a bcolz ctable stored at `dsfilename`.

    Parameters
    ----------
    sql : SQL string with all parameters substituted
    con : connectable (django connection, or psycopg connection)
    """
    # We have to estimate the number of rows for one-time allocation of
    # numpy arrays
    count = None
    sql_count = "select count(*) from (%s) s" % sql
    cursor = con.cursor()
    cursor.execute(sql_count)
    count = cursor.fetchone()[0]
    cursor.close()

    if count == 0:
        return None

    # Funny way to reliably get a psycopg connection.  We need it to get
    # server-side cursors.
    pgcon = con.cursor().connection

    # with transaction.atomic():
    if True:
        cursor = pgcon.cursor("serversidecursor", withhold=True)
        pgcon.commit()

        chunk_size = 100000
        if count:
            chunk_size = int(max(min(count / 10, 100000), 10))
        cursor.itersize = chunk_size

        print(str(datetime.datetime.now()), "Start executing query … ")
        cursor.execute(sql)
        print(str(datetime.datetime.now()), "End executing query … ")

        row0 = cursor.fetchone()
        print(str(datetime.datetime.now()), "End fetch first row … ")

        columns = []
        dtypes = []

        def ResultIter(cursor):
            while True:
                results = cursor.fetchmany(chunk_size)
                if not results:
                    break
                for result in results:
                    yield result

        for i, col_desc in enumerate(cursor.description):
            col_name = col_desc[0]
            dtype = None
            if col_name == 'agg_color_model_ids':
                iii = 1
                pass
            if col_name in type_hints:
                dtype = type_hints[col_name]
            else:
                if col_desc.type_code == 25:
                    dtype = 'S16'
                    pass
                if col_desc.type_code == 16:
                    dtype = 'bool'
                elif col_desc.type_code == 1043:
                    dtype = 'S16'
                    pass
                elif col_desc.type_code == 1082:
                    dtype = 'i8'
                elif col_desc.type_code in [700, 701]:
                    dtype = 'f%d' % col_desc.internal_size
                elif col_desc.type_code == 1016:
                    dtype = '(128,)i8'
                elif col_desc.type_code == psycopg2.NUMBER:
                    size_ = col_desc.internal_size
                    if size_ < 0:
                        size_ = 8
                    dtype = 'i%d' % size_
            columns.append(col_name)
            dtypes.append((col_name, dtype))

        # ct = bcolz.fromiter(ResultIter(cursor), dtype=dtypes, count=count,
        #                     rootdir=dsfilename)
        ct = bcolz.fromiter(cursor, dtype=dtypes, count=count,
                            rootdir=dsfilename)
        cursor.close()
        del cursor
        pass
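# Hedged call sketch for sql2bcolz: the database, table and column names are
# invented, and type_hints simply overrides the type_code-based dtype
# guessing performed above.
import psycopg2

con = psycopg2.connect(dbname='shop', user='readonly', host='localhost')
sql2bcolz("select id, price, in_stock from products",
          dsfilename='/tmp/products.bcolz', con=con,
          type_hints={'price': 'f8', 'in_stock': 'bool'})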
def time_sum_03(self):
    return bcolz.fromiter((x for x in ct['f0'].where(c)),
                          dtype=ct['f0'].dtype,
                          count=c.wheretrue().sum()).sum()