def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Build an empty ctable whose floating point columns use the quantize filter.

    One zero-length carray is created for every ``(name, type)`` entry of
    ``dtype.descr``.  Entries whose type string contains ``'f'`` get fresh
    compression parameters that reuse ``cparams.clevel`` and
    ``cparams.cname`` and add ``quantize``; every other entry is created
    with ``cparams`` exactly as passed in.

    Parameters
    ----------
    dtype : structured dtype
        Object exposing ``descr`` as a sequence of ``(name, type)`` pairs.
    cparams : bcolz.cparams
        Base compression parameters.
    quantize : int, optional
        Forwarded to ``bcolz.cparams`` for the float columns.
    expectedlen : int, optional
        Forwarded to ``bcolz.zeros`` for every column.

    Returns
    -------
    bcolz.ctable
        Table built from the empty columns, in ``dtype.descr`` order.

    License
        This function is taken from the reflexible package
        (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions
        Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    names = []
    columns = []
    for field_name, field_type in dtype.descr:
        names.append(field_name)
        if 'f' in field_type:
            # Float column: same codec and level, plus the quantize filter.
            col_params = bcolz.cparams(clevel=cparams.clevel,
                                       cname=cparams.cname,
                                       quantize=quantize)
        else:
            col_params = cparams
        columns.append(bcolz.zeros(0, dtype=field_type, cparams=col_params,
                                   expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return an empty ctable with quantization enabled on float columns.

    Every ``(name, type)`` pair in ``dtype.descr`` becomes one zero-length
    column.  A type string containing ``'f'`` is treated as floating point
    and is compressed with new parameters (``clevel`` and ``cname`` copied
    from ``cparams``, plus ``quantize``); any other column uses ``cparams``
    unchanged.  ``expectedlen`` is forwarded to every column.
    """
    import bcolz

    def _empty_column(type_str):
        # Pick the compression parameters for this column, then allocate it.
        if 'f' in type_str:
            params = bcolz.cparams(clevel=cparams.clevel,
                                   cname=cparams.cname,
                                   quantize=quantize)
        else:
            params = cparams
        return bcolz.zeros(0, dtype=type_str, cparams=params,
                           expectedlen=expectedlen)

    names, columns = [], []
    for col_name, type_str in dtype.descr:
        names.append(col_name)
        columns.append(_empty_column(type_str))
    return bcolz.ctable(columns=columns, names=names)
def test_into_inplace():
    """In-place append of a blocked Array into a pre-sized bcolz container."""
    expected = np.arange(600).reshape((20, 30))
    blocked = into(Array, expected, blockshape=(4, 5))
    # The target already has the full (20, 30) shape before the append.
    target = bcolz.zeros(shape=(20, 30), dtype=expected.dtype)

    append(target, blocked, inplace=True)

    assert eq(target[:], expected)
def test01c(self):
    """Testing `zeros` constructor (III)"""
    shape = (2, 2)
    dtype = '(4,)i4'
    expected = np.zeros(shape, dtype=dtype)
    actual = bcolz.zeros(shape, dtype=dtype, rootdir=self.rootdir)
    if self.open:
        # Compare against the container as re-opened from its rootdir.
        actual = bcolz.open(rootdir=self.rootdir)
    assert_array_equal(expected, actual, "Arrays are not equal")
def read_releases_v10(pathname):
    """
    Parses release file in `pathname` and return a ctable with its contents.

    This is only suited for files in Fortran90 namelist format (FP v10).

    Parameters
    ----------
    pathname : pathname
        Release file name (in Fortran90 namelist format).

    Returns
    -------
    A ctable object from bcolz package.  It has no rows when the file
    contains no ``&RELEASE`` group.

    Raises
    ------
    ValueError
        Re-raised (after printing the offending group) when the values
        parsed from a group cannot be stored in one record.
    """
    import bcolz

    # Setup the container for the data
    dtype = [('IDATE1', np.int32), ('ITIME1', np.int32),
             ('IDATE2', np.int32), ('ITIME2', np.int32),
             ('LON1', np.float32), ('LON2', np.float32),
             ('LAT1', np.float32), ('LAT2', np.float32),
             ('Z1', np.float32), ('Z2', np.float32),
             ('ZKIND', np.int8), ('MASS', np.float32),
             ('PARTS', np.int32), ('COMMENT', 'S32')]
    cparams = bcolz.cparams(cname="lz4", clevel=6, shuffle=1)
    ctable = bcolz.zeros(0, dtype=dtype, cparams=cparams)
    # Records are staged in a buffer of one chunk and appended chunk-wise.
    nrecords = ctable['IDATE1'].chunklen
    releases = np.zeros(nrecords, dtype=dtype)

    # Read the whole input; the context manager closes the handle.
    with open(pathname, 'r') as release_file:
        input_str = release_file.read()

    marker = "&RELEASE\n"
    len_marker = len(marker)
    release_re = re.compile(r'\S+=\s+[\"|\s](\S+)[,|\"|\w]')

    # Loop over all the marker groups
    n = 0
    start = input_str.find(marker)
    while start != -1:
        end = input_str.find(marker, start + 1)
        n += 1
        if end == -1:
            # Last group: take everything up to the end of the file
            # (slicing with -1 would drop the final character).
            group_block = input_str[start + len_marker:]
        else:
            group_block = input_str[start + len_marker:end]
        values = tuple(release_re.findall(group_block))
        try:
            releases[(n - 1) % nrecords] = values
        except ValueError:
            print("Problem at: group: %d, %s" % (n, group_block))
            print("values:", values)
            raise
        if (n % nrecords) == 0:
            # The staging buffer is full: move it into the ctable
            ctable.append(releases)
        start = end

    # Remainder
    ctable.append(releases[:n % nrecords])
    ctable.flush()

    return ctable
def test00a(self):
    """Testing wheretrue() in combination with a list constructor"""
    flags = bcolz.zeros(self.N, dtype="bool", rootdir=self.rootdir)
    flags[30:40] = bcolz.ones(10, dtype="bool")

    # Snapshot the contents before and after iterating wheretrue().
    before = list(flags)
    true_positions = list(flags.wheretrue())
    self.assertTrue(true_positions == list(range(30, 40)))
    after = list(flags)

    self.assertTrue(before == after, "wheretrue() not working correctly")
def __init__(self, df, rootdir=None, chunklen=2**16, **kwargs):
    """Create empty bcolz containers for the columns and index of `df`.

    Parameters
    ----------
    df : DataFrame-like
        Supplies ``columns``, ``dtypes`` and ``index.values.dtype``.
    rootdir : str, optional
        Directory to create for the column data.  When omitted, a
        temporary directory with the ``.cframe`` suffix is made instead.
    chunklen : int
        Forwarded to every ``bcolz.zeros`` call.
    **kwargs
        Extra keyword arguments forwarded to every ``bcolz.zeros`` call.
    """
    path_was_given = rootdir is not None
    if path_was_given:
        os.mkdir(rootdir)
    else:
        rootdir = tempfile.mkdtemp('.cframe')
    self._explicitly_given_path = path_was_given

    # One zero-length carray per column, stored under <rootdir>/<col>.bcolz
    blocks = {}
    for col in df.columns:
        col_dir = os.path.join(rootdir, '%s.bcolz' % col)
        blocks[col] = bcolz.zeros(rootdir=col_dir,
                                  shape=(0,),
                                  dtype=df.dtypes[col],
                                  safe=False,
                                  chunklen=chunklen,
                                  **kwargs)
    self.blocks = blocks
    self.columns = df.columns

    # The index carray is created without a rootdir of its own.
    self.index = bcolz.zeros(shape=(0,),
                             dtype=df.index.values.dtype,
                             safe=False,
                             chunklen=chunklen,
                             **kwargs)
    self.rootdir = rootdir
def getobject(self):
    """Build the 10-element container selected by ``self.flavor``.

    ``'carray'`` yields an int8 carray of zeros; ``'ctable'`` yields a
    two-column ('i2,f4') table filled with ``(i, i*2)`` rows.  Both are
    created under ``self.rootdir`` and their exact type is asserted.
    """
    if self.flavor == 'carray':
        container = bcolz.zeros(10, dtype="i1", rootdir=self.rootdir)
        assert type(container) is bcolz.carray
    elif self.flavor == 'ctable':
        rows = ((i, i*2) for i in range(10))
        container = bcolz.fromiter(rows, dtype='i2,f4', count=10,
                                   rootdir=self.rootdir)
        assert type(container) is bcolz.ctable
    return container
def __init__(self, n_elements, sizes=None, dtype=float, datadir=None):
    """
    Create a new ring buffer with the given number of elements,
    individual element size and element type

    Parameters:
    -----------
    n_elements: int
        The number of elements (individual ring buffers)
    sizes: int, list or array
        Size for all elements or list/array with sizes for elements.
        NumPy integer scalars are accepted like plain ints.  With any
        other value (including None) no per-element arrays are allocated.
    dtype: data-type, optional (default=float)
        Data type of the ring buffer
    datadir: str
        If specified the data is stored on disk (default=None), one
        array per element under ``<datadir>/arr_<i>``
    """
    self._arr = np.empty(n_elements, object)

    # Work out one buffer length per element.
    if isinstance(sizes, (list, np.ndarray)):
        lengths = [int(sizes[i]) for i in range(len(self._arr))]
    elif isinstance(sizes, (int, np.integer)):
        # np.integer is included so that e.g. np.int64(5) is not
        # silently ignored (it is not an instance of int).
        lengths = [int(sizes)] * len(self._arr)
    else:
        lengths = []

    for i, length in enumerate(lengths):
        if datadir is not None:
            self._arr[i] = bc.zeros(length, dtype,
                                    rootdir=datadir + '/arr_' + str(i),
                                    mode='w')
        else:
            self._arr[i] = bc.zeros(length, dtype)

    self._dtype = dtype
    # Sizes are kept as an int array; a scalar broadcasts to all elements.
    self._sizes = np.zeros(n_elements, int)
    if sizes is not None:
        self._sizes += sizes
    self._indices = np.zeros(n_elements, dtype=int)
def test01a(self):
    """Testing where() in combination with a list constructor"""
    a = bcolz.zeros(self.N, dtype="bool")
    a[30:40] = bcolz.ones(10, dtype="bool")
    b = bcolz.arange(self.N, dtype="f4")
    blist = list(b)
    blist1 = [r for r in b.where(a)]
    # Compare with a materialised list: on Python 3 a list never compares
    # equal to a range object.  assertTrue replaces the removed assert_
    # alias, matching the wheretrue() test.
    self.assertTrue(blist1 == list(range(30, 40)))
    blist2 = list(b)
    # Iterating where() must leave the source array untouched.
    self.assertTrue(blist == blist2, "where() not working correctly")
def test_append_to_array():
    """Append a blocked Array to an empty carray and store it via an HDF5 URI."""
    data = np.arange(600).reshape((20, 30))
    blocked = into(Array, data, blockshape=(4, 5))

    # Start from zero rows so that every row comes from the append.
    target = bcolz.zeros(shape=(0, 30), dtype=data.dtype)
    append(target, blocked)
    assert eq(target[:], data)

    with tmpfile('hdf5') as fn:
        uri = fn+'::/data'
        stored = into(uri, blocked)
        assert eq(stored[:], data)
        stored.file.close()
def test_append_to_array():
    """Check appending a blocked Array to bcolz and writing it to HDF5."""
    n_rows, n_cols = 20, 30
    reference = np.arange(600).reshape((n_rows, n_cols))
    source = into(Array, reference, blockshape=(4, 5))

    # bcolz destination: an initially empty container with 30 columns
    dest = bcolz.zeros(shape=(0, n_cols), dtype=reference.dtype)
    append(dest, source)
    assert eq(dest[:], reference)

    # HDF5 destination: dataset /data inside a temporary file
    with tmpfile('hdf5') as fn:
        dataset = into(fn + '::/data', source)
        assert eq(dataset[:], reference)
        dataset.file.close()
def preprocess_brats(data_path, path, size, train_data=True, x_name='data',
                     y_name='labels', csv_name='mapping.csv'):
    """Convert BraTS subject folders into on-disk bcolz arrays plus a CSV index.

    Parameters
    ----------
    data_path : Path-like
        Input root.  With ``train_data`` the subjects are listed from its
        ``LGG`` and ``HGG`` sub-folders, otherwise from ``data_path`` itself.
    path : str or Path
        Output directory; created (with parents) when missing.
    size : int or sequence
        Target size, expanded with ``listify(size, 3)``.
    train_data : bool
        When true a label array is written too and the CSV gets a
        ``modal`` column holding the parent folder name.
    x_name, y_name : str
        Names of the volume and label arrays inside ``path``.
    csv_name : str
        Name of the CSV file listing the processed subjects, in order.
    """
    path = Path(path)
    path.mkdir(parents=True, exist_ok=True)
    size = listify(size, 3)

    volumes = bcolz.zeros([0, 4, *size], dtype=np.int64, chunklen=1,
                          mode='w', rootdir=path / x_name)
    if train_data:
        labels = bcolz.zeros([0, *size], dtype=np.int64, chunklen=1,
                             mode='w', rootdir=path / y_name)
    processors = [CropProcessor(), ResizeProcessor(size)]

    if train_data:
        files = (data_path / 'LGG').ls() + (data_path / 'HGG').ls()
    else:
        files = data_path.ls()

    # Header and rows go through a single managed handle, so nothing is
    # left open and the file is not reopened in append mode.
    with (path / csv_name).open('w') as f:
        f.write('modal,subject\n' if train_data else 'subject\n')
        for file in progress_bar(files):
            if not file.is_dir():
                continue
            x, y = get_brats_data(file, train_data)
            for p in processors:
                x, y = p(x, y)
            volumes.append(x)
            if train_data:
                labels.append(y)
            f.write(f'{file.parent.name},{file.name}\n' if train_data
                    else f'{file.name}\n')

    volumes.flush()
    if train_data:
        labels.flush()
def preprocess_brain_val(source_dir: Path, destination_dir: Path):
    """Write the brain validation slices to a bcolz array on disk.

    Volume ids 0..799 are shuffled with the global ``random`` module seeded
    to 0, and the last 100 ids of that order are used.  For each of them,
    ``<source_dir>/<id:05>.nii.gz`` is loaded and its 256 slices along the
    last axis are appended, one ``(1, 1, w, h)`` entry at a time, to the
    array stored at ``destination_dir``.
    """
    # Full Validation 100 (out of 800) brain 2d.
    slice_w = 256
    slice_h = 256

    volume_ids = list(range(800))
    random.seed(0)
    random.shuffle(volume_ids)
    validation_ids = volume_ids[700:]

    disk_x = bcolz.zeros((0, 1, slice_w, slice_h), rootdir=destination_dir,
                         chunklen=1)
    for volume_id in validation_ids:
        filename = "{:05}.nii.gz".format(volume_id)
        volume = nib.load(source_dir / filename).get_fdata()
        for s in range(256):
            # Prepend two unit axes so each slice is one appended row.
            disk_x.append(volume[None, None, :, :, s])
    disk_x.flush()
def __init__(self, data_element_shape, dtype, batch_size, save_path,
             length=None, append=False, kwargs={}):
    """Set up a file-backed bcolz array for writing.

    Parameters
    ----------
    data_element_shape : tuple
        Shape of one stored element; the array has shape
        ``(0,) + data_element_shape`` when created.
    dtype : data-type
        Element type of the storage array.
    batch_size : int
        Passed through to the parent initialiser.
    save_path : str
        ``rootdir`` of the bcolz array.
    length : int, optional
        Used as ``expectedlen`` for a newly created array.
    append : bool
        Try to open an existing array at ``save_path`` in append mode; a
        new array is created when that raises ``FileNotFoundError``.
    kwargs : dict or None
        Extra/overriding keyword arguments for ``bcolz.zeros``.

    Raises
    ------
    Exception
        Whatever ``bcolz.zeros`` raises is re-raised after printing an
        error message.
    """
    import bcolz
    super(bcolz_array_writer, self).__init__(None, data_element_shape,
                                             dtype, batch_size, length)
    self.save_path = save_path
    # NOTE(review): the `{}` default is shared between calls; it is only
    # read here, but callers must not mutate `self.kwargs` in place.
    self.kwargs = kwargs

    # Set up array kwargs
    self.arr_kwargs = {
        'expectedlen': length,
        'cparams': bcolz.cparams(clevel=5, shuffle=True, cname='blosclz'),
        'dtype': dtype,
        'rootdir': save_path
    }
    if kwargs is not None:
        self.arr_kwargs.update(kwargs)

    # Create the file-backed array, open for writing.
    # (check if the array exists; if not, create it)
    if append:
        try:
            self.storage_array = bcolz.open(self.save_path, mode='a')
        except FileNotFoundError:
            append = False
        else:
            # Continue writing after the rows that are already stored.
            self.storage_array_ptr = len(self.storage_array)
    if not append:
        try:
            self.storage_array = bcolz.zeros(
                shape=(0, ) + data_element_shape,
                mode='w',
                **self.arr_kwargs)
        except Exception:
            # Report and propagate; KeyboardInterrupt/SystemExit pass
            # through without the misleading message.
            print("Error: failed to create file-backed bcolz storage "
                  "array.")
            raise
        self.storage_array_ptr = 0
def preprocess_abdomen_val(source_dir: Path, destination_path: Path):
    """Write the down-sampled abdomen validation slices to a bcolz array.

    Volume ids 0..549 are shuffled with the global ``random`` module seeded
    to 0 and the last 100 ids of that order are used.  From each volume
    every second slice index in ``range(0, 512, 2)`` is taken along the
    last axis, zoomed by 0.5 (``order=2``) and appended as one
    ``(1, 1, w, h)`` entry to the array stored at ``destination_path``.
    """
    # Abdom data val 2d, downsample by 2x, all slices
    slice_w = 256  # 512 before down-sampling
    slice_h = 256  # 512 before down-sampling

    volume_ids = list(range(550))
    random.seed(0)
    random.shuffle(volume_ids)
    validation_ids = volume_ids[450:]

    disk_x = bcolz.zeros((0, 1, slice_w, slice_h), rootdir=destination_path,
                         chunklen=1)
    for volume_id in validation_ids:
        filename = "{:05}.nii.gz".format(volume_id)
        volume = nib.load(source_dir / filename).get_fdata()
        for s in range(0, 512, 2):
            plane = zoom(volume[:, :, s], 0.5, order=2)
            # Restore the two leading unit axes expected by the container.
            disk_x.append(plane[None, None, ...])
    disk_x.flush()
from time import time

import numpy as np

import bcolz

# Benchmark: time the creation of a zero-filled array with numpy and bcolz.

# The element count must be an int: numpy.zeros() rejects a float shape.
N = int(2e8)
dtype = 'i4'

t0 = time()
a = np.zeros(N, dtype=dtype)
print("Time numpy.zeros() --> %.4f" % (time() - t0))

t0 = time()
ac = bcolz.zeros(N, dtype=dtype)
print("Time bcolz.zeros() --> %.4f" % (time() - t0))

print("ac-->", repr(ac))
def make_group_index(self, factor_list, values_list, groupby_cols,
                     array_length, bool_arr):
    """Create unique groups for groupby loop

    Parameters
    ----------
    factor_list : list
        Factorized columns, one per groupby column.
    values_list : list
        For each factorized column, its collection of unique values
        (only its length is used when combining several columns).
    groupby_cols : list of str
        Column names; used as variable names in the combined expression.
    array_length : int
        Length of the all-zero index built when there is nothing to
        group by.
    bool_arr : array-like or None
        Optional row filter; filtered-out rows get the factor -1.

    Returns
    -------
    tuple
        ``(factor_carray, nr_groups, skip_key)``.  ``skip_key`` is the
        group key mapped to -1 after filtering, or ``nr_groups`` when no
        group has to be skipped.
    """
    if len(factor_list) == 0:
        # no columns to groupby over, so directly aggregate the measure
        # columns to 1 total (index 0/zero)
        factor_carray = bcolz.zeros(array_length, dtype='int64')
        values = ['Total']
    elif len(factor_list) == 1:
        # single column groupby, the groupby output column
        # here is 1:1 to the values
        factor_carray = factor_list[0]
        values = values_list[0]
    else:
        # multi column groupby
        # nb: this might also be cached in the future
        # first combine the factorized columns to single values
        factor_set = {x: y for x, y in zip(groupby_cols, factor_list)}

        # create a numexpr expression that calculates the place on
        # a cartesian join index
        eval_str = ''
        previous_value = 1
        for col, col_values in zip(reversed(groupby_cols),
                                   reversed(values_list)):
            if eval_str:
                eval_str += ' + '
            eval_str += str(previous_value) + '*' + col
            previous_value *= len(col_values)

        # calculate the cartesian group index for each row
        factor_input = bcolz.eval(eval_str, user_dict=factor_set)

        # now factorize the unique groupby combinations
        factor_carray, values = ctable_ext.factorize(factor_input)

    skip_key = None

    if bool_arr is not None:
        # make all non relevant combinations -1
        factor_carray = bcolz.eval(
            '(factor + 1) * bool - 1',
            user_dict={'factor': factor_carray, 'bool': bool_arr})
        # now check how many unique values there are left
        factor_carray, values = ctable_ext.factorize(factor_carray)
        # values might contain one value too much (-1) (no direct lookup
        # possible because values is a reversed dict)
        # .items() works on both Python 2 and 3 (iteritems() is 2-only).
        filter_check = \
            [key for key, value in values.items() if value == -1]
        if filter_check:
            skip_key = filter_check[0]

    # using nr_groups as a total length might be one off due to the
    # skip_key (skipping a row in aggregation), but that is okay normally
    nr_groups = len(values)
    if skip_key is None:
        # if we shouldn't skip a row, set it at the first row after the
        # total number of groups
        skip_key = nr_groups

    return factor_carray, nr_groups, skip_key
def test01b(self):
    """Testing where() with a multidimensional array"""
    shape = (self.N, 10)
    mask = bcolz.zeros(shape, dtype="bool", rootdir=self.rootdir)
    mask[30:40] = bcolz.ones(10, dtype="bool")
    data = bcolz.arange(self.N * 10, dtype="f4").reshape(shape)
    # where() is expected to refuse a 2-d boolean mask.
    self.assertRaises(NotImplementedError, data.where, mask)
def make_group_index(self, factor_list, values_list, groupby_cols,
                     array_length, bool_arr):
    '''Create unique groups for groupby loop

    Args:
        factor_list: factorized columns, one per groupby column
        values_list: for each factorized column, its collection of unique
            values (only ``len()`` of each is used when combining columns)
        groupby_cols: column names, used as variable names in the
            generated expressions
        array_length: length of the all-zero index built when there is
            nothing to group by
        bool_arr: optional row filter (``None`` for no filtering);
            filtered-out rows get the factor -1

    Returns:
        carray: (factor_carray)
        int: (nr_groups) the number of resulting groups
        int: (skip_key) the key mapped to -1 after filtering, or
            ``nr_groups`` when no group has to be skipped

    '''

    def _create_eval_str(groupby_cols, values_list, check_overflow=True):
        # Build a list of (expression, columns) pairs.  Each expression is
        # '-2147483648 + 1*a + <len(a)>*b + ...'.  With check_overflow, a
        # new pair is started whenever the running product of column
        # cardinalities would exceed 4294967295.
        # NOTE(review): the constants look like the uint32 span and the
        # int32 minimum -- confirm the intended integer width.

        eval_list = []
        eval_str = ''
        col_list = []
        previous_value = 1
        # Sort evaluated columns by length
        col_len_list = [(col, values) for col, values in zip(groupby_cols, values_list)]
        col_len_list.sort(key=lambda x: len(x[1]))
        groupby_cols = [col for col, _ in col_len_list]
        values_list = [values for _, values in col_len_list]

        for col, values \
                in zip(groupby_cols, values_list):

            # check for overflow
            if check_overflow:
                if previous_value * len(values) > 4294967295:
                    eval_list.append((eval_str, col_list))
                    # reset
                    eval_str = ''
                    col_list = []
                    previous_value = 1

            if eval_str:
                eval_str += ' + '
            else:
                # first term of an expression: start from the offset
                eval_str += '-2147483648 + '
            eval_str += str(previous_value) + '*' + col
            col_list.append(col)
            previous_value *= len(values)

        # the expression still being built is always the last entry
        eval_list.append((eval_str, col_list))
        return eval_list

    def _calc_group_index(eval_list, factor_set, vm=None):
        # Evaluate every expression and factorize its result; returns one
        # (factor_carray, values) pair per expression.
        factorize_list = []
        for eval_node in eval_list:
            # calculate the cartesian group index for each row
            factor_input = bcolz.eval(eval_node[0], user_dict=factor_set, vm=vm)

            # now factorize the unique groupby combinations
            sub_factor_carray, sub_values = ctable_ext.factorize(factor_input)
            factorize_list.append((sub_factor_carray, sub_values))

        return factorize_list

    def _is_reducible(eval_list):
        # True while at least one expression combines more than one column
        for eval_node in eval_list:
            if len(eval_node[1]) > 1:
                return True
        return False

    def calc_index(groupby_cols, values_list, factor_set, vm=None):
        # Combine all groupby columns into one (factor_carray, values)
        # pair, factorizing intermediate results into columns 'g0', 'g1', ...

        # Initialize eval list
        eval_list = _create_eval_str(groupby_cols, values_list)

        # Reduce expression as possible
        while _is_reducible(eval_list):
            del groupby_cols
            del values_list
            factorize_list = _calc_group_index(eval_list, factor_set)
            factor_set = {'g' + str(i): x[0] for i, x in enumerate(factorize_list)}
            groupby_cols = ['g' + str(i) for i, x in enumerate(factorize_list)]
            values_list = [x[1] for i, x in enumerate(factorize_list)]
            eval_list = _create_eval_str(groupby_cols, values_list)
        # If we have multiple expressions that cannot be reduced anymore, rewrite as a single one and use Python vm
        if len(eval_list) > 1:
            eval_list = _create_eval_str(groupby_cols, values_list, check_overflow=False)
            vm = 'python'

        del groupby_cols
        del values_list

        # Now we have a single expression, factorize it
        return _calc_group_index(eval_list, factor_set, vm=vm)[0]

    # create unique groups for groupby loop
    if len(factor_list) == 0:
        # no columns to groupby over, so directly aggregate the measure
        # columns to 1 total (index 0/zero)
        factor_carray = bcolz.zeros(array_length, dtype='int64')
        values = ['Total']
    elif len(factor_list) == 1:
        # single column groupby, the groupby output column
        # here is 1:1 to the values
        factor_carray = factor_list[0]
        values = values_list[0]
    else:
        # multi column groupby
        # nb: this might also be cached in the future
        # first combine the factorized columns to single values
        factor_set = {x: y for x, y in zip(groupby_cols, factor_list)}
        # create a numexpr expression that calculates the place on
        # a cartesian join index
        factor_carray, values = calc_index(groupby_cols, values_list, factor_set)

    skip_key = None

    if bool_arr is not None:
        # make all non relevant combinations -1
        factor_carray = bcolz.eval(
            '(factor + 1) * bool - 1',
            user_dict={'factor': factor_carray, 'bool': bool_arr})
        # now check how many unique values there are left
        factor_carray, values = ctable_ext.factorize(factor_carray)
        # values might contain one value too much (-1) (no direct lookup
        # possible because values is a reversed dict)
        filter_check = \
            [key for key, value in values.items() if value == -1]
        if filter_check:
            skip_key = filter_check[0]

    # using nr_groups as a total length might be one off due to the skip_key
    # (skipping a row in aggregation)
    # but that is okay normally
    nr_groups = len(values)
    if skip_key is None:
        # if we shouldn't skip a row, set it at the first row after the total number of groups
        skip_key = nr_groups

    return factor_carray, nr_groups, skip_key
def test00b(self):
    """Testing wheretrue() with a multidimensional array"""
    flags = bcolz.zeros((self.N, 10), dtype="bool", rootdir=self.rootdir)
    flags[30:40] = bcolz.ones(10, dtype="bool")
    # wheretrue() is expected to refuse a 2-d array.
    with self.assertRaises(NotImplementedError):
        flags.wheretrue()
def read_partpositions(filename, nspec, ctable=True, clevel=5, cname="lz4",
                       quantize=None):
    """Read the particle positions in `filename`.

    This function strives to use as little memory as possible; for this, a
    bcolz ctable container is used for holding the data.  Besides being
    compressed in-memory, its chunked nature makes it a natural fit for
    data that needs to be appended because it does not need expensive
    memory resize operations.

    NOTE: This code reads directly from an UNFORMATTED SEQUENTIAL data
    Fortran file so care has been taken to skip the record length at the
    beginning and the end of every record.  See:
    http://stackoverflow.com/questions/8751185/fortran-unformatted-file-format

    Parameters
    ----------
    filename : string
        The file name of the particle raw data
    nspec : int
        number of species in particle raw data
    ctable : bool
        Return a bcolz ctable container.  If not, a numpy structured array
        is returned instead.
    clevel : int
        Compression level for the ctable container
    cname : string
        Codec name for the ctable container.  Can be 'blosclz', 'lz4',
        'zlib' or 'zstd'.
    quantize : int
        Quantize data to improve (lossy) compression.  Data is quantized
        using np.around(scale*data)/scale, where scale is 2**bits, and
        bits is determined from the quantize value.  For example, if
        quantize=1, bits will be 4.  0 (or None) means that the
        quantization is disabled.

    Returns
    -------
    ctable object OR structured_numpy_array

    Returning a ctable is preferred because it is used internally so it
    does not require to be converted to other formats, so it is faster and
    uses less memory.

    Note: Passing a `quantize` param > 0 can increase the compression ratio
    of the ctable container, but it may also slow down the reading speed
    significantly.

    License
        This function is taken from the reflexible package
        (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions
        Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    CHUNKSIZE = 10 * 1000  # records read from the file per iteration
    xmass_dtype = [('xmass_%d' % (i + 1), 'f4') for i in range(nspec)]
    # note age is calculated from itramem by adding itimein
    out_fields = [('npoint', 'i4'), ('xtra1', 'f4'), ('ytra1', 'f4'),
                  ('ztra1', 'f4'), ('itramem', 'i4'), ('topo', 'f4'),
                  ('pvi', 'f4'), ('qvi', 'f4'), ('rhoi', 'f4'),
                  ('hmixi', 'f4'), ('tri', 'f4'), ('tti', 'f4')] + xmass_dtype
    # Every record on disk is wrapped by its length, before and after.
    raw_fields = ([('begin_recsize', 'i4')] + out_fields +
                  [('end_recsize', 'i4')])
    raw_rectype = np.dtype(raw_fields)
    recsize = raw_rectype.itemsize

    cparams = bcolz.cparams(clevel=clevel, cname=cname)
    if quantize is not None and quantize > 0:
        out = get_quantized_ctable(raw_rectype, cparams=cparams,
                                   quantize=quantize, expectedlen=int(1e6))
    else:
        out = bcolz.zeros(0, dtype=raw_rectype, cparams=cparams,
                          expectedlen=int(1e6))

    # Default buffering: line buffering (buffering=1) is not supported for
    # binary files and only triggers a RuntimeWarning on recent Pythons.
    with open(filename, "rb") as f:
        # The timein value is at the beginning of the file, stored as a
        # 4-byte record: <reclen> <itimein> <reclen>
        reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0]
        assert reclen == 4
        f.read(4)  # itimein itself is not needed here; skip it
        reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0]
        assert reclen == 4
        while True:
            # Try to read a complete chunk
            data = f.read(CHUNKSIZE * recsize)
            # the actual number of complete records read
            read_records = len(data) // recsize
            chunk = np.ndarray(shape=(read_records, ), buffer=data,
                               dtype=raw_rectype)
            # Add the chunk to the out array
            out.append(chunk)
            if read_records < CHUNKSIZE:
                # We reached the end of the file
                break

    # Truncate at the max length (last row is always a sentinel, so remove it)
    out.trim(1)
    # Remove the first and last columns
    out.delcol("begin_recsize")
    out.delcol("end_recsize")

    if ctable:
        return out
    else:
        return out[:]
def make_group_index(self, factor_list, values_list, groupby_cols,
                     array_length, bool_arr):
    """Create unique groups for groupby loop

    Parameters
    ----------
    factor_list : list
        Factorized columns, one per groupby column.
    values_list : list
        For each factorized column, its collection of unique values
        (only its length is used when combining several columns).
    groupby_cols : list of str
        Column names; used as variable names in the combined expression.
    array_length : int
        Length of the all-zero index built when there is nothing to
        group by.
    bool_arr : array-like or None
        Optional row filter; filtered-out rows get the factor -1.

    Returns
    -------
    tuple
        ``(factor_carray, nr_groups, skip_key)``.  ``skip_key`` is the
        group key mapped to -1 after filtering, or ``nr_groups`` when no
        group has to be skipped.
    """
    n_factors = len(factor_list)
    if n_factors == 0:
        # no columns to groupby over, so directly aggregate the measure
        # columns to 1 total (index 0/zero)
        factor_carray = bcolz.zeros(array_length, dtype='int64')
        values = ['Total']
    elif n_factors == 1:
        # single column groupby, the groupby output column
        # here is 1:1 to the values
        factor_carray = factor_list[0]
        values = values_list[0]
    else:
        # multi column groupby
        # nb: this might also be cached in the future
        # first combine the factorized columns to single values
        factor_set = dict(zip(groupby_cols, factor_list))

        # create a numexpr expression that calculates the place on
        # a cartesian join index: the last column has weight 1, each
        # earlier column is weighted by the cardinalities that follow it
        terms = []
        weight = 1
        for col, col_values in zip(reversed(groupby_cols),
                                   reversed(values_list)):
            terms.append(str(weight) + '*' + col)
            weight *= len(col_values)
        eval_str = ' + '.join(terms)

        # calculate the cartesian group index for each row
        factor_input = bcolz.eval(eval_str, user_dict=factor_set)

        # now factorize the unique groupby combinations
        factor_carray, values = ctable_ext.factorize(factor_input)

    skip_key = None

    if bool_arr is not None:
        # make all non relevant combinations -1
        factor_carray = bcolz.eval('(factor + 1) * bool - 1',
                                   user_dict={
                                       'factor': factor_carray,
                                       'bool': bool_arr
                                   })
        # now check how many unique values there are left
        factor_carray, values = ctable_ext.factorize(factor_carray)
        # values might contain one value too much (-1) (no direct lookup
        # possible because values is a reversed dict)
        # .items() works on both Python 2 and 3 (iteritems() is 2-only).
        filter_check = \
            [key for key, value in values.items() if value == -1]
        if filter_check:
            skip_key = filter_check[0]

    # using nr_groups as a total length might be one off due to the
    # skip_key (skipping a row in aggregation), but that is okay normally
    nr_groups = len(values)
    if skip_key is None:
        # if we shouldn't skip a row, set it at the first row after the
        # total number of groups
        skip_key = nr_groups

    return factor_carray, nr_groups, skip_key
def make_group_index(self, groupby_cols, bool_arr):
    '''Create unique groups for groupby loop

    Args:
        groupby_cols: names of the columns to group by; they are
            factorized through ``self.factorize_groupby_cols``
        bool_arr: optional row filter (``None`` for no filtering);
            filtered-out rows get the factor -1

    Returns:
        carray: (carray_factor)
        int: (nr_groups) the number of resulting groups
        int: (skip_key) the key mapped to -1 after filtering, or
            ``nr_groups`` when no group has to be skipped

    '''
    factor_list, values_list = self.factorize_groupby_cols(groupby_cols)
    n_factors = len(factor_list)

    # create unique groups for groupby loop
    if n_factors == 0:
        # nothing to group by: every row belongs to the single 'Total' group
        carray_factor = bcolz.zeros(len(self), dtype='int64',
                                    rootdir=self.create_tmp_rootdir(),
                                    mode='w')
        carray_values = ['Total']
    elif n_factors == 1:
        # one column: its factorization already is the group index
        carray_factor = factor_list[0]
        carray_values = values_list[0]
    elif self.group_cache_valid(col_list=groupby_cols):
        # several columns with a usable group cache on disk
        cache_base = os.path.join(self.rootdir,
                                  self.create_group_base_name(groupby_cols))
        carray_factor = bcolz.carray(rootdir=cache_base + '.factor')
        carray_values = bcolz.carray(rootdir=cache_base + '.values')
    else:
        # several columns, no cache: build a brand new combination
        carray_factor, carray_values = self.create_group_column_factor(
            factor_list, groupby_cols, cache=self.auto_cache)

    nr_groups = len(carray_values)
    skip_key = None

    if bool_arr is not None:
        # make all non relevant combinations -1
        filtered_rootdir = self.create_tmp_rootdir()
        carray_factor = bcolz.eval(
            '(factor + 1) * bool - 1',
            user_dict={'factor': carray_factor, 'bool': bool_arr},
            rootdir=filtered_rootdir, mode='w')
        # now check how many unique values there are left
        labels_rootdir = self.create_tmp_rootdir()
        labels = bcolz.carray([], dtype='int64',
                              expectedlen=len(carray_factor),
                              rootdir=labels_rootdir, mode='w')
        carray_factor, values = ctable_ext.factorize(carray_factor, labels)
        # values might contain one value too much (-1) (no direct lookup
        # possible because values is a reversed dict)
        skip_key = next(
            (key for key, value in values.items() if value == -1), None)
        # the new nr of groups depends on the outcome after filtering
        nr_groups = len(values)

    # using nr_groups as a total length might be one off due to the
    # skip_key (skipping a row in aggregation), but that is okay normally
    if skip_key is None:
        # nothing to skip: point at the first row after the last group
        skip_key = nr_groups

    return carray_factor, nr_groups, skip_key
# Benchmark to check the creation of an array of length > 2**32 (5e9)

import sys
from time import time

import bcolz

if sys.version_info >= (3,0):
    long = int

ROOTDIR = 'large_carray-bench'
NELEMS = 5e9

# Create the on-disk array (the in-memory variant would be
# bcolz.zeros(5e9, dtype="i1") without rootdir/mode)
t0 = time()
cn = bcolz.zeros(NELEMS, dtype="i1", rootdir=ROOTDIR, mode='w')
print("Creation time:", round(time() - t0, 3))
print("len:", len(cn))
assert len(cn) == int(5e9)

# Re-open the array that was just written
t0 = time()
cn = bcolz.carray(rootdir=ROOTDIR, mode='a')
print("Re-open time:", round(time() - t0, 3))
print("len(cn)", len(cn))
assert len(cn) == int(5e9)

# Now check some accesses: write a value, then read it back
for position, value in ((1, 1), (int(2e9), 2), (long(3e9), 3)):
    cn[position] = value
    assert cn[position] == value
def create_group_column_factor(self, factor_list, groupby_cols, cache=False):
    """
    Create a unique, factorized column out of several individual columns

    Parameters
    ----------
    factor_list
        Factorized columns; combined into a ctable named by `groupby_cols`.
    groupby_cols
        Names of the groupby columns, in the same order as `factor_list`.
    cache
        When true, the results are moved next to the table under
        ``<rootdir>/<group base name>.factor`` / ``.values``, provided the
        ``.lock`` file can be claimed.

    Returns
    -------
    tuple
        ``(carray_factor, carray_values)``: the factorized group index
        and the carray built from ``values.values()``.

    """
    if not self.rootdir:
        # in-memory scenario
        input_rootdir = None
        col_rootdir = None
        col_factor_rootdir = None
        col_values_rootdir = None
        col_factor_rootdir_tmp = None
        col_values_rootdir_tmp = None
    else:
        # temporary
        input_rootdir = tempfile.mkdtemp(prefix='bcolz-')
        col_factor_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')
        col_values_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')

    # create combination of groupby columns
    group_array = bcolz.zeros(0, dtype=np.int64, expectedlen=len(self), rootdir=input_rootdir, mode='w')
    factor_table = bcolz.ctable(factor_list, names=groupby_cols)
    ctable_iter = factor_table.iter(outcols=groupby_cols, out_flavor=tuple)
    ctable_ext.create_group_index(ctable_iter, len(groupby_cols), group_array)

    # now factorize the results
    carray_factor = \
        bcolz.carray([], dtype='int64', expectedlen=self.size, rootdir=col_factor_rootdir_tmp, mode='w')
    carray_factor, values = ctable_ext.factorize(group_array, labels=carray_factor)
    carray_factor.flush()

    carray_values = \
        bcolz.carray(np.fromiter(values.values(), dtype=np.int64), rootdir=col_values_rootdir_tmp, mode='w')
    carray_values.flush()

    del group_array
    if cache:
        # clean up the temporary file
        # NOTE(review): input_rootdir is only removed when cache is true --
        # confirm the non-cached temporary directory is cleaned elsewhere.
        rm_file_or_dir(input_rootdir, ignore_errors=True)

    if cache:
        # official end destination
        # NOTE(review): self.rootdir is joined unconditionally here, so
        # cache=True presumably requires an on-disk table -- confirm.
        col_rootdir = os.path.join(self.rootdir, self.create_group_base_name(groupby_cols))
        col_factor_rootdir = col_rootdir + '.factor'
        col_values_rootdir = col_rootdir + '.values'
        lock_file = col_rootdir + '.lock'

        # only works for linux
        # Claim the lock by appending a fresh uuid and checking that it is
        # the first line of the lock file.
        if not os.path.exists(lock_file):
            uid = str(uuid.uuid4())
            try:
                with open(lock_file, 'a+') as fn:
                    fn.write(uid + '\n')
                with open(lock_file, 'r') as fn:
                    temp = fn.read().splitlines()
                if temp[0] == uid:
                    lock = True
                else:
                    lock = False
                del temp
            except:
                # any failure while claiming the lock counts as "not locked"
                lock = False
        else:
            lock = False

        if lock:
            # replace any previous cache by the freshly built arrays and
            # re-open them read-only from their final location
            rm_file_or_dir(col_factor_rootdir, ignore_errors=False)
            shutil.move(col_factor_rootdir_tmp, col_factor_rootdir)
            carray_factor = bcolz.carray(rootdir=col_factor_rootdir, mode='r')

            rm_file_or_dir(col_values_rootdir, ignore_errors=False)
            shutil.move(col_values_rootdir_tmp, col_values_rootdir)
            carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
        else:
            # another process has a lock, we will work with our current files and clean up later
            # NOTE(review): the final cache paths (not the *_tmp ones) are
            # queued for cleanup here -- confirm this is intended.
            self._dir_clean_list.append(col_factor_rootdir)
            self._dir_clean_list.append(col_values_rootdir)

    return carray_factor, carray_values
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, blen,
                 **kwargs):
    """Perform the evaluation in blocks.

    Parameters
    ----------
    expression : str
        Expression to evaluate.
    vars : dict
        Maps variable names to operands.  With the "dask" vm this dict is
        modified in place (sequence operands are wrapped and a 'da' entry
        is added).
    vlen : int
        Number of rows to cover; the block loop runs over ``range(0, vlen, blen)``.
    typesize : int
        Divisor used to turn the default block size into a block length.
    vm : str
        "numexpr", "dask" or anything else for the python evaluator.
    out_flavor : str
        "bcolz"/"carray" to build a carray; otherwise a numpy array.
    blen : int
        Block length in elements; a falsy value selects a default.
    **kwargs
        Forwarded to the carray constructors (``expectedlen`` is popped
        and defaults to `vlen`).

    Returns
    -------
    A carray, a numpy array, or ``result[()]`` when the first block
    evaluated to a 0-dimensional result.
    """

    if not blen:
        # Compute the optimal block size (in elements)
        # The next is based on experiments with bench/ctable-query.py
        # and the 'movielens-bench' repository
        if vm == "numexpr":
            bsize = 2**23
        elif vm == "dask":
            bsize = 2**25
        else:  # python
            bsize = 2**21
        blen = int(bsize / typesize)
        # Protection against too large atomsizes
        if blen == 0:
            blen = 1

    if vm == "dask":
        if 'da' in vars:
            raise NameError("'da' is reserved as a prefix for dask.array. "
                            "Please use another prefix")
        for name in vars:
            var = vars[name]
            if is_sequence_like(var):
                vars[name] = da.from_array(var, chunks=(blen, ) + var.shape[1:])
        # Build the expression graph
        vars['da'] = da
        da_expr = _eval(expression, vars)
        if out_flavor in ("bcolz", "carray") and da_expr.shape:
            result = bcolz.zeros(da_expr.shape, da_expr.dtype, **kwargs)
            # Store while compute expression graph
            da.store(da_expr, result)
            return result
        else:
            # Store while compute
            return np.array(da_expr)

    # Check whether we have a re_evaluate() function in numexpr
    re_evaluate = bcolz.numexpr_here and hasattr(bcolz.numexpr, "re_evaluate")

    vars_ = {}
    # Get containers for vars
    maxndims = 0
    for name in vars:
        var = vars[name]
        if is_sequence_like(var):
            # element dimensions of the dtype count as extra dimensions
            ndims = len(var.shape) + len(var.dtype.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > blen and hasattr(var, "_getrange"):
                # reusable buffer of one block that _getrange() fills
                shape = (blen, ) + var.shape[1:]
                vars_[name] = np.empty(shape, dtype=var.dtype)

    for i in xrange(0, vlen, blen):
        # Fill buffers for vars
        for name in vars:
            var = vars[name]
            if is_sequence_like(var) and len(var) > blen:
                if hasattr(var, "_getrange"):
                    if i + blen < vlen:
                        var._getrange(i, blen, vars_[name])
                    else:
                        # last block: take whatever is left
                        vars_[name] = var[i:]
                else:
                    vars_[name] = var[i:i + blen]
            else:
                # operand is not longer than one block: use it whole
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        if vm == "python":
            res_block = _eval(expression, vars_)
        else:
            if i == 0 or not re_evaluate:
                try:
                    res_block = bcolz.numexpr.evaluate(expression,
                                                       local_dict=vars_)
                except ValueError:
                    # numexpr cannot handle this, so fall back to "python" vm
                    warnings.warn(
                        "numexpr cannot handle this expression: falling back "
                        "to the 'python' virtual machine.  You can choose "
                        "another virtual machine by using the `vm` parameter.")
                    return _eval_blocks(expression, vars, vlen, typesize,
                                        "python", out_flavor, blen, **kwargs)
            else:
                res_block = bcolz.numexpr.re_evaluate(local_dict=vars_)

        if i == 0:
            # Detection of reduction operations
            scalar = False
            dim_reduction = False
            if len(res_block.shape) == 0:
                scalar = True
                result = res_block
                continue
            elif len(res_block.shape) < maxndims:
                dim_reduction = True
                result = res_block
                continue
            # Get a decent default for expectedlen
            if out_flavor in ("bcolz", "carray"):
                nrows = kwargs.pop('expectedlen', vlen)
                result = bcolz.carray(res_block, expectedlen=nrows, **kwargs)
            else:
                # preallocate the full output and store the first block
                out_shape = list(res_block.shape)
                out_shape[0] = vlen
                result = np.empty(out_shape, dtype=res_block.dtype)
                result[:blen] = res_block
        else:
            if scalar or dim_reduction:
                # partial results of a reduction are combined by addition
                result += res_block
            elif out_flavor in ("bcolz", "carray"):
                result.append(res_block)
            else:
                result[i:i + blen] = res_block

    if isinstance(result, bcolz.carray):
        result.flush()
    if scalar:
        return result[()]
    return result
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, blen,
                 **kwargs):
    """Perform the evaluation in blocks.

    The operands in `vars` are consumed `blen` rows at a time, each block
    is evaluated with the selected virtual machine `vm` ("numexpr", "dask",
    or the python evaluator for any other value) and the block results are
    assembled into a carray (`out_flavor` "bcolz"/"carray") or a numpy
    array.  A falsy `blen` selects a default derived from `vm` and
    `typesize`.  Extra `kwargs` go to the carray constructors.

    If the first block yields a 0-dimensional result, or one with fewer
    dimensions than the widest operand, the expression is treated as a
    reduction and later blocks are added to the running result.
    """

    if not blen:
        # Compute the optimal block size (in elements)
        # The next is based on experiments with bench/ctable-query.py
        # and the 'movielens-bench' repository
        if vm == "numexpr":
            bsize = 2**23
        elif vm == "dask":
            bsize = 2**25
        else:  # python
            bsize = 2**21
        blen = int(bsize / typesize)
        # Protection against too large atomsizes
        if blen == 0:
            blen = 1

    if vm == "dask":
        if 'da' in vars:
            raise NameError(
                "'da' is reserved as a prefix for dask.array. "
                "Please use another prefix")
        # NOTE: `vars` is modified in place from here on
        for name in vars:
            var = vars[name]
            if is_sequence_like(var):
                vars[name] = da.from_array(var, chunks=(blen,) + var.shape[1:])
        # Build the expression graph
        vars['da'] = da
        da_expr = _eval(expression, vars)
        if out_flavor in ("bcolz", "carray") and da_expr.shape:
            result = bcolz.zeros(da_expr.shape, da_expr.dtype, **kwargs)
            # Store while compute expression graph
            da.store(da_expr, result)
            return result
        else:
            # Store while compute
            return np.array(da_expr)

    # Check whether we have a re_evaluate() function in numexpr
    re_evaluate = bcolz.numexpr_here and hasattr(bcolz.numexpr, "re_evaluate")

    vars_ = {}
    # Get containers for vars
    maxndims = 0
    for name in vars:
        var = vars[name]
        if is_sequence_like(var):
            ndims = len(var.shape) + len(var.dtype.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > blen and hasattr(var, "_getrange"):
                # block-sized scratch buffer, refilled on every iteration
                shape = (blen, ) + var.shape[1:]
                vars_[name] = np.empty(shape, dtype=var.dtype)

    for i in xrange(0, vlen, blen):
        # Fill buffers for vars
        for name in vars:
            var = vars[name]
            if is_sequence_like(var) and len(var) > blen:
                if hasattr(var, "_getrange"):
                    if i+blen < vlen:
                        var._getrange(i, blen, vars_[name])
                    else:
                        # final block: may be shorter than blen
                        vars_[name] = var[i:]
                else:
                    vars_[name] = var[i:i+blen]
            else:
                # short or scalar operand: passed whole to every block
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        if vm == "python":
            res_block = _eval(expression, vars_)
        else:
            if i == 0 or not re_evaluate:
                try:
                    res_block = bcolz.numexpr.evaluate(expression,
                                                       local_dict=vars_)
                except ValueError:
                    # numexpr cannot handle this, so fall back to "python" vm
                    warnings.warn(
                        "numexpr cannot handle this expression: falling back "
                        "to the 'python' virtual machine.  You can choose "
                        "another virtual machine by using the `vm` parameter.")
                    return _eval_blocks(
                        expression, vars, vlen, typesize, "python",
                        out_flavor, blen, **kwargs)
            else:
                # later blocks reuse the expression from the first call
                res_block = bcolz.numexpr.re_evaluate(local_dict=vars_)

        if i == 0:
            # Detection of reduction operations
            scalar = False
            dim_reduction = False
            if len(res_block.shape) == 0:
                scalar = True
                result = res_block
                continue
            elif len(res_block.shape) < maxndims:
                dim_reduction = True
                result = res_block
                continue
            # Get a decent default for expectedlen
            if out_flavor in ("bcolz", "carray"):
                nrows = kwargs.pop('expectedlen', vlen)
                result = bcolz.carray(res_block, expectedlen=nrows, **kwargs)
            else:
                # numpy output: allocate all vlen rows up front
                out_shape = list(res_block.shape)
                out_shape[0] = vlen
                result = np.empty(out_shape, dtype=res_block.dtype)
                result[:blen] = res_block
        else:
            if scalar or dim_reduction:
                # reductions accumulate by addition across blocks
                result += res_block
            elif out_flavor in ("bcolz", "carray"):
                result.append(res_block)
            else:
                result[i:i+blen] = res_block

    if isinstance(result, bcolz.carray):
        result.flush()
    if scalar:
        return result[()]
    return result