def compress_data(original_class):
    import bcolz
    cparams = bcolz.cparams(clevel=4, shuffle=1, cname="blosclz")
    orig_prepare = original_class.prepare

    def prepare(self, *args, **kwargs):
        orig_prepare(self, *args, **kwargs)
        for key in self.data.keys():
            self.data[key] = bcolz.carray(self.data[key], cparams=cparams)

    # replace the class' prepare() with the compressing wrapper
    original_class.prepare = prepare
    return original_class
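# A minimal usage sketch for the decorator above; the class name and its
# prepare()/data attributes are illustrative assumptions, not from the
# original source:
import numpy as np

@compress_data
class ExampleLoader(object):
    def prepare(self):
        self.data = {"x": np.arange(1000000)}

loader = ExampleLoader()
loader.prepare()  # every value in loader.data is now a compressed carray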
def setChannelData(self, channelName, data, compression=False):
    """Modify the data of a channel.

    Parameters
    ----------
    channelName : str
        channel name
    data : numpy array
        channel data
    compression : bool, int or str
        trigger for data compression; an int gives the compression level
    """
    if compression and CompressionPossible:
        if not isinstance(compression, str):
            if isinstance(compression, int):
                comp = compression
            else:
                comp = self._compression_level
            temp = carray(data, cparams=cparams(clevel=comp),
                          expectedlen=int(getsizeof(data) / 10))
        else:
            temp = compressed_data()
            temp.compression(data)
        self._setChannel(channelName, temp, field=dataField)
    else:
        self._setChannel(channelName, data, field=dataField)
def __init__(self, ncols, names, meta_data={}, *args, **kwargs):
    self._type = 'BColz'
    self._ncols = ncols
    self._colnames = names
    self._meta_data = meta_data
    self._cparams = kwargs.pop("cparams", bcolz.cparams())
    self._schema = kwargs.pop("schema", None)
    if not isinstance(self._schema, BcolzSchema):
        raise ValueError("Illegal or no schema supplied.")
    if not isinstance(self._cparams, bcolz.toplevel.cparams):
        try:
            self._cparams = bcolz.cparams(**self._cparams)
        except (TypeError, NameError):
            raise ValueError("Illegal compression params supplied.")
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return a ctable with the quantize filter enabled for floating point cols.

    License
        This function is taken from the reflexible package
        (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions
        Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    columns, names = [], []
    for fname, ftype in dtype.descr:
        names.append(fname)
        if 'f' in ftype:
            cparams2 = bcolz.cparams(clevel=cparams.clevel,
                                     cname=cparams.cname, quantize=quantize)
            columns.append(bcolz.zeros(0, dtype=ftype, cparams=cparams2,
                                       expectedlen=expectedlen))
        else:
            columns.append(bcolz.zeros(0, dtype=ftype, cparams=cparams,
                                       expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)
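# Hedged usage sketch for get_quantized_ctable(): only the float column gets
# the quantize filter, while the int column keeps the original cparams (the
# dtype and values here are illustrative):
import numpy as np
import bcolz

dt = np.dtype([('id', 'i4'), ('temp', 'f4')])
ct = get_quantized_ctable(dt, bcolz.cparams(clevel=5, cname='lz4'),
                          quantize=3, expectedlen=1000)
ct.append(np.array([(1, 21.3456789)], dtype=dt))
print(ct['temp'][0])  # stored with roughly 3 significant digits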
def __init__(self, columns=None, names=None, **kwargs):
    # Important optional params
    self._cparams = kwargs.get("cparams", bcolz.cparams())
    self.rootdir = kwargs.get("rootdir", None)
    "The directory where this object is saved."
    if self.rootdir is None and columns is None:
        raise ValueError(
            "For creating a new ctable you should pass a `columns` param")
    if self.rootdir is not None and os.path.exists(self.rootdir):
        self.mode = kwargs.setdefault("mode", "a")
    else:
        self.mode = kwargs.setdefault("mode", "w")
    "The mode in which the object is created/opened."

    # Setup the columns accessor
    self.cols = cols(self.rootdir, self.mode)
    "The ctable columns accessor."
    # The length counter of this array
    self.len = 0

    # Create a new ctable or open it from disk
    _new = False
    if self.mode in ("r", "a"):
        self.open_ctable()
    elif columns is not None:
        self.create_ctable(columns, names, **kwargs)
        _new = True

    # Attach the attrs to this object
    self.attrs = attrs.attrs(self.rootdir, self.mode, _new=_new)

    # Cache a structured array of len 1 for ctable[int] acceleration
    self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
def _init_ctable(self, path):
    """Create an empty ctable for the given path.

    Obtain / create / append / set attrs on an empty ctable for the path.

    addcol(newcol[, name, pos, move])
        Add a new `newcol` object as a column.
    append(cols)
        Append `cols` (e.g. a ctable) to this ctable.
    flush()
        Flush data in internal buffers to disk. This call should typically
        be done after performing modifications (__setitem__(), append())
        in persistence mode. If you don't do this, you risk losing part of
        your modifications.

    Parameters
    ----------
    path : string
        The path to the rootdir of the new ctable.
    """
    bcolz_dir = os.path.dirname(path)
    print('bcolz_dir', bcolz_dir)
    if not os.path.exists(bcolz_dir):
        os.makedirs(bcolz_dir)
    print('path', path)
    initial_array = np.empty(0, np.uint32)
    # Configure bcolz threading
    bcolz.set_nthreads(Num * bcolz.detect_number_of_cores())
    # Print all the versions of packages that bcolz relies on.
    bcolz.print_versions()
    # cparams reference:
    #   clevel : int (0 <= clevel < 10)
    #       The compression level.
    #   shuffle : int
    #       The shuffle filter to be activated. Allowed values are
    #       bcolz.NOSHUFFLE (0), bcolz.SHUFFLE (1) and bcolz.BITSHUFFLE (2).
    #       The default is bcolz.SHUFFLE.
    #   cname : string ('blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd')
    #       Select the compressor to use inside Blosc.
    #   quantize : int (number of significant digits)
    #       Quantize data to improve (lossy) compression. Data is quantized
    #       using np.around(scale*data)/scale, where scale is 2**bits, and
    #       bits is determined from the quantize value. For example, if
    #       quantize=1, bits will be 4. 0 means quantization is disabled.
    #   default: cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
    params = bcolz.cparams(clevel=9)
    table = bcolz.ctable(
        rootdir=path,
        columns=[initial_array, initial_array, initial_array, initial_array,
                 initial_array, initial_array, initial_array],
        names=self._bcolz_fields,
        mode='w',
        cparams=params)
    print('cparams', table.cparams)
    table.flush()
    table = self._init_attr(table, path)
    # table.attrs['metadata'] = self._init_metadata(path)
    return table
def save_bcolz(data, rootdir):
    data_bcolz = bcolz.carray(array=data,
                              # chunklen=data.shape[0],
                              dtype="uint8",
                              cparams=bcolz.cparams(clevel=1, cname="zlib"),  # lz4hc zlib blosc
                              rootdir=rootdir,
                              mode="w")
    data_bcolz.flush()
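# Hedged counterpart sketch: reading back what save_bcolz() wrote above.
# bcolz.open() on a carray rootdir returns the carray; slicing decompresses
# it into a numpy array (the function name is illustrative):
def load_bcolz(rootdir):
    import bcolz
    return bcolz.open(rootdir, mode="r")[:]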
def read_releases_v10(pathname):
    """Parse the release file in `pathname` and return a ctable with its contents.

    This is only suited for files in Fortran90 namelist format (FP v10).

    Parameters
    ----------
    pathname : string
        Release file name (in Fortran90 namelist format).

    Returns
    -------
    A ctable object from the bcolz package.
    """
    import bcolz

    # Setup the container for the data
    dtype = [('IDATE1', np.int32), ('ITIME1', np.int32),
             ('IDATE2', np.int32), ('ITIME2', np.int32),
             ('LON1', np.float32), ('LON2', np.float32),
             ('LAT1', np.float32), ('LAT2', np.float32),
             ('Z1', np.float32), ('Z2', np.float32),
             ('ZKIND', np.int8), ('MASS', np.float32),
             ('PARTS', np.int32), ('COMMENT', 'S32')]
    cparams = bcolz.cparams(cname="lz4", clevel=6, shuffle=1)
    ctable = bcolz.zeros(0, dtype=dtype, cparams=cparams)
    nrecords = ctable['IDATE1'].chunklen
    releases = np.zeros(nrecords, dtype=dtype)

    # Prepare for reading the input
    input_str = open(pathname, 'r').read()
    marker = "&RELEASE\n"
    len_marker = len(marker)
    release_re = r'\S+=\s+[\"|\s](\S+)[,|\"|\w]'

    # Loop over all the marker groups
    i, n = 0, 0
    while True:
        i = input_str.find(marker, i)
        j = input_str.find(marker, i + 1)
        n += 1
        group_block = input_str[i + len_marker:j]
        i = j
        values = tuple(re.findall(release_re, group_block))
        try:
            releases[(n - 1) % nrecords] = values
        except ValueError:
            print("Problem at: group: %d, %s" % (n, group_block))
            print("values:", values)
            raise
        if (n % nrecords) == 0:
            ctable.append(releases)
        if (i == -1) or (j == -1):
            break  # marker is not found anymore

    # Remainder
    ctable.append(releases[:n % nrecords])
    ctable.flush()

    return ctable
def save_pred(fpath, pred_arr, meta_dict=None):
    bc = bcolz.carray(pred_arr, mode='w', rootdir=fpath,
                      cparams=bcolz.cparams(clevel=9, cname='lz4'))
    if meta_dict is not None:
        bc.attrs['meta'] = meta_dict
    bc.flush()
    return bc
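# Hedged counterpart sketch for save_pred(): carray.attrs is persisted under
# the rootdir, so the metadata survives a reopen (the function name here is
# illustrative):
def load_pred(fpath):
    import bcolz
    bc = bcolz.open(fpath, mode='r')
    try:
        meta = bc.attrs['meta']
    except KeyError:
        meta = None
    return bc[:], meta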
def compute_bcolz(sexpr, clevel, vm):
    # Uncomment the next for disabling threading
    # bcolz.set_nthreads(1)
    # bcolz.blosc_set_nthreads(1)
    print("*** bcolz (using compression clevel = %d):" % clevel)
    x = cx  # comment this for using numpy arrays in inputs
    t0 = time()
    cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, time() - t0,))
def create_bcolz(arr, dirname):
    cparams = bcolz.cparams(clevel=5, cname='lz4')
    ca = bcolz.carray(arr, rootdir=dirname, mode='w', cparams=cparams,
                      chunklen=1)
    ca.flush()
    return ca
def __init__(self, columns=None, names=None, **kwargs):
    # Important optional params
    self._cparams = kwargs.get('cparams', bcolz.cparams())
    self.rootdir = kwargs.get('rootdir', None)
    if self.rootdir is not None:
        self.auto_flush = kwargs.pop('auto_flush', True)
    else:
        self.auto_flush = False
        # We actually need to pop it from the kwargs, so it doesn't get
        # passed down to the carray.
        try:
            kwargs.pop('auto_flush')
        except KeyError:
            pass
    "The directory where this object is saved."
    if self.rootdir is None and columns is None:
        raise ValueError(
            "You should pass either a `columns` or a `rootdir` param"
            " at very least")
    # The mode in which the object is created/opened
    if self.rootdir is not None and os.path.exists(self.rootdir):
        self.mode = kwargs.setdefault('mode', 'a')
        if columns is not None and self.mode == 'a':
            raise ValueError(
                "You cannot pass a `columns` param in 'a'ppend mode.\n"
                "(If you are trying to create a new ctable, perhaps the "
                "directory exists already.)")
    else:
        self.mode = kwargs.setdefault('mode', 'w')

    # Setup the columns accessor
    self.cols = cols(self.rootdir, self.mode)
    "The ctable columns accessor."
    # The length counter of this array
    self.len = 0

    # Create a new ctable or open it from disk
    _new = False
    if self.mode in ('r', 'a'):
        self._open_ctable()
    elif columns is not None:
        self._create_ctable(columns, names, **kwargs)
        _new = True
    else:
        raise ValueError(
            "You cannot open a ctable in 'w'rite mode"
            " without a `columns` param")

    # Attach the attrs to this object
    self.attrs = attrs.attrs(self.rootdir, self.mode, _new=_new)

    # Cache a structured array of len 1 for ctable[int] acceleration
    self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
def compute_carray(sexpr, clevel, vm):
    # Uncomment the next for disabling threading
    # Maybe due to some contention between Numexpr and Blosc?
    # bcolz.set_nthreads(bcolz.ncores//2)
    print("*** carray (using compression clevel = %d):" % clevel)
    if clevel > 0:
        x, y, z = cx, cy, cz
    t0 = time()
    cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, time() - t0,), end="")
    print(", cratio (out): %.1f" % (cout.nbytes / float(cout.cbytes)))
def compute_bcolz(sexpr, clevel, vm):
    # Uncomment the next for disabling threading
    # bcolz.set_nthreads(1)
    # bcolz.blosc_set_nthreads(1)
    print("*** bcolz (using compression clevel = %d):" % clevel)
    x = cx  # comment this for using numpy arrays in inputs
    t0 = time()
    cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    print("Time for bcolz.eval (%s) --> %.3f" % (
        vm, time() - t0, ))
def test_ctable(clevel):
    enter()
    tc = bcolz.fromiter(
        (mv + np.random.rand(NC) - mv for i in xrange(int(NR))),
        dtype=dt, cparams=bcolz.cparams(clevel, cname=cname), count=int(NR))
    after_create()

    out = np.fromiter((row for row in tc.where(squery, 'f1,f3')),
                      dtype="f8,f8")
    after_query()
    return out
def test01a(self):
    """Testing `__setitem__()` method with start,stop (scalar)"""
    a = np.ones((500, 200), dtype="i4") * 3
    b = bcolz.fill((500, 200), 3, dtype="i4", rootdir=self.rootdir,
                   cparams=bcolz.cparams())
    sl = slice(100, 400)
    a[sl, :] = 0
    b[sl] = 0
    if self.open:
        b.flush()
        b = bcolz.open(rootdir=self.rootdir)
    # print "b[sl]->", `b[sl]`
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return a ctable with the quantize filter enabled for floating point cols.
    """
    import bcolz

    columns, names = [], []
    for fname, ftype in dtype.descr:
        names.append(fname)
        if 'f' in ftype:
            cparams2 = bcolz.cparams(clevel=cparams.clevel,
                                     cname=cparams.cname, quantize=quantize)
            columns.append(bcolz.zeros(0, dtype=ftype, cparams=cparams2,
                                       expectedlen=expectedlen))
        else:
            columns.append(bcolz.zeros(0, dtype=ftype, cparams=cparams,
                                       expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)
def __init__(self,
             path=None,
             journal=None,
             contiguity=None,
             # bcolz params
             expectedlen=None,
             chunklen=1024 ** 2 // 2,  # 500K rows
             cparams=bcolz.cparams(clevel=5, shuffle=False, cname='lz4hc')):
    super(JaggedByCarray, self).__init__(path, journal=journal,
                                         contiguity=contiguity)
    self.expectedlen = expectedlen
    self.chunklen = chunklen
    self.cparams = whatable(cparams, add_properties=True)
    self._bcolz = None
def csv_to_carray(self):
    list_csvs = WalkDir(self._srcdir)
    logger.info("total counts: %d", len(list_csvs))
    for csv in list_csvs:
        df = ReadFromCsv(pd_names, csv, 1, ',')
        # print(df)
        df['time'] = df['time'].map(timestamp_to_unix)
        # print(df)
        arr = np.array(df)
        dst_root = os.path.join(self._dstdir, os.path.basename(csv))
        carr = bcolz.carray(arr, chunklen=100 * 1024,
                            expectedlen=100 * 1024, rootdir=dst_root,
                            cparams=bcolz.cparams(quantize=1))
        carr.flush()
def df_to_carray(self, df, dir, name):
    '''
    :param df: dataframe with the data
    :param dir: target directory
    :param name: array name
    :return: carray
    '''
    arr = np.array(df)
    dst_root = os.path.join(dir, name)
    carr = bcolz.carray(arr, chunklen=100 * 1024, expectedlen=100 * 1024,
                        rootdir=dst_root, cparams=bcolz.cparams(quantize=1))
    return carr
def test_whatid():
    assert "JaggedByCarray(chunklen=1000," \
           "contiguity=None," \
           "cparams=cparams(clevel=3,cname='zlib',quantize=0,shuffle=False)," \
           "expectedlen=None)" \
           == JaggedByCarray(chunklen=1000,
                             cparams=bcolz.cparams(clevel=3, cname='zlib',
                                                   shuffle=False),
                             expectedlen=None).what().id()
    assert "JaggedByH5Py(checksum=False," \
           "chunklen=1000," \
           "compression='lzf'," \
           "compression_opts=0," \
           "contiguity=None," \
           "shuffle=True)" \
           == JaggedByH5Py(chunklen=1000,
                           compression='lzf',
                           compression_opts=0,
                           shuffle=True).what().id()
def __init__(self, data_element_shape, dtype, batch_size, save_path,
             length=None, append=False, kwargs={}):
    import bcolz
    super(bcolz_array_writer, self).__init__(None, data_element_shape,
                                             dtype, batch_size, length)
    self.save_path = save_path
    self.kwargs = kwargs

    # Set up array kwargs
    self.arr_kwargs = {'expectedlen': length,
                       'cparams': bcolz.cparams(clevel=5, shuffle=True,
                                                cname='blosclz'),
                       'dtype': dtype,
                       'rootdir': save_path}
    if kwargs is not None:
        self.arr_kwargs.update(kwargs)

    # Create the file-backed array, open for writing.
    # (check if the array exists; if not, create it)
    if append:
        try:
            self.storage_array = bcolz.open(self.save_path, mode='a')
            self.storage_array_ptr = len(self.storage_array)
        except FileNotFoundError:
            append = False
    if not append:
        try:
            self.storage_array = bcolz.zeros(shape=(0,) + data_element_shape,
                                             mode='w', **self.arr_kwargs)
            self.storage_array_ptr = 0
        except Exception:
            print("Error: failed to create file-backed bcolz storage "
                  "array.")
            raise
def test_constructor(self):

    # missing data arg
    with assert_raises(ValueError):
        # noinspection PyArgumentList
        GenotypeCArray()

    # data has wrong dtype
    data = 'foo bar'
    with assert_raises(TypeError):
        GenotypeCArray(data)

    # data has wrong dtype
    data = [4., 5., 3.7]
    with assert_raises(TypeError):
        GenotypeCArray(data)

    # data has wrong dimensions
    data = [1, 2, 3]
    with assert_raises(TypeError):
        GenotypeCArray(data)

    # data has wrong dimensions
    data = [[1, 2], [3, 4]]  # use HaplotypeCArray instead
    with assert_raises(TypeError):
        GenotypeCArray(data)

    # diploid data (typed)
    g = GenotypeCArray(diploid_genotype_data, dtype='i1')
    aeq(diploid_genotype_data, g)
    eq(np.int8, g.dtype)

    # polyploid data (typed)
    g = GenotypeCArray(triploid_genotype_data, dtype='i1')
    aeq(triploid_genotype_data, g)
    eq(np.int8, g.dtype)

    # cparams
    g = GenotypeCArray(diploid_genotype_data,
                       cparams=bcolz.cparams(clevel=10))
    aeq(diploid_genotype_data, g)
    eq(10, g.cparams.clevel)
def __init__(self, like, blockdivs, path=None, **kwargs):
    # Create directory
    if path is None:
        path = tempfile.mkdtemp('.pframe')
        self._explicitly_given_path = False
    else:
        # TODO: support loading of existing pframe
        os.mkdir(path)
        self._explicitly_given_path = True
    self.path = path
    self.blockdivs = tuple(blockdivs)

    # Store Metadata
    self.columns = like.columns
    self.dtypes = like.dtypes
    self.index_name = like.index.name
    self.categories = categorical_metadata(like)
    like2 = strip_categories(like.copy()).iloc[:10]

    if any(str(dt) == 'O' for dt in like.dtypes) or like.index.dtype == 'O':
        raise TypeError('Object dtypes not supported, consider categoricals')

    # Compression
    # TODO: Sane default compression
    if not kwargs:
        cp = bcolz.cparams(clevel=0, shuffle=False, cname=None)
        kwargs['cparams'] = cp

    # Create partitions
    npartitions = len(blockdivs) + 1
    logn = int(ceil(log(npartitions, 10)))
    subpath = 'part-%0' + str(logn) + 'd'
    self.partitions = [
        cframe(like2, rootdir=os.path.join(path, subpath % i), **kwargs)
        for i in range(npartitions)]

    self.lock = Lock()
def __init__(self, like, divisions, path=None, **kwargs):
    # Create directory
    if path is None:
        path = tempfile.mkdtemp('.pframe')
        self._explicitly_given_path = False
    else:
        # TODO: support loading of existing pframe
        os.mkdir(path)
        self._explicitly_given_path = True
    self.path = path
    self.divisions = tuple(divisions)

    # Store Metadata
    self.columns = like.columns
    self.dtypes = like.dtypes
    self.index_name = like.index.name
    self.categories = categorical_metadata(like)
    like2 = strip_categories(like.copy()).iloc[:10]

    if (any(str(dt) in ('O', 'object') for dt in like.dtypes) or
            str(like.index.dtype) in ('O', 'object')):
        raise TypeError('Object dtypes not supported, consider categoricals')

    # Compression
    # TODO: Sane default compression
    if not kwargs:
        cp = bcolz.cparams(clevel=0, shuffle=False, cname=None)
        kwargs['cparams'] = cp

    # Create partitions
    npartitions = len(divisions) + 1
    logn = int(ceil(log(npartitions, 10)))
    subpath = 'part-%0' + str(logn) + 'd'
    self.partitions = [cframe(like2,
                              rootdir=os.path.join(path, subpath % i),
                              **kwargs)
                       for i in range(npartitions)]

    self.lock = Lock()
def create_dataset(*, source_file, out_dir):
    data_frame_chunks = pd.read_csv(source_file, chunksize=1_000_000)
    first_chunk: pd.DataFrame = next(data_frame_chunks)
    _convert_df_to_32_bit(first_chunk)
    column_names = first_chunk.columns.tolist()
    # Note: To work around a bug when `names` is present but `columns` is
    # empty, construct this manually.
    table = bcolz.ctable.fromdataframe(
        first_chunk,
        # For some reason, higher compression levels actually perform worse.
        cparams=bcolz.cparams(clevel=3, cname="lz4hc", shuffle=1),
        rootdir=str(out_dir),
    )
    for next_chunk in data_frame_chunks:
        _convert_df_to_32_bit(next_chunk)
        table.append(cols=[next_chunk[col] for col in column_names])
    table.flush()
    num_rows = table.shape[0]
    size_mb = table.cbytes / (1024.0 ** 2)
    print(f"Created bcolz table with {num_rows} rows, compression settings "
          f"{table.cparams}, final size {size_mb:.1f} MiB")
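# Hedged read-back sketch for create_dataset(): a persisted ctable can be
# reopened lazily with bcolz.open() and materialized into pandas via
# todataframe() (the path here is illustrative):
table = bcolz.open("out_dir", mode="r")
df = table.todataframe()  # decompresses all columns into a DataFrame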
def test_constructor(self):

    # missing data arg
    with assert_raises(ValueError):
        # noinspection PyArgumentList
        HaplotypeCArray()

    # data has wrong dtype
    data = 'foo bar'
    with assert_raises(TypeError):
        HaplotypeCArray(data)

    # data has wrong dtype
    data = [4., 5., 3.7]
    with assert_raises(TypeError):
        HaplotypeCArray(data)

    # data has wrong dimensions
    data = [1, 2, 3]
    with assert_raises(TypeError):
        HaplotypeCArray(data)

    # data has wrong dimensions
    data = [[[1, 2], [3, 4]]]  # use GenotypeCArray instead
    with assert_raises(TypeError):
        HaplotypeCArray(data)

    # typed data (typed)
    h = HaplotypeCArray(haplotype_data, dtype='i1')
    aeq(haplotype_data, h)
    eq(np.int8, h.dtype)

    # cparams
    h = HaplotypeCArray(haplotype_data,
                        cparams=bcolz.cparams(clevel=10))
    aeq(haplotype_data, h)
    eq(10, h.cparams.clevel)
def test_constructor(self):

    # missing data arg
    with assert_raises(ValueError):
        # noinspection PyArgumentList
        AlleleCountsCArray()

    # data has wrong dtype
    data = 'foo bar'
    with assert_raises(TypeError):
        AlleleCountsCArray(data)

    # data has wrong dtype
    data = [4., 5., 3.7]
    with assert_raises(TypeError):
        AlleleCountsCArray(data)

    # data has wrong dimensions
    data = [1, 2, 3]
    with assert_raises(TypeError):
        AlleleCountsCArray(data)

    # data has wrong dimensions
    data = [[[1, 2], [3, 4]]]
    with assert_raises(TypeError):
        AlleleCountsCArray(data)

    # typed data (typed)
    ac = AlleleCountsCArray(allele_counts_data, dtype='u1')
    aeq(allele_counts_data, ac)
    eq(np.uint8, ac.dtype)

    # cparams
    ac = AlleleCountsCArray(allele_counts_data,
                            cparams=bcolz.cparams(clevel=10))
    aeq(allele_counts_data, ac)
    eq(10, ac.cparams.clevel)
    # print("cout-->", repr(cout))


if __name__ == "__main__":
    N = 1e8  # the number of elements in x
    clevel = 3  # the compression level
    sexpr = "(x+1)<0"
    sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0"
    # sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)"
    doprofile = 0

    print("Creating inputs...")
    x = np.arange(N)
    # x = np.linspace(0,100,N)
    cx = bcolz.carray(x, cparams=bcolz.cparams(clevel))

    print("Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N))))

    t0 = time()
    cout = ne.evaluate(sexpr)
    print("Time for numexpr --> %.3f" % (time() - t0,))

    if doprofile:
        import pstats
        import cProfile as prof

        prof.run('compute_bcolz(sexpr, clevel=clevel, vm="numexpr")',
                 # prof.run('compute_bcolz(sexpr, clevel=clevel, vm="python")',
                 'eval.prof')
        stats = pstats.Stats('eval.prof')
from time import time

import numpy as np

import bcolz

N = int(1e8)

# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

t0 = time()
sac = ac.sum()
# sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print "Time sum() carray (iter) --> %.3f" % (time()-t0)

print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
# Benchmark for iterators

from time import time

import numpy as np

import bcolz

N = 1e8  # the number of elements in x
clevel = 5  # the compression level
sexpr = "(x-1) < 10."  # the expression to compute
# sexpr = "((x-1) % 1000) == 0."  # the expression to compute
# sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute

cparams = bcolz.cparams(clevel)

print("Creating inputs...")

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    ct = bcolz.ctable((cx,), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating...", sexpr)
t0 = time()
    cPickle.dump(spacings, f, protocol=cPickle.HIGHEST_PROTOCOL)
with gzip.open(OUTPUT_FOLDER + "origins.pkl.gz", "wb") as f:
    cPickle.dump(origins, f, protocol=cPickle.HIGHEST_PROTOCOL)

# STAGE1
patients = os.listdir(INPUT_FOLDER_STAGE1)
patients.sort()
print len(patients), "patients"

for i, patient in enumerate(patients):
    scan = preptools.load_scan(INPUT_FOLDER_STAGE1 + patient)
    # scan = preptools.load_scan(INPUT_FOLDER_STAGE1 + patient, stop_before_pixels=True)
    pixels = preptools.get_pixels_hu(scan)
    spacing, flipped = preptools.get_spacing(scan[0])
    spacings[patient] = tuple(spacing)
    if flipped:
        pixels = pixels[::-1, :, :]
    # if i > -1: preptools.plot_3d(pixels, theshold=-500, spacing=spacing)
    data_bcolz = bcolz.carray(array=pixels,
                              chunklen=pixels.shape[0],
                              dtype="int16",
                              cparams=bcolz.cparams(clevel=1, cname="zlib"),  # lz4hc zlib blosc
                              rootdir=DATA_FOLDER + patient,
                              mode="w")
    data_bcolz.flush()
    print "%i/%i" % (i + 1, len(patients)), patient, spacing  # , pixels.shape

with gzip.open(OUTPUT_FOLDER + "spacings.pkl.gz", "wb") as f:
    cPickle.dump(spacings, f, protocol=cPickle.HIGHEST_PROTOCOL)
    suffix = kwargs.pop('suffix', '.bcolz')
    prefix = kwargs.pop('prefix', 'scikit_allel_')
    tempdir = kwargs.pop('dir', None)
    rootdir = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=tempdir)
    atexit.register(shutil.rmtree, rootdir)
    kwargs['rootdir'] = rootdir
    kwargs['mode'] = 'w'
    return kwargs


bcolz_storage = BcolzStorage()
"""bcolz storage with default parameters"""

bcolzmem_storage = BcolzMemStorage()
"""bcolz in-memory storage with default compression"""

bcolztmp_storage = BcolzTmpStorage()
"""bcolz temporary file storage with default compression"""

_zlib1 = bcolz.cparams(cname='zlib', clevel=1)

bcolz_zlib1_storage = BcolzStorage(cparams=_zlib1)
"""bcolz storage with zlib level 1 compression"""

bcolzmem_zlib1_storage = BcolzMemStorage(cparams=_zlib1)
"""bcolz in-memory storage with zlib level 1 compression"""

bcolztmp_zlib1_storage = BcolzTmpStorage(cparams=_zlib1)
"""bcolz temporary file storage with zlib level 1 compression"""

_util.storage_registry['bcolz'] = bcolz_storage
_util.storage_registry['bcolzmem'] = bcolzmem_storage
_util.storage_registry['bcolztmp'] = bcolztmp_storage
_util.storage_registry['bcolz_zlib1'] = bcolz_zlib1_storage
_util.storage_registry['bcolzmem_zlib1'] = bcolzmem_zlib1_storage
_util.storage_registry['bcolztmp_zlib1'] = bcolztmp_zlib1_storage
from time import time

import numpy as np

import bcolz

N = int(1e7)
CLEVEL = 5

a = np.linspace(0, 1, N)

t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(clevel=CLEVEL))
print("time creation (memory) ->", round(time() - t0, 3))
print("data (memory):", repr(ac))

t0 = time()
b = bcolz.carray(a, cparams=bcolz.cparams(clevel=CLEVEL),
                 rootdir='myarray', mode='w')
b.flush()
print("time creation (disk) ->", round(time() - t0, 3))
# print "meta (disk):", b.read_meta()

t0 = time()
an = np.array(a)
print("time creation (numpy) ->", round(time() - t0, 3))

t0 = time()
c = bcolz.carray(rootdir='myarray')
print("time open (disk) ->", round(time() - t0, 3))
import six
import numpy as np

import bcolz
from pybedtools import BedTool
import pyBigWig
from pysam import FastaFile

from .util import makedirs
from .util import one_hot_encode_sequence
from .util import nan_to_zero
from .tiledb_array import write_tiledb
from .tiledb_array import load_tiledb

NUM_SEQ_CHARS = 4

_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname="lz4")

_array_writer = {
    "numpy": lambda arr, path: np.save(path, arr),
    "bcolz": lambda arr, path: bcolz.carray(
        arr, rootdir=path, cparams=_blosc_params, mode="w").flush(),
    "tiledb": write_tiledb,
}


def extract_fasta_to_file(fasta, output_dir, mode="bcolz", overwrite=False):
    assert mode in _array_writer
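# Hedged usage sketch of the _array_writer dispatch above: the "bcolz" entry
# writes the array to a rootdir and flushes it in one expression (the array
# contents and output path are illustrative):
example = np.zeros((10, NUM_SEQ_CHARS), dtype=np.float32)
_array_writer["bcolz"](example, "/tmp/example_bcolz_dir")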
z = xrange(2, N + 2)

print("Starting benchmark now for creating arrays...")

# Create a ndarray
# x = (i for i in xrange(N))  # true iterable
t0 = time()
out = np.fromiter(x, dtype='f8', count=N)
print("Time for array--> %.3f" % (time() - t0,))
print("out-->", len(out))

# bcolz.set_num_threads(bcolz.ncores//2)

# Create a carray
# x = (i for i in xrange(N))  # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=N, cparams=bcolz.cparams(clevel))
print("Time for carray--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Create a carray (with unknown size)
# x = (i for i in xrange(N))  # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=-1, cparams=bcolz.cparams(clevel))
print("Time for carray (count=-1)--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Retrieve from a structured ndarray
gen = ((i, j, k) for i, j, k in izip(x, y, z))
t0 = time()
def append(data, clevel):
    alldata = bcolz.carray(data[0], cparams=bcolz.cparams(clevel))
    for carr in data[1:]:
        alldata.append(carr)
    return alldata
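# Hedged usage sketch for append(): concatenating numpy chunks into a single
# compressed carray (the input chunks here are illustrative):
import numpy as np

chunks = [np.arange(1000), np.arange(1000, 2000)]
alldata = append(chunks, clevel=5)
assert len(alldata) == 2000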
import math

import numpy as np

import bcolz

N = int(1e8)  # the number of elements in x
clevel = 9  # the compression level
cname = "blosclz"  # the compressor name
sexpr = "(x+1)<10"  # small number of items
# sexpr = "(x+1)<1000000"  # large number
# sexpr = "(2*x*x*x+.3*y**2+z+1)<10"  # small number
# sexpr = "(2*x*x*x+.3*y**2+z+1)<1e15"  # medium number
# sexpr = "(2*x*x*x+.3*y**2+z+1)<1e20"  # large number

print("Creating inputs...")

cparams = bcolz.cparams(clevel=clevel, cname=cname)

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    t = bcolz.ctable((cx,), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

nt = t[:]

print("Querying '%s' with 10^%d points" % (sexpr, int(math.log10(N))))
import math

import numpy as np

import bcolz

N = 1e7  # the number of elements in x
clevel = 5  # the compression level
cname = "blosclz"  # the compressor name
sexpr = "(x+1)<10"  # small number of items
# sexpr = "(x+1)<1000000"  # large number
sexpr = "(2*x*x*x+.3*y**2+z+1)<10"  # small number
# sexpr = "(2*x*x*x+.3*y**2+z+1)<1e15"  # medium number
# sexpr = "(2*x*x*x+.3*y**2+z+1)<1e20"  # large number

print("Creating inputs...")

cparams = bcolz.cparams(clevel=clevel, cname=cname)

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    t = bcolz.ctable((cx,), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

nt = t[:]

print("Querying '%s' with 10^%d points" % (sexpr, int(math.log10(N))))
from time import time

import numpy as np

import bcolz

N = int(1e8)

# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

t0 = time()
sac = ac.sum()
# sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print "Time sum() carray (iter) --> %.3f" % (time()-t0)

print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
def setUp(self):
    chunked.storage_registry['default'] = chunked.BcolzMemStorage(
        cparams=bcolz.cparams(cname='zlib', clevel=1))
def read_partpositions(filename, nspec, ctable=True, clevel=5, cname="lz4",
                       quantize=None):
    """Read the particle positions in `filename`.

    This function strives to use as little memory as possible; for this, a
    bcolz ctable container is used for holding the data.  Besides being
    compressed in-memory, its chunked nature makes it a natural fit for data
    that needs to be appended, because it does not need expensive memory
    resize operations.

    NOTE: This code reads directly from an UNFORMATTED SEQUENTIAL Fortran
    data file, so care has been taken to skip the record length at the
    beginning and the end of every record.  See:
    http://stackoverflow.com/questions/8751185/fortran-unformatted-file-format

    Parameters
    ----------
    filename : string
        The file name of the particle raw data
    nspec : int
        Number of species in the particle raw data
    ctable : bool
        Return a bcolz ctable container.  If not, a numpy structured array
        is returned instead.
    clevel : int
        Compression level for the ctable container
    cname : string
        Codec name for the ctable container.  Can be 'blosclz', 'lz4',
        'zlib' or 'zstd'.
    quantize : int
        Quantize data to improve (lossy) compression.  Data is quantized
        using np.around(scale*data)/scale, where scale is 2**bits, and bits
        is determined from the quantize value.  For example, if quantize=1,
        bits will be 4.  0 means that the quantization is disabled.

    Returns
    -------
    ctable object OR structured_numpy_array
        Returning a ctable is preferred because it is used internally, so it
        does not need to be converted to other formats; hence it is faster
        and uses less memory.

    Note: Passing a `quantize` param > 0 can increase the compression ratio
    of the ctable container, but it may also slow down the reading speed
    significantly.

    License
        This function is taken from the reflexible package
        (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions
        Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
""" CHUNKSIZE = 10 * 1000 xmass_dtype = [('xmass_%d' % (i + 1), 'f4') for i in range(nspec)] # note age is calculated from itramem by adding itimein out_fields = [('npoint', 'i4'), ('xtra1', 'f4'), ('ytra1', 'f4'), ('ztra1', 'f4'), ('itramem', 'i4'), ('topo', 'f4'), ('pvi', 'f4'), ('qvi', 'f4'), ('rhoi', 'f4'), ('hmixi', 'f4'), ('tri', 'f4'), ('tti', 'f4')] + xmass_dtype raw_fields = [('begin_recsize', 'i4') ] + out_fields + [('end_recsize', 'i4')] raw_rectype = np.dtype(raw_fields) recsize = raw_rectype.itemsize cparams = bcolz.cparams(clevel=clevel, cname=cname) if quantize is not None and quantize > 0: out = get_quantized_ctable(raw_rectype, cparams=cparams, quantize=quantize, expectedlen=int(1e6)) else: out = bcolz.zeros(0, dtype=raw_rectype, cparams=cparams, expectedlen=int(1e6)) with open(filename, "rb", buffering=1) as f: # The timein value is at the beginning of the file reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0] assert reclen == 4 itimein = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4") reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0] assert reclen == 4 nrec = 0 while True: # Try to read a complete chunk data = f.read(CHUNKSIZE * recsize) read_records = int(len(data) / recsize) # the actual number of records read chunk = np.ndarray(shape=(read_records, ), buffer=data, dtype=raw_rectype) # Add the chunk to the out array out.append(chunk[:read_records]) nrec += read_records if read_records < CHUNKSIZE: # We reached the end of the file break # Truncate at the max length (last row is always a sentinel, so remove it) out.trim(1) # Remove the first and last columns out.delcol("begin_recsize") out.delcol("end_recsize") if ctable: return out else: return out[:]
import math

import numpy as np
import numexpr as ne

import bcolz

N = 1e7  # the number of elements in x
clevel = 3  # the compression level
# sexpr = "(x+1)<0"  # the expression to compute
# sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute
sexpr = "((.25*x + .75)*x - 1.5)*x - 2"  # a computer-friendly polynomial
# sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0"  # a computer-friendly polynomial

print("Creating inputs...")

cparams = bcolz.cparams(clevel)

x = np.arange(N)
# x = np.linspace(0,100,N)
cx = bcolz.carray(x, cparams=cparams)
if 'y' not in sexpr:
    t = bcolz.ctable((cx,), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N))))