def defaultHeader( ):
    """
    Build and return a fresh MRC header dictionary filled with default values.

    A new dictionary is created on every call. The 'n_threads' entry is only
    present when the module-level flag `bloscPresent` is truthy, in which case
    it holds the core count reported by blosc.
    """
    header = {
        'fileConvention': "imod",
        'endian': 'le',
        'MRCtype': 0,
        'dimensions': np.array( [0,0,0], dtype=int ),
        'dtype': 'u1',
        'compressor': None,
        'packedBytes': 0,
        'clevel': 1,
        'maxImage': 1.0,
        'minImage': 0.0,
        'meanImage': 0.0,
        'pixelsize': 0.1,
        'pixelunits': u"nm",  # Can be "\AA" for Angstroms
        'voltage': 300.0,     # kV
        'C3': 2.7,            # mm
        'gain': 1.0,          # counts/electron
    }

    if bloscPresent:
        header['n_threads'] = blosc.detect_number_of_cores()

    return header
def defaultHeader():
    '''
    Build and return a fresh MRC header dictionary filled with default values.

    A new dictionary is created on every call. The ``'n_threads'`` entry is
    only present when the module-level flag ``bloscPresent`` is truthy, in
    which case it holds the core count reported by blosc.
    '''
    defaults = {
        'fileConvention': 'ccpem',
        'endian': 'le',
        'MRCtype': 0,
        'dimensions': np.array([0, 0, 0], dtype=int),
        'dtype': 'u1',
        'compressor': None,
        'packedBytes': 0,
        'clevel': 1,
        'maxImage': 1.0,
        'minImage': 0.0,
        'meanImage': 0.0,
        'pixelsize': 0.1,
        'pixelunits': u'nm',  # Can be '\\AA' for Angstroms
        'voltage': 300.0,     # kV
        'C3': 2.7,            # mm
        'gain': 1.0,          # counts/electron
    }

    if bloscPresent:
        defaults['n_threads'] = blosc.detect_number_of_cores()

    return defaults
def load_kernel(filepath, n_threads=None):
    """
    Restore a kernel that was previously written with save_kernel().

    Parameters
    ----------
    filepath: str
        Location of the saved kernel file.
    n_threads: int
        Thread count handed to blosc for decompression. When None, the
        number of cores detected by blosc is used.

    Returns
    -------
    ImputationKernel
    """
    if n_threads is None:
        n_threads = blosc.detect_number_of_cores()
    # This changes blosc's thread setting for the whole process, not just
    # for this call.
    blosc.set_nthreads(n_threads)

    # NOTE(review): dill deserialization can execute arbitrary code; only
    # load kernel files that come from a trusted source.
    with open(filepath, "rb") as kernel_file:
        compressed_payload = dill.load(kernel_file)
        kernel = dill.loads(blosc.decompress(compressed_payload))

    if kernel.original_data_class == "pd_DataFrame":
        # The working data was stored in parquet form; rebuild the frame and
        # then cast every column back to its recorded dtype.
        kernel.working_data = pd_read_parquet(kernel.working_data)
        for column in kernel.working_data.columns:
            target_dtype = kernel.working_dtypes[column]
            kernel.working_data[column] = kernel.working_data[column].astype(
                target_dtype
            )

    return kernel
def __MRCZImport(f, header, endian='le', fileConvention='ccpem',
                 returnHeader=False, n_threads=None):
    '''
    Equivalent to MRCImport, but for compressed data using the blosc library.

    The following compressors are supported:

        ``'zlib'``
        ``'zstd'``
        ``'lz4'``

    Memory mapping is not possible in this case at present.

    * ``f`` is an open, seekable binary file object positioned anywhere; the
      function seeks explicitly to each compressed chunk.
    * ``header`` is the header dict; ``'dimensions'``, ``'dtype'``,
      ``'extendedBytes'`` and ``'MRCtype'`` are read from it.
    * ``endian``, ``fileConvention`` and ``returnHeader`` are accepted for
      signature compatibility but are not used in this function.
    * ``n_threads`` is the thread count stored on ``blosc.nthreads``; when
      ``None`` the core count detected by blosc is used.

    Returns ``(image, header)``. If blosc is not available an error is logged
    and ``None`` is returned.
    '''
    if not bloscPresent:
        logger.error('blosc not present, cannot compress files.')
        return

    # NOTE(review): this assigns the module attribute rather than calling
    # blosc.set_nthreads(); kept as-is because it could not be confirmed from
    # here which of the two blosc honours in release-GIL mode.
    if n_threads is None:
        blosc.nthreads = blosc.detect_number_of_cores()
    else:
        blosc.nthreads = n_threads

    image = np.empty(header['dimensions'], dtype=header['dtype'])
    frame_shape = image.shape[1:]

    # One blosc chunk is stored per slice along the first axis, back to back,
    # starting right after the main and extended headers.
    blosc_chunk_pos = DEFAULT_HEADER_LEN + header['extendedBytes']
    for J in range(image.shape[0]):
        f.seek(blosc_chunk_pos)
        (_nbytes, _blockSize, ctbytes), _ver_info = readBloscHeader(f)
        # Rewind: blosc includes the 16 header bytes in ctbytes, so the
        # chunk handed to decompress() must start at the header again.
        f.seek(blosc_chunk_pos)
        frame = np.frombuffer(blosc.decompress(f.read(ctbytes)),
                              dtype=image.dtype)
        image[J, :, :] = np.reshape(frame, frame_shape)
        blosc_chunk_pos += ctbytes

    if header['MRCtype'] == 101:
        # Seems the 4-bit is interlaced
        # NOTE(review): the unpacking arithmetic below is reproduced verbatim
        # from the original and looks suspicious (shape mismatch, no mask for
        # the low nibble) - confirm against a real 4-bit file.
        interlaced_image = image

        # np.prod replaces np.product, which was removed in NumPy 2.0.
        image = np.empty(np.prod(header['dimensions']),
                         dtype=header['dtype'])
        # Bit-shift and Bit-and to separate decimated pixels
        image[0::2] = np.left_shift(interlaced_image, 4) / 15
        image[1::2] = np.right_shift(interlaced_image, 4)

    # We don't need to reshape packed data.
    image = np.squeeze(image)

    return image, header
def __MRCZImport( f, header, endian='le', fileConvention = "imod",
                  returnHeader = False, n_threads=None ):
    """
    Equivalent to MRCImport, but for compressed data using the blosc library.

    The following compressors are supported:
        'zlib'
        'zstd'
        'lz4'

    Memory mapping is not possible in this case at present.

    f is an open, seekable binary file object; the function seeks explicitly
    to each compressed chunk.

    header is the header dict; 'dimensions', 'dtype', 'extendedBytes' and
    'MRCtype' are read from it.

    endian, fileConvention and returnHeader are accepted for signature
    compatibility but are not used in this function.

    n_threads is the thread count stored on blosc.nthreads; when None the
    core count detected by blosc is used.

    Returns (image, header). If blosc is not available a message is printed
    and None is returned.
    """
    if not bloscPresent:
        print( "ioMRC: blosc not present, cannot compress files." )
        return

    # NOTE(review): this assigns the module attribute rather than calling
    # blosc.set_nthreads(); kept as-is because it could not be confirmed from
    # here which of the two blosc honours in release-GIL mode.
    if n_threads is None:
        blosc.nthreads = blosc.detect_number_of_cores()
    else:
        blosc.nthreads = n_threads

    image = np.empty( header['dimensions'], dtype=header['dtype'] )
    frame_shape = image.shape[1:]

    # We can read MRC2014 files that don't start at 1024 bytes, but not write them
    # (as they are non-standard and we don't like breaking stuff)
    blosc_chunk_pos = 1024 + header['extendedBytes']
    for J in range( image.shape[0] ):
        f.seek( blosc_chunk_pos )
        ( _nbytes, _blockSize, ctbytes ), _ver_info = readBloscHeader( f )
        # Rewind: blosc includes the 16 header bytes in ctbytes, so the
        # chunk handed to decompress() must start at the header again.
        f.seek( blosc_chunk_pos )
        frame = np.frombuffer( blosc.decompress( f.read( ctbytes ) ),
                               dtype=image.dtype )
        image[J,:,:] = np.reshape( frame, frame_shape )
        blosc_chunk_pos += ctbytes

    if header['MRCtype'] == 101:
        # Seems the 4-bit is interlaced
        # NOTE(review): the unpacking arithmetic below is reproduced verbatim
        # from the original and looks suspicious (shape mismatch, no mask for
        # the low nibble) - confirm against a real 4-bit file.
        interlaced_image = image

        # np.prod replaces np.product, which was removed in NumPy 2.0.
        image = np.empty( np.prod( header['dimensions'] ),
                          dtype=header['dtype'] )
        # Bit-shift and Bit-and to separate decimated pixels
        image[0::2] = np.left_shift( interlaced_image, 4 ) / 15
        image[1::2] = np.right_shift( interlaced_image, 4 )

    # We don't need to reshape packed data.
    image = np.squeeze( image )

    return image, header
def print_versions():
    """Print all the versions of software that python-blosc relies on.

    Writes a short report to stdout: python-blosc and Blosc versions, the
    compressors in this build, Python version, platform details, processor,
    byte order and detected core count. Returns None.
    """
    import platform

    print("-=" * 38)
    print("python-blosc version: %s" % blosc.__version__)
    print("Blosc version:        %s" % blosc.blosclib_version)
    print("Blosc compressors in this build: %s" % blosc.compressor_list())
    print("Python version:       %s" % sys.version)
    (sysname, nodename, release, version, machine, processor) = platform.uname()
    print("Platform:             %s-%s-%s (%s)" % (sysname, release, machine, version))
    if sysname == "Linux":
        # platform.linux_distribution() was removed in Python 3.8, so calling
        # it unconditionally raises AttributeError on current interpreters.
        # Use it when it exists, otherwise fall back to the os-release data
        # (Python 3.10+), and skip the line when neither is available.
        legacy_dist = getattr(platform, "linux_distribution", None)
        os_release = getattr(platform, "freedesktop_os_release", None)
        dist_name = None
        if legacy_dist is not None:
            dist_name = " ".join(legacy_dist()[:-1])
        elif os_release is not None:
            try:
                dist_name = os_release().get("PRETTY_NAME")
            except OSError:
                # No readable os-release file on this system.
                dist_name = None
        if dist_name is not None:
            print("Linux dist:           %s" % dist_name)
    if not processor:
        processor = "not recognized"
    print("Processor:            %s" % processor)
    print("Byte-ordering:        %s" % sys.byteorder)
    print("Detected cores:       %s" % blosc.detect_number_of_cores())
    print("-=" * 38)
def set_blosc_nthreads() -> int:
    """Configure how many threads blosc uses, based on the detected cores.

    The thread count is chosen as follows:

    * 2 cores or fewer: 1 thread
    * 3 or 4 cores: one less than the core count
    * more than 4 cores: two less than the core count

    Returns
    -------
    int
        thread count that was passed to ``blosc.set_nthreads``
    """
    detected = blosc.detect_number_of_cores()

    if detected > 4:
        chosen = detected - 2
    elif detected > 2:
        chosen = detected - 1
    else:
        chosen = 1

    blosc.set_nthreads(chosen)
    return chosen
def print_versions():
    """Print all the versions of software that python-blosc relies on.

    Writes a short report to stdout: python-blosc and Blosc versions, the
    compressors in this build, Python version, platform details, processor,
    byte order and detected core count. Returns None.
    """
    import platform

    print("-=" * 38)
    print("python-blosc version: %s" % blosc.__version__)
    print("Blosc version:        %s" % blosc.blosclib_version)
    print("Blosc compressors in this build: %s" % blosc.compressor_list())
    print("Python version:       %s" % sys.version)
    (sysname, nodename, release, version, machine, processor) = platform.uname()
    print("Platform:             %s-%s-%s (%s)" % (sysname, release, machine, version))
    if sysname == "Linux":
        # platform.linux_distribution() was removed in Python 3.8, so calling
        # it unconditionally raises AttributeError on current interpreters.
        # Use it when it exists, otherwise fall back to the os-release data
        # (Python 3.10+), and skip the line when neither is available.
        legacy_dist = getattr(platform, "linux_distribution", None)
        os_release = getattr(platform, "freedesktop_os_release", None)
        dist_name = None
        if legacy_dist is not None:
            dist_name = " ".join(legacy_dist()[:-1])
        elif os_release is not None:
            try:
                dist_name = os_release().get("PRETTY_NAME")
            except OSError:
                # No readable os-release file on this system.
                dist_name = None
        if dist_name is not None:
            print("Linux dist:           %s" % dist_name)
    if not processor:
        processor = "not recognized"
    print("Processor:            %s" % processor)
    print("Byte-ordering:        %s" % sys.byteorder)
    print("Detected cores:       %s" % blosc.detect_number_of_cores())
    print("-=" * 38)
from . import chunks from . import hangar_service_pb2 from . import hangar_service_pb2_grpc from .. import config from ..context import Environments from ..context import TxnRegister from ..hdf5_store import FileHandles from ..records import commiting from ..records import hashs from ..records import heads from ..records import parsing from ..records import queries from ..records import summarize blosc.set_nthreads(blosc.detect_number_of_cores() - 2) class HangarClient(object): '''Client which connects and handles data transfer to the hangar server. Parameters ---------- envs : Environments environment handles to manage all required calls to the local repostory state. address : str IP:PORT where the hangar server can be reached. ''' def __init__(self, envs: Environments, address: str): self.env = envs
except ImportError as e: if sys.version_info > (3,0): raise ImportError('Get the backport for `concurrent.futures` for Py2.7 as `pip install futures`') raise e from mrcz.__version__ import __version__ from distutils.version import StrictVersion import logging logger = logging.getLogger('MRCZ') try: import blosc BLOSC_PRESENT = True # For async operations we want to release the GIL in blosc operations and # file IO operations. blosc.set_releasegil(True) DEFAULT_N_THREADS = blosc.detect_number_of_cores() except ImportError: # Can be ImportError or ModuleNotFoundError depending on the Python version, # but ModuleNotFoundError is a child of ImportError and is still caught. BLOSC_PRESENT = False logger.info('`blosc` meta-compression library not found, file compression disabled.') DEFAULT_N_THREADS = 1 try: import rapidjson as json except ImportError: import json logger.info('`python-rapidjson` not found, using builtin `json` instead.') def _defaultMetaSerialize(value): """ Is called by `json.dumps()` whenever it encounters an object it does
def writeMRC( input_image, MRCfilename, endian='le', dtype=None,
               pixelsize=[0.1,0.1,0.1], pixelunits=u"\\AA", shape=None,
               voltage = 0.0, C3 = 0.0, gain = 1.0,
               compressor=None, clevel = 1, n_threads=None, quickStats=True ):
    """
    MRCExport( input_image, MRCfilename, endian='le', shape=None, compressor=None, clevel = 1 )

    Created on Thu Apr 02 15:56:34 2015
    @author: Robert A. McLeod

    Given a numpy 2-D or 3-D array `input_image` write it as an MRC file `MRCfilename`.

        endian is 'le' for little-endian; any other value selects big-endian.

        dtype will cast the data before writing it.

        pixelsize is [z,y,x] pixel size (singleton values are ok for square/cubic pixels)

        pixelunits is "AA" for Angstroms, "pm" for picometers, "\\mum" for micrometers,
        or "nm" for nanometers.  MRC standard is always Angstroms, so pixelsize
        is converted internally from nm to Angstroms if necessary

        shape is only used if you want to later append to the file, such as
        merging together Relion particles for Frealign.  Not recommended and
        only present for legacy reasons.

        voltage is accelerating potential in keV, defaults to 0.0

        C3 is spherical aberration in mm, defaults to 0.0

        gain is detector gain (counts/primary electron), defaults to 1.0 (for counting camera);
        a falsy value (None, 0) is also stored as 1.0

        compressor is a choice of 'lz4', 'zlib', or 'zstd', plus 'blosclz', 'lz4hc'
            'zstd' generally gives the best compression performance, and is still almost
               as fast as 'lz4' with clevel = 1
            'zlib' is easiest to decompress with other utilities.

        clevel is the compression level, 1 is fastest, 11 is very-slow.  The compression
        ratio will rise slowly with clevel.

        n_threads is number of threads to use for blosc compression; when None and
        blosc is available, the core count detected by blosc is used

        quickStats = True estimates the image mean, min, max from the first frame only,
        which saves a lot of computational time for stacks.

    Raises TypeError if dtype is 'uint4' together with a compressor, or if the
    resulting dtype is not a key of REVERSE_IMOD_ENUM.

    Note that MRC definitions are not consistent.  Generally we support the IMOD schema.
    """
    # NOTE: the list default for `pixelsize` is kept for interface
    # compatibility; it is stored in the header but never mutated here.
    if dtype == 'uint4' and compressor is not None:
        raise TypeError( "uint4 packing is not compatible with compression, use int8 datatype." )

    header = {}
    endchar = '<' if endian == 'le' else '>'

    if dtype is None:
        # TODO: endian support
        header['dtype'] = endchar + input_image.dtype.descr[0][1].strip( "<>|" )
    else:
        header['dtype'] = dtype

    # Now we need to filter dtype to make sure it's actually acceptable to MRC
    if header['dtype'].strip( "<>|" ) not in REVERSE_IMOD_ENUM:
        raise TypeError( "ioMRC.MRCExport: Unsupported dtype cast for MRC %s" % header['dtype'] )

    header['dimensions'] = input_image.shape
    header['pixelsize'] = pixelsize
    header['pixelunits'] = pixelunits
    header['compressor'] = compressor
    header['clevel'] = clevel
    header['shape'] = shape

    # This overhead calculation is annoying but many 3rd party tools that use
    # MRC require these statistical parameters.
    if bool(quickStats) and input_image.ndim == 3:
        stats_source = np.real( input_image[0,:,:] )
    else:
        stats_source = np.real( input_image )
    header['maxImage'] = np.max( stats_source )
    header['minImage'] = np.min( stats_source )
    # The mean goes under its own key; it used to be written to 'maxImage',
    # which clobbered the maximum and left 'meanImage' unset.
    header['meanImage'] = np.mean( stats_source )

    # Falsy inputs (None, 0) collapse to the neutral defaults.
    header['voltage'] = voltage if voltage else 0.0
    header['C3'] = C3 if C3 else 0.0
    header['gain'] = gain if gain else 1.0

    if n_threads is None and bloscPresent:
        n_threads = blosc.detect_number_of_cores()
    header['n_threads'] = n_threads
    # TODO: can we detect the number of cores without adding a heavy dependancy?

    if dtype == 'uint4':
        # Decimate to packed 4-bit: even columns fill the low nibble, odd
        # columns are shifted into the high nibble.
        input_image = input_image.astype('uint8')
        input_image = input_image[:,:,::2] + np.left_shift(input_image[:,:,1::2],4)

    __MRCExport( input_image, header, MRCfilename, endchar )
def writeMRC(input_image, MRCfilename, meta=None, endian='le', dtype=None,
             pixelsize=[0.1, 0.1, 0.1], pixelunits=u'\\AA', shape=None,
             voltage=0.0, C3=0.0, gain=1.0, compressor=None, clevel=1,
             n_threads=None, quickStats=True, idx=None):
    '''
    writeMRC( input_image, MRCfilename, meta=None, idx=None, endian='le',
              dtype=None, pixelsize=[0.1,0.1,0.1], pixelunits=u'\\\\AA',
              shape=None, voltage=0.0, C3=0.0, gain=1.0, compressor=None,
              clevel=1, n_threads=None, quickStats=True, )

    Given a ``numpy`` 2-D or 3-D array ``input_image`` write it as an MRC file
    ``MRCfilename``.

    * ``meta`` is a Python dict{} which will be serialized by JSON and written
      into the extended header.
    * ``endian`` is ``'le'`` for little-endian; any other value selects
      big-endian.
    * ``dtype`` will cast the data before writing it.
    * ``pixelsize`` is [z,y,x] pixel size (singleton values are ok for
      square/cubic pixels)
    * ``pixelunits`` is ``'\\\\AA'`` for Angstroms, ``'pm'`` for picometers,
      ``'\\\\mum'`` for micrometers, or ``'nm'`` for nanometers. MRC standard
      is always Angstroms, so pixelsize is converted internally from nm to
      Angstroms if necessary
    * ``shape`` is only used if you want to later append to the file, such as
      merging together Relion particles for Frealign. Not recommended and only
      present for legacy reasons.
    * ``voltage`` is accelerating potential in keV, defaults to 0.0
    * ``C3`` is spherical aberration in mm, defaults to 0.0
    * ``gain`` is detector gain (counts/primary electron), defaults to 1.0
      (for counting camera); a falsy value is also stored as 1.0
    * ``compressor`` is a choice of ``'lz4'``, ``'zlib'``, or ``'zstd'``, plus
      ``'blosclz'``, ``'lz4hc'``

      - ``'lz4'`` is generally the fastest.
      - ``'zstd'`` generally gives the best compression performance, and is
        still almost as fast as 'lz4' with clevel = 1

    * ``clevel`` is the compression level, 1 is fastest, 9 is slowest. The
      compression ratio will rise slowly with clevel.
    * ``n_threads`` is number of threads to use for blosc compression; when
      ``None`` and blosc is available, the detected core count is used
    * ``quickStats=True`` estimates the image mean, min, max from the first
      frame only, which saves a lot of computational time for stacks.
    * ``idx`` can be used to write an image or set of images starting at a
      specific position in the MRC file (which may already exist). Index of
      first image is 0. A negative index can be used to count backwards. If
      omitted, will write whole stack to file. If writing to an existing file,
      compression or extended MRC2014 headers are currently not supported
      with this option.

    Raises ``TypeError`` for ``'uint4'`` combined with a compressor or for a
    dtype not in ``REVERSE_CCPEM_ENUM``; ``ValueError`` when an existing file
    cannot be parsed, its x,y dimensions differ from the image, or ``idx`` is
    out of range; ``RuntimeError`` when ``idx`` is used with a compressor.

    *Note: MRC definitions are not consistent. Generally we support the CCPEM
    schema.*
    '''
    # NOTE: the list default for `pixelsize` is kept for interface
    # compatibility; it is stored in the header but never mutated here.
    if len(input_image.shape) == 2:
        # If it's a 2D image we force it to 3D - this makes life easier later:
        input_image = input_image.reshape(
            (1, input_image.shape[0], input_image.shape[1]))

    # For dask, we don't want to import dask, but we can still work-around how
    # to check its type without isinstance()
    image_type = type(input_image)
    if (image_type.__module__ == 'dask.array.core'
            and image_type.__name__ == 'Array'):
        # Ideally it would be faster to iterate over the chunks and pass each
        # one to blosc but that likely requires c-blosc2
        input_image = input_image.__array__()

    # We will need this regardless if writing to an existing file or not:
    endchar = '<' if endian == 'le' else '>'

    # We now check if we have to create a new header (i.e. new file) or not.
    # If the file exists, but idx is None, it will be replaced by a new file
    # with new header anyway:
    idxnewfile = (not os.path.isfile(MRCfilename)) or idx is None

    if idxnewfile:
        if dtype == 'uint4' and compressor is not None:
            raise TypeError(
                'uint4 packing is not compatible with compression, use int8 datatype.')

        header = {'meta': meta}
        if dtype is None:
            # TODO: endian support
            header['dtype'] = endchar + input_image.dtype.descr[0][1].strip(
                '<>|')
        else:
            header['dtype'] = dtype

        # Now we need to filter dtype to make sure it's actually acceptable
        # to MRC
        if header['dtype'].strip('<>|') not in REVERSE_CCPEM_ENUM:
            raise TypeError(
                'ioMRC.MRCExport: Unsupported dtype cast for MRC %s' %
                header['dtype'])

        header['dimensions'] = input_image.shape
        header['pixelsize'] = pixelsize
        header['pixelunits'] = pixelunits
        header['compressor'] = compressor
        header['clevel'] = clevel
        header['shape'] = shape

        # This overhead calculation is annoying but many 3rd party tools that
        # use MRC require these statistical parameters.
        if bool(quickStats) and input_image.ndim == 3:
            stats_source = np.real(input_image[0, :, :])
        else:
            stats_source = np.real(input_image)
        header['maxImage'] = np.max(stats_source)
        header['minImage'] = np.min(stats_source)
        header['meanImage'] = np.mean(stats_source)

        # Falsy inputs (None, 0) collapse to the neutral defaults.
        header['voltage'] = voltage if voltage else 0.0
        header['C3'] = C3 if C3 else 0.0
        header['gain'] = gain if gain else 1.0

        if n_threads is None and bloscPresent:
            n_threads = blosc.detect_number_of_cores()
        header['n_threads'] = n_threads
        # TODO: can we detect the number of cores without adding a heavy
        # dependancy?

        if dtype == 'uint4':
            # Decimate to packed 4-bit: even columns fill the low nibble, odd
            # columns are shifted into the high nibble.
            input_image = input_image.astype('uint8')
            input_image = input_image[:, :, ::2] + np.left_shift(
                input_image[:, :, 1::2], 4)

    else:
        # We are going to append to an already existing file. So we try to
        # figure out its header with 'CCPEM' or 'eman2' file conventions:
        try:
            header = readMRCHeader(MRCfilename, endian,
                                   fileConvention='CCPEM',
                                   pixelunits=pixelunits)
        except ValueError:
            try:
                header = readMRCHeader(MRCfilename, endian,
                                       fileConvention='eman2',
                                       pixelunits=pixelunits)
            except ValueError:
                # If neither 'CCPEM' nor 'eman2' formats satisfy:
                raise ValueError(
                    'Error: unrecognized MRC type for file: %s ' % MRCfilename)

        # Files with extended headers are no longer rejected here (the
        # original author's note says appending should work with them now).

        # If the file already exists, its X,Y dimensions must be consistent
        # with the current image to be written:
        if np.any(header['dimensions'][1:] != input_image.shape[1:]):
            raise ValueError(
                'Error: x,y dimensions of image do not match that of MRC file: %s '
                % MRCfilename)
        # TO DO: check also consistency of dtype?

        if 'meta' not in header:
            header['meta'] = meta

    # Now that we have a proper header, we go into the details of writing to
    # a specific position:
    if idx is not None:
        if header['compressor'] is not None:
            raise RuntimeError(
                'Writing at arbitrary positions not supported for compressed files. Compressor = %s'
                % header['compressor'])

        idx = int(idx)

        # Force 2D to 3D dimensions:
        if len(header['dimensions']) == 2:
            header['dimensions'] = np.array(
                [1, header['dimensions'][0], header['dimensions'][1]])

        # Convert negative index to equivalent positive index:
        if idx < 0:
            idx = header['dimensions'][0] + idx

        # Just check if the desired image is within the stack range. In
        # principle we could write to a position beyond the limits of the
        # file (missing slots would be filled with zeros), but let's avoid
        # that the user writes a big file with zeros by mistake. So only
        # positions within or immediately consecutive to the stack are
        # allowed:
        if idx < 0 or idx > header['dimensions'][0]:
            raise ValueError(
                'Error: image or slice index out of range. idx = %d, z_dimension = %d'
                % (idx, header['dimensions'][0]))

        # The new Z dimension may be larger than that of the existing file,
        # or even of the new file, if an index larger than the current stack
        # is specified:
        newZ = idx + input_image.shape[0]
        if newZ > header['dimensions'][0]:
            header['dimensions'] = np.array(
                [newZ, header['dimensions'][1], header['dimensions'][2]])

        # This byte offset will be applied to f.seek(). np.prod replaces
        # np.product, which was removed in NumPy 2.0.
        offset = idx * np.prod(header['dimensions'][1:]) * np.dtype(
            header['dtype']).itemsize
    else:
        offset = 0

    __MRCExport(input_image, header, MRCfilename, endchar, offset, idxnewfile)
import pickle
import warnings
import blosc
import numpy
from dpsutil.dataframe.convert import cvt_dec2hex, cvt_hex2dec, cvt_hex2str, cvt_str2hex

COMPRESS_FASTEST = 0
COMPRESS_BEST = 1

# Cap blosc at half the detected cores, clamped to the range [4, 8].
blosc.set_nthreads(min(8, max(4, blosc.detect_number_of_cores() // 2)))


def compress(data: bytes, compress_type=COMPRESS_FASTEST) -> bytes:
    """
    compress(data[, compress_type=COMPRESS_FASTEST])

    Compress a bytes object with blosc.

    Optionals:
        - compress_type: [COMPRESS_FASTEST, COMPRESS_BEST]
            COMPRESS_FASTEST uses the "lz4" codec at level 1; any other value
            uses the "zstd" codec at level 5.

    The thread count is not a parameter here; it is set once for the module
    when it is imported.

    `data` must be exactly of type bytes (checked with an assert).
    Per the original note, a ValueError is raised if the buffer is larger
    than 2147483631 bytes.
    """
    assert type(data) is bytes

    if compress_type == COMPRESS_FASTEST:
        codec, level = "lz4", 1
    else:
        codec, level = "zstd", 5

    return blosc.compress(data, cname=codec, clevel=level)