def analysis_refresh_bruteforce(self):
    """Refresh points display from cluster-based analysis.

    This works in the crudest possible way by simply repeatedly re-reading the
    results file. This should also work for old style analysis, with the same
    caveats (below) about IO.

    WARNING: This has potential performance implications as there will be
    significant IO overhead to doing it this way!!! Files should be closed when
    spooling is complete.

    TODO:
    - Allow incremental update (as is done in refresh_analysis above). Will
      probably need modification of PYMEDataServer.
    - Stop updates when spooling is complete (HOW? Check file size?)
    - What is the effect of the caches in clusterIO? Do we actually get the
      updated file?
    - Make sure this only gets called at a reasonable rate.
    - Add logic to call this (and make sure that self.results_filename is
      defined)
    """
    from PYME.IO import unifiedIO

    try:
        # download a copy of the file if needed, so that we can pass pytables
        # a local filename
        with unifiedIO.local_or_temp_filename(self.results_filename) as fn:
            self.dsviewer.pipeline.OpenFile(fn)
            # BUGFIX: was `selectedDataSoure` (typo) which raises AttributeError
            self.numEvents = len(self.dsviewer.pipeline.selectedDataSource['x'])

            # populate the fitResults member TODO - is this actually needed?
            # FIXME - this is really fragile!
            self.fitResults = self.dsviewer.pipeline.selectedDataSource.resultsSource.fitResults
    except IOError:
        # best-effort refresh: the file may not be available yet while spooling
        pass

    self.progPan.draw()
    self.progPan.Refresh()
    self.dsviewer.Refresh()
    self.dsviewer.update()
def test_mulithread_result_filing():
    """Stress-test concurrent result filing to an __aggregate_h5r endpoint.

    Spawns many threads which each file a chunk of results to the same
    aggregate destination, then verifies that the total number of rows
    received matches the number sent.
    """
    import time  # BUGFIX: time.sleep() below raised NameError without this
    import numpy as np
    from PYME.IO import clusterResults, unifiedIO
    import tables
    import posixpath
    import threading

    n_filings = 500
    n_per = np.random.randint(0, 100, n_filings)
    data = [np.ones(n_per[ind], dtype=[('a', '<f4'), ('b', '<f4')])
            for ind in range(n_filings)]
    dest = 'pyme-cluster://TES1/__aggregate_h5r/_testing/test_result_filing.h5r'

    threads = []
    for ind in range(n_filings):
        t = threading.Thread(target=clusterResults.fileResults,
                             args=(posixpath.join(dest, 'foo'), data[ind]))
        t.start()
        threads.append(t)

    [t.join() for t in threads]

    # give the server time to flush aggregated writes to disk
    time.sleep(5)

    with unifiedIO.local_or_temp_filename('pyme-cluster://TES1/_testing/test_result_filing.h5r') as f,\
            tables.open_file(f) as t:
        n_received = len(t.root.foo)

    assert n_received == np.sum(n_per)
def _loadClassifier(self):
    """Load the SVM classifier from ``self.classifier``, caching the loaded
    instance so repeated calls with an unchanged path are no-ops.
    """
    from PYME.Analysis import svmSegment

    # reload only when we have no cached classifier, or the path has changed
    cache_valid = ('_cf' in dir(self)) and (self._classifier == self.classifier)
    if not cache_valid:
        self._classifier = self.classifier
        with unifiedIO.local_or_temp_filename(self.classifier) as fn:
            self._cf = svmSegment.svmClassifier(filename=fn)
def test_mulithread_result_filing():
    # FIXME - this test is expected to fail as files should be created before multi-threaded aggregate operations
    # with enough of a delay between creation and access to ensure that the file is present in directory caches.
    import time  # BUGFIX: time.sleep() below raised NameError without this
    import numpy as np
    from PYME.IO import clusterResults, unifiedIO
    import tables
    import posixpath
    import threading

    n_filings = 500
    n_per = np.random.randint(0, 100, n_filings)
    data = [np.ones(n_per[ind], dtype=[('a', '<f4'), ('b', '<f4')])
            for ind in range(n_filings)]
    dest = 'pyme-cluster://TES1/__aggregate_h5r/_testing/test_result_filing.h5r'

    threads = []
    for ind in range(n_filings):
        t = threading.Thread(target=clusterResults.fileResults,
                             args=(posixpath.join(dest, 'foo'), data[ind]))
        t.start()
        threads.append(t)

    [t.join() for t in threads]

    # give the server time to flush aggregated writes to disk
    time.sleep(5)

    with unifiedIO.local_or_temp_filename('pyme-cluster://TES1/_testing/test_result_filing.h5r') as f,\
            tables.open_file(f) as t:
        n_received = len(t.root.foo)

    assert n_received == np.sum(n_per)
def load_shiftmap(uri):
    """
    helper function to handle I/O of two versions of shiftmaps. Note that HDF is prefered

    Parameters
    ----------
    uri : str
        path or url to shiftmap-containing file (hdf, or [less ideal] json)

    Returns
    -------
    shift_map : dict
    """
    from PYME.IO import unifiedIO, tabular
    from PYME.IO.MetaDataHandler import HDFMDHandler
    import tables
    import json

    try:  # try loading shift map as hdf file
        with unifiedIO.local_or_temp_filename(uri) as f:
            # FIX: use a context manager so the file is closed even if an
            # exception is raised part-way through (original leaked the handle)
            with tables.open_file(f) as t:
                shift_map_source = tabular.HDFSource(t, 'shift_map')  # todo - is there a cleaner way to do this?
                shift_map_source.mdh = HDFMDHandler(t)

                # build dict of dicts so we can easily rebuild shiftfield
                # objects in multiview.calc_shifts_for_points
                shift_map = {'shiftModel': shift_map_source.mdh['Multiview.shift_map.model']}
                legend = shift_map_source.mdh['Multiview.shift_map.legend']
                keys = shift_map_source.keys()  # hoisted - loop invariant
                for l in legend.keys():
                    shift_map[l] = dict(zip(keys, [shift_map_source[k][legend[l]] for k in keys]))
    except tables.HDF5ExtError:  # file is probably saved as json (legacy)
        s = unifiedIO.read(uri)
        shift_map = json.loads(s)

    return shift_map
def loadInput(self, filename, key='input'):
    """ Load input data from a file and inject into namespace

    Parameters
    ----------
    filename : str
        path or cluster URI of the file to load. '.h5r'/'.hdf' files are
        opened with pytables; anything unrecognised falls through to
        ImageStack.
    key : str
        namespace key under which the loaded data is injected.
    """
    from PYME.IO import unifiedIO
    import os

    extension = os.path.splitext(filename)[1]
    if extension in ['.h5r', '.hdf']:
        import tables
        from PYME.IO import h5rFile
        try:
            # download a local copy if needed, open read-only, and hand the
            # raw pytables file object to the injection helper
            with unifiedIO.local_or_temp_filename(filename) as fn, \
                    h5rFile.openH5R(fn, mode='r')._h5file as h5f:
                self._inject_tables_from_hdf5(key, h5f, fn, extension)
        except tables.exceptions.HDF5ExtError:
            # access issue likely due to multiple processes
            if unifiedIO.is_cluster_uri(filename):
                # try again, this time forcing access through the dataserver
                # NOTE: it is unclear why this should work when local_or_temp_filename() doesn't
                # as this still opens / copies the file independently, albeit in the same process as is doing the writing.
                # The fact that this works is relying on one of a quirk of the GIL, a quirk in HDF5 locking, or the fact
                # that copying the file to a stream is much faster than opening it with pytables. The copy vs pytables open
                # scenario would match what has been observed with old style spooling analysis where copying a file
                # prior to opening in VisGUI would work more reliably than opening directly. This retains, however,
                # an inherent race condition so we risk replacing a predictable failure with a less frequent one.
                # TODO - consider whether h5r_part might be a better choice.
                # FIXME: (DB) I'm not comfortable with having this kind of special case retry logic here, and would
                # much prefer if we could find an alternative workaround, refactor into something like h5rFile.open_robust(),
                # or just let this fail). Leaving it for the meantime to get chained recipes working, but we should revisit.
                from PYME.IO import clusterIO
                relative_filename, server_filter = unifiedIO.split_cluster_url(filename)
                file_as_bytes = clusterIO.get_file(relative_filename,
                                                   serverfilter=server_filter,
                                                   local_short_circuit=False)
                # open the downloaded bytes entirely in memory (no backing
                # store) so we never touch the contended file on disk
                with tables.open_file('in-memory.h5', driver='H5FD_CORE',
                                      driver_core_image=file_as_bytes,
                                      driver_core_backing_store=0) as h5f:
                    self._inject_tables_from_hdf5(key, h5f, filename, extension)
            else:
                # not a cluster file, doesn't make sense to retry with cluster.
                # Propagate exception to user.
                raise
    elif extension == '.csv':
        logger.error('loading .csv not supported yet')
        raise NotImplementedError
    elif extension in ['.xls', '.xlsx']:
        logger.error('loading .xls not supported yet')
        raise NotImplementedError
    else:
        # fall back to treating the input as image data
        self.namespace[key] = ImageStack(filename=filename, haveGUI=False)
def list_h5(filename):
    """Print a summary of an HDF5 / h5r file: its metadata and, for each
    top-level node, its type and dimensions.

    Parameters
    ----------
    filename : str
        path or cluster URI of the file to summarise.
    """
    import tables
    from PYME.IO import MetaDataHandler
    from PYME.IO import tabular
    from PYME.IO import unifiedIO
    import json

    with unifiedIO.local_or_temp_filename(filename) as fn:
        with tables.open_file(fn, mode='r') as h5f:
            # make sure our hdf file gets closed
            try:
                mdh = MetaDataHandler.NestedClassMDHandler(MetaDataHandler.HDFMDHandler(h5f))
                print('Metadata:\n____________')
                print(repr(mdh))
            except tables.FileModeError:
                # Occurs if no metadata is found, since we opened the table in read-mode
                logger.warning('No metadata found, proceeding with empty metadata')
                mdh = MetaDataHandler.NestedClassMDHandler()

            print('\n\n')

            for t in h5f.list_nodes('/'):
                # FIXME - The following isinstance tests are not very safe (and badly broken in some cases e.g.
                # PZF formatted image data, Image data which is not in an EArray, etc ...)
                # Note that EArray is only used for streaming data!
                # They should ideally be replaced with more comprehensive tests (potentially based on array or dataset
                # dimensionality and/or data type) - i.e. duck typing. Our strategy for images in HDF should probably
                # also be improved / clarified - can we use hdf attributes to hint at the data intent? How do we support
                # > 3D data?
                if not isinstance(t, tables.Group):
                    print(t.name)
                    print('______________')

                if isinstance(t, tables.VLArray):
                    data = h5f.get_node(h5f.root, t.name)
                    print('Ragged (VLArray) with %d rows' % len(data))
                    # BUGFIX: print the first row, not the node object itself
                    print('Row 0: %s' % data[0])
                elif isinstance(t, tables.table.Table):
                    # pipe our table into h5r or hdf source depending on the extension
                    data = h5f.get_node(h5f.root, t.name)
                    print('Table with %d rows\n dtype = %s' % (len(data), data[0].dtype))
                elif isinstance(t, tables.EArray):
                    data = h5f.get_node(h5f.root, t.name)
                    print('Image, shape = %s' % data.shape)

                print('\n\n')
def _loadNPY(self, filename):
    """Load numpy .npy data into this image stack."""
    from PYME.IO import unifiedIO

    # parse any sidecar metadata first
    mdfn = self._findAndParseMetadata(filename)

    # fetch a local copy (if remote) and read the array
    with unifiedIO.local_or_temp_filename(filename) as local_fn:
        self.data = numpy.load(local_fn)

    #from PYME.ParallelTasks.relativeFiles import getRelFilename
    self.seriesName = getRelFilename(filename)
    self.mode = 'default'
def execute(self, namespace):
    """Apply a multiview shift map to the input localisations.

    Loads the shift map from self.shift_map_path, or - if that is empty -
    from the input's 'Shiftmap' metadata entry, then applies the shifts and
    stores the corrected (mapped) result in the namespace.

    Raises
    ------
    RuntimeError
        if the input has no metadata (needed to locate/record the shiftmap).
    """
    from PYME.Analysis.points import multiview
    from PYME.IO import tabular, unifiedIO  # FIX: tabular was used below but not imported
    from PYME.IO.MetaDataHandler import HDFMDHandler
    import tables
    import json

    inp = namespace[self.input_name]
    if 'mdh' not in dir(inp):
        raise RuntimeError('ShiftCorrect needs metadata')

    if self.shift_map_path == '':  # grab shftmap from the metadata
        loc = inp.mdh['Shiftmap']
    else:
        loc = self.shift_map_path

    try:  # try loading shift map as hdf file
        with unifiedIO.local_or_temp_filename(loc) as f:
            # use a context manager so the file is closed even on error
            with tables.open_file(f) as t:
                shift_map_source = tabular.HDFSource(t, 'shift_map')  # todo - is there a cleaner way to do this?
                shift_map_source.mdh = HDFMDHandler(t)

                # build dict of dicts so we can easily rebuild shiftfield
                # objects in multiview.calc_shifts_for_points
                shift_map = {'shiftModel': shift_map_source.mdh['Multiview.shift_map.model']}
                legend = shift_map_source.mdh['Multiview.shift_map.legend']
                keys = shift_map_source.keys()  # hoisted - loop invariant
                for l in legend.keys():
                    shift_map[l] = dict(zip(keys, [shift_map_source[k][legend[l]] for k in keys]))
    except tables.HDF5ExtError:  # file is probably saved as json (legacy)
        # BUGFIX: read from `loc`, not self.shift_map_path - when the path
        # came from metadata, self.shift_map_path is '' and the read failed
        s = unifiedIO.read(loc)
        shift_map = json.loads(s)

    mapped = tabular.MappingFilter(inp)
    multiview.apply_shifts_to_points(mapped, shift_map)

    # propagate metadata and record where the shift map came from
    mapped.mdh = inp.mdh
    mapped.mdh['Multiview.shift_map.location'] = loc

    namespace[self.output_name] = mapped
def _loadPSF(self, filename):
    """Load PYME .psf data.

    .psf files consist of a tuple containing the data and the voxelsize.
    """
    from PYME.IO import unifiedIO

    # fetch a local copy (if remote) and unpack the (data, voxelsize) tuple
    with unifiedIO.local_or_temp_filename(filename) as local_fn:
        self.data, vox = numpy.load(local_fn)

    # start from confocal defaults and record the voxel size per axis
    self.mdh = MetaDataHandler.NestedClassMDHandler(MetaData.ConfocDefault)
    for axis in ('x', 'y', 'z'):
        self.mdh.setEntry('voxelsize.' + axis, getattr(vox, axis))

    #from PYME.ParallelTasks.relativeFiles import getRelFilename
    self.seriesName = getRelFilename(filename)
    self.mode = 'psf'
def OpenFile(self, filename='', ds=None, clobber_recipe=True, **kwargs):
    """Open a file - accepts optional keyword arguments for use with files
    saved as .txt and .mat. These are:

    FieldNames: a list of names for the fields in the text file or matlab variable.
    VarName: the name of the variable in the .mat file which contains the data.
    SkipRows: Number of header rows to skip for txt file data
    PixelSize: Pixel size if not in nm

    Parameters
    ----------
    filename : str
        path or cluster URI of the file to open (ignored for data loading
        if `ds` is given, but still recorded as self.filename).
    ds : optional
        a pre-loaded datasource; when supplied, no file IO is performed.
    clobber_recipe : bool
        when True (default) any existing processing modules are cleared and
        the standard localisation pipeline (Pipelineify / ProcessColour /
        FilterTable) is rebuilt; when False the existing recipe is kept.
    """
    # close any files we had open previously
    while len(self.filesToClose) > 0:
        self.filesToClose.pop().close()

    # clear our state
    # nb - equivalent to clearing recipe namespace
    self.dataSources.clear()

    if clobber_recipe:
        # clear any processing modules from the pipeline
        # call with clobber_recipe = False in a 'Open a new file with the processing pipeline I've set up' use case
        # TODO: Add an "File-->Open [preserving recipe]" menu option or similar
        self.recipe.modules = []

    if 'zm' in dir(self):
        del self.zm
    self.filter = None
    self.mapping = None
    self.colourFilter = None
    self.events = None

    self.mdh = MetaDataHandler.NestedClassMDHandler()

    self.filename = filename

    if ds is None:
        from PYME.IO import unifiedIO  # TODO - what is the launch time penalty here for importing clusterUI and finding a nameserver?

        # load from file(/cluster, downloading a copy of the file if needed)
        with unifiedIO.local_or_temp_filename(filename) as fn:
            # TODO - check that loading isn't lazy (i.e. we need to make a copy of data in memory whilst in the
            # context manager in order to be safe with unifiedIO and cluster data). From a quick look, it would seem
            # that _ds_from_file() copies the data, but potentially keeps the file open which could be problematic.
            # This won't effect local file loading even if loading is lazy (i.e. shouldn't cause a regression)
            ds = self._ds_from_file(fn, **kwargs)
            self.events = getattr(ds, 'events', None)
            self.mdh.copyEntriesFrom(ds.mdh)

    # skip the MappingFilter wrapping, etc. in self.addDataSource and add this datasource as-is
    self.dataSources['FitResults'] = ds

    # Fit module specific filter settings
    # TODO - put all the defaults here and use a local variable rather than in __init__ (self.filterKeys is largely an artifact of pre-recipe based pipeline)
    if 'Analysis.FitModule' in self.mdh.getEntryNames():
        fitModule = self.mdh['Analysis.FitModule']

        if 'Interp' in fitModule:
            self.filterKeys['A'] = (5, 100000)

        if fitModule == 'SplitterShiftEstFR':
            self.filterKeys['fitError_dx'] = (0, 10)
            self.filterKeys['fitError_dy'] = (0, 10)

    if clobber_recipe:
        from PYME.recipes.localisations import ProcessColour, Pipelineify
        from PYME.recipes.tablefilters import FilterTable

        add_pipeline_variables = Pipelineify(self.recipe,
                                             inputFitResults='FitResults',
                                             pixelSizeNM=kwargs.get('PixelSize', 1.),
                                             outputLocalizations='Localizations')
        self.recipe.add_module(add_pipeline_variables)

        #self._get_dye_ratios_from_metadata()

        colour_mapper = ProcessColour(self.recipe, input='Localizations', output='colour_mapped')
        self.recipe.add_module(colour_mapper)
        # only filter on keys the datasource actually has
        self.recipe.add_module(FilterTable(self.recipe, inputName='colour_mapped',
                                           outputName='filtered_localizations',
                                           filters={k: list(v) for k, v in self.filterKeys.items() if k in ds.keys()}))
    else:
        logger.warn('Opening file without clobbering recipe, filter and ratiometric colour settings might not be handled properly')
        # FIXME - should we update filter keys and/or make the filter more robust
        # FIXME - do we need to do anything about colour settings?

    self.recipe.execute()
    self.filterKeys = {}

    if 'filtered_localizations' in self.dataSources.keys():
        self.selectDataSource('filtered_localizations')  #NB - this rebuilds the pipeline
    else:
        # TODO - replace / remove this fallback with something better. This is currently required
        # when we use/abuse the pipeline in dh5view, but that should ideally be replaced with
        # something cleaner. This (and case above) should probably also be conditional on `clobber_recipe`
        # as if opening with an existing recipe we would likely want to keep selectedDataSource constant as well.
        self.selectDataSource('FitResults')

    # FIXME - we do this already in pipelinify, maybe we can avoid doubling up?
    self.ev_mappings, self.eventCharts = _processEvents(ds, self.events, self.mdh)  # extract information from any events

    # Retrieve or estimate image bounds
    if False:  # 'imgBounds' in kwargs.keys():
        # TODO - why is this disabled? Current usage would appear to be when opening from LMAnalysis
        # during real-time localization, to force image bounds to match raw data, but also potentially useful
        # for other scenarios where metadata is not fully present.
        self.imageBounds = kwargs['imgBounds']
    elif ('scanx' not in self.selectedDataSource.keys()
          or 'scany' not in self.selectedDataSource.keys()) and 'Camera.ROIWidth' in self.mdh.getEntryNames():
        self.imageBounds = ImageBounds.extractFromMetadata(self.mdh)
    else:
        self.imageBounds = ImageBounds.estimateFromSource(self.selectedDataSource)
def loadInput(self, filename, key='input'):
    """Load input data from a file and inject into namespace

    Currently only handles images (anything you can open in dh5view).
    TODO - extend to other types.

    Parameters
    ----------
    filename : str
        path or cluster URI of the file to load.
    key : str
        namespace key for the loaded data; for hdf files with multiple
        tables, keys other than 'input' are used as a 'key_' prefix on the
        individual node names.
    """
    #modify this to allow for different file types - currently only supports images
    from PYME.IO import unifiedIO
    import os

    extension = os.path.splitext(filename)[1]
    if extension in ['.h5r', '.h5', '.hdf']:
        import tables
        from PYME.IO import MetaDataHandler
        from PYME.IO import tabular

        with unifiedIO.local_or_temp_filename(filename) as fn:
            with tables.open_file(fn, mode='r') as h5f:
                #make sure our hdf file gets closed

                key_prefix = '' if key == 'input' else key + '_'

                try:
                    mdh = MetaDataHandler.NestedClassMDHandler(MetaDataHandler.HDFMDHandler(h5f))
                except tables.FileModeError:
                    # Occurs if no metadata is found, since we opened the table in read-mode
                    logger.warning('No metadata found, proceeding with empty metadata')
                    mdh = MetaDataHandler.NestedClassMDHandler()

                for t in h5f.list_nodes('/'):
                    # FIXME - The following isinstance tests are not very safe (and badly broken in some cases e.g.
                    # PZF formatted image data, Image data which is not in an EArray, etc ...)
                    # Note that EArray is only used for streaming data!
                    # They should ideally be replaced with more comprehensive tests (potentially based on array or dataset
                    # dimensionality and/or data type) - i.e. duck typing. Our strategy for images in HDF should probably
                    # also be improved / clarified - can we use hdf attributes to hint at the data intent? How do we support
                    # > 3D data?
                    if isinstance(t, tables.VLArray):
                        from PYME.IO.ragged import RaggedVLArray

                        rag = RaggedVLArray(h5f, t.name, copy=True)  #force an in-memory copy so we can close the hdf file properly
                        rag.mdh = mdh

                        self.namespace[key_prefix + t.name] = rag
                    elif isinstance(t, tables.table.Table):
                        # pipe our table into h5r or hdf source depending on the extension
                        tab = tabular.H5RSource(h5f, t.name) if extension == '.h5r' else tabular.HDFSource(h5f, t.name)
                        tab.mdh = mdh

                        self.namespace[key_prefix + t.name] = tab
                    elif isinstance(t, tables.EArray):
                        # load using ImageStack._loadh5, which finds metdata
                        im = ImageStack(filename=filename, haveGUI=False)
                        # assume image is the main table in the file and give it the named key
                        self.namespace[key] = im
    elif extension == '.csv':
        logger.error('loading .csv not supported yet')
        raise NotImplementedError
    elif extension in ['.xls', '.xlsx']:
        logger.error('loading .xls not supported yet')
        raise NotImplementedError
    else:
        # fall back to treating the input as image data
        self.namespace[key] = ImageStack(filename=filename, haveGUI=False)
def _load_model(self):
    """Load the Keras model named by ``self.model``, caching it so that
    repeated calls with the same model name skip the (expensive) reload.
    """
    from keras.models import load_model

    # reload only when the requested model name differs from the cached one
    if getattr(self, '_model_name', None) != self.model:
        self._model_name = self.model
        with unifiedIO.local_or_temp_filename(self._model_name) as fn:
            self._model = load_model(fn)