def parseClusterConfigFile(configFilePath):
    """
    Parse the cluster config file found at ``configFilePath``.

    A ``JsonConfigParser`` is built from ``ClusterConfigFields`` and used to
    parse the file. The parser's result is returned unchanged: a Namespace
    object (similar to the behavior of
    ``argparse.ArgumentParser.parse_args()``).
    """
    return JsonConfigParser(ClusterConfigFields).parseConfigFile(configFilePath)
def parseClusterConfigFile(configFilePath):
    """
    Convenience wrapper that parses a cluster configuration file.

    The file at ``configFilePath`` is parsed with a ``JsonConfigParser`` that
    was constructed from ``ClusterConfigFields``.

    Returns the Namespace object produced by the parser (comparable to what
    ``argparse.ArgumentParser.parse_args()`` gives you).
    """
    cluster_parser = JsonConfigParser(ClusterConfigFields)
    parsed_fields = cluster_parser.parseConfigFile(configFilePath)
    return parsed_fields
def __init__(
    self,
    shell,
    headless,
    workflow_cmdline_args,
    project_creation_args,
    hintoverlayFile=None,
    pmapoverlayFile=None,
    *args,
    **kwargs
):
    """
    Build the split-body carving workflow: create its applets and parse the
    optional workflow command-line arguments.

    :param shell: Forwarded to the base-class constructor.
    :param headless: Forwarded to the base-class constructor.
    :param workflow_cmdline_args: Command-line args for this workflow.
        If non-empty, they are searched for ``--split_tool_param_file``.
    :param project_creation_args: Forwarded to the base-class constructor.
    :param hintoverlayFile: Accepted, but not used by this constructor.
    :param pmapoverlayFile: Accepted, but not used by this constructor.

    After construction, ``self._split_tool_params`` holds the parsed
    contents of the split tool parameter file, or ``None`` if no such file
    was given on the command line.
    """
    graph = Graph()
    super(SplitBodyCarvingWorkflow, self).__init__(
        shell, headless, workflow_cmdline_args, project_creation_args, *args, graph=graph, **kwargs
    )

    ## Create applets
    self.projectMetadataApplet = ProjectMetadataApplet()
    self.dataSelectionApplet = DataSelectionApplet(
        self, "Input Data", "Input Data", supportIlastik05Import=True, batchDataGui=False
    )
    opDataSelection = self.dataSelectionApplet.topLevelOperator
    opDataSelection.DatasetRoles.setValue(["Raw Data", "Pixel Probabilities", "Raveler Labels"])

    self.preprocessingApplet = PreprocessingApplet(
        workflow=self, title="Preprocessing", projectFileGroupName="preprocessing"
    )
    self.splitBodyCarvingApplet = SplitBodyCarvingApplet(workflow=self, projectFileGroupName="carving")
    self.splitBodyPostprocessingApplet = SplitBodyPostprocessingApplet(workflow=self)
    self.splitBodySupervoxelExportApplet = SplitBodySupervoxelExportApplet(workflow=self)

    # Expose to shell
    self._applets = []
    self._applets.append(self.projectMetadataApplet)
    self._applets.append(self.dataSelectionApplet)
    self._applets.append(self.preprocessingApplet)
    self._applets.append(self.splitBodyCarvingApplet)
    self._applets.append(self.splitBodyPostprocessingApplet)
    self._applets.append(self.splitBodySupervoxelExportApplet)

    self._split_tool_params = None
    if workflow_cmdline_args:
        arg_parser = argparse.ArgumentParser(description="Specify parameters for the split-body carving workflow")
        arg_parser.add_argument("--split_tool_param_file", required=False)
        # parse_known_args() tolerates args meant for someone else; we merely report them.
        parsed_args, unused_args = arg_parser.parse_known_args(workflow_cmdline_args)
        if unused_args:
            # Logger.warn() is a deprecated alias; Logger.warning() is the supported spelling.
            logger.warning("Unused command-line args: {}".format(unused_args))

        if parsed_args.split_tool_param_file is None:
            logger.warning("Missing cmd-line arg: --split_tool_param_file")
        else:
            logger.debug("Parsing split tool parameters: {}".format(parsed_args.split_tool_param_file))
            json_parser = JsonConfigParser(SplitToolParamsSchema)
            self._split_tool_params = json_parser.parseConfigFile(parsed_args.split_tool_param_file)
def testWrite(self):
    """
    Round-trip test: parse the shared test config, modify a few fields,
    write the result to a second file, read that file back, and check that
    both the content and the field order survived.
    """
    configFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(TestJsonConfig.configpath)
    configFields.string_setting = "This is a different sentence."
    configFields.int_setting = 100
    configFields.bool_setting = False

    # Write it.
    newConfigFilePath = TestJsonConfig.configpath + "_2"
    JsonConfigParser(TestJsonConfig.TestSchema).writeConfigFile(newConfigFilePath, configFields)

    # Read it back.
    newConfigFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(newConfigFilePath)
    assert newConfigFields == configFields, "Config field content was not preserved after writing/reading"

    # Compare the RE-READ fields against the ones we wrote. (Comparing
    # configFields with itself could never fail.)
    # Only the field names are compared here: content equality is already
    # covered by the assertion above, and some values are numpy arrays whose
    # element-wise '==' cannot be collapsed to a single bool inside a
    # list comparison.
    assert list(newConfigFields.__dict__.keys()) == list(
        configFields.__dict__.keys()
    ), "Config field ORDER was not preserved after writing/reading"
def testExceptionIfRepeatedFields(self):
    """
    This test creates a config that has an error:
    A field has been repeated.
    We expect to see an exception from the parser telling us that we screwed up.
    (See decorator above.)
    """
    testConfig = """
    {
        "_schema_name" : "test-schema",
        "_schema_version" : 1.0,

        "string_setting" : "First instance",
        "string_setting" : "Repeated instance"
    }
    """
    tempDir = tempfile.mkdtemp()
    try:
        # Everything after mkdtemp() lives inside the try block, so the
        # directory is removed even if writing the config file fails.
        configpath = os.path.join(tempDir, "config.json")
        logger.debug("Using config file: " + configpath)
        with open(configpath, 'w') as f:
            f.write(testConfig)

        # The return value is irrelevant: parsing is expected to raise.
        JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(configpath)
    finally:
        # Clean up temporary file
        shutil.rmtree(tempDir)
def testWrite(self):
    """
    Round-trip test: parse the shared test config, modify a few fields,
    write the result to a second file, read that file back, and check that
    both the content and the field order survived.
    """
    configFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(TestJsonConfig.configpath)
    configFields.string_setting = "This is a different sentence."
    configFields.int_setting = 100
    configFields.bool_setting = False

    # Write it.
    newConfigFilePath = TestJsonConfig.configpath + "_2"
    JsonConfigParser(TestJsonConfig.TestSchema).writeConfigFile(newConfigFilePath, configFields)

    # Read it back.
    newConfigFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(newConfigFilePath)
    assert newConfigFields == configFields, "Config field content was not preserved after writing/reading"

    # Compare the RE-READ fields against the ones we wrote. (Comparing
    # configFields with itself could never fail.)
    # Only the field names are compared here: content equality is already
    # covered by the assertion above, and some values are numpy arrays whose
    # element-wise '==' cannot be collapsed to a single bool inside a
    # list comparison.
    assert list(newConfigFields.__dict__.keys()) == list(
        configFields.__dict__.keys()
    ), "Config field ORDER was not preserved after writing/reading"
def testRead(self):
    """
    Parse the shared test config file with the test schema and verify the
    value (and, for arrays, the type) of every field the parser produced,
    including the fields of the nested sub-config.
    """
    schema_parser = JsonConfigParser(TestJsonConfig.TestSchema)
    parsed = schema_parser.parseConfigFile(TestJsonConfig.configpath)

    # Scalar settings
    assert parsed.string_setting == "This is a sentence."
    assert parsed.int_setting == 42
    assert parsed.auto_int_setting == 42
    assert parsed.another_auto_int_setting == 43
    assert parsed.bool_setting is True

    # The formatted setting is a template that still has to be filled in.
    greeting = parsed.formatted_setting.format(user_name="Stuart", user_home_town="Washington, DC")
    assert greeting == "Greetings, Stuart from Washington, DC!"

    assert parsed.roi_setting == ((1, 2, 3, 4, 5), (6, 7, 8, 9, 10))

    # Array settings must come back as numpy arrays with the expected content.
    array_value = parsed.array_setting
    assert isinstance(array_value, numpy.ndarray)
    assert (array_value == [1, 2, 3, 4]).all()

    array_from_string = parsed.array_from_string_setting
    assert isinstance(array_from_string, numpy.ndarray)
    assert (array_from_string == [1, 2, 3, 4]).all()

    # Check sub-config settings
    assert parsed.subconfig.sub_settingA == "yes"
    assert parsed.subconfig.sub_settingB == "no"
class TiledVolume(object):
    """
    Given a directory of image tiles that make up a volume,
    produces numpy array volumes for arbitrary roi requests.
    """

    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = {
        "_schema_name": "tiled-volume-description",
        "_schema_version": 1.0,

        "name": str,
        "format": str,
        "dtype": AutoEval(),
        "bounds_zyx": AutoEval(numpy.array),  # Maximum coordinates (+1)

        "view_origin_zyx": AutoEval(numpy.array),  # Optional offset for output 'view'
        "view_shape_zyx": AutoEval(numpy.array),  # Shape of the output 'view'. If not provided, defaults to bounds - origin

        "resolution_zyx": AutoEval(numpy.array),
        "tile_shape_2d_yx": AutoEval(numpy.array),

        "is_rgb": bool,  # Indicates that we must convert to grayscale

        "username": str,
        "password": str,

        # This doesn't change how the data is read from the server,
        # but instead specifies the indexing order of the numpy volumes produced.
        "output_axes": str,

        "cache_tiles": bool,

        # Offset not supported for now...
        #"origin_offset" : AutoEval(numpy.array),

        # For now we support 3D-only, sliced across Z (TODO: Support 5D?)

        # We allow multiple url schemes: tiles might be addressed via pixel coordinates or row/column indexing
        # (z_index and z_start are synonyms here -- either is allowed)
        # Example: pixel-wise tile names:
        #   "tile_url_format" : "http://my.tiles.org/my_tiles/{z_start}-{z_stop}/{y_start}-{y_stop}/{x_start}-{x_stop}.jpg"
        # Example: row/column-wise tile names
        #   "tile_url_format" : "http://my.tiles.org/my_tiles/{z_index}/{y_index}/{x_index}.jpg"

        # Also, local tile sources (filesystem, not http) are okay:
        #   "tile_url_format" : "/my_hard_disk/my_tiles/{z_index}/{y_index}/{x_index}.jpg"
        "tile_url_format": FormattedField(
            requiredFields=[],
            optionalFields=[
                "x_start", "y_start", "z_start",
                "x_stop", "y_stop", "z_stop",
                "x_index", "y_index", "z_index",
                "raveler_z_base",  # Special keyword for Raveler session directories. See notes below.
            ],
        ),

        "invert_y_axis": bool,  # For raveler volumes, the y-axis coordinate is inverted.

        # A list of lists, mapping src slices to destination slices (for "filling in" missing slices)
        # Example If slices 101,102,103 are missing data, you might want to simply repeat the data from slice 100:
        #   "extend_slices" : [ [100, [101, 102, 103]] ]
        "extend_slices": list,

        # Some tiled volumes have complicated mappings from "real" or "global" coordinates to url/filepath coordinates.
        # This field will be eval()'d before the tile is retrieved
        # For example, if the slices were named according to their position in nanometers instead of pixels, this might do the trick:
        #   "z_translation_function" : "lambda z: 40*z"
        "z_translation_function": str,

        # Optional data transform. For example:
        #   "data_transform_function" : "lambda a: a == 0",
        "data_transform_function": str,
    }
    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        """
        Parse the description file at ``descriptionFilePath`` according to
        ``DescriptionSchema``, fill in defaults via :py:meth:`updateDescription`,
        and return the resulting description object.
        """
        # Read file
        description = TiledVolume.DescriptionSchema.parseConfigFile(descriptionFilePath)
        cls.updateDescription(description)
        return description

    @classmethod
    def updateDescription(cls, description):
        """
        Some description fields are optional.
        If they aren't provided in the description JSON file, then this function provides
        them with default values, based on the other description fields.
        """
        # Augment with default parameters.
        logger.debug(str(description))

        if description.view_origin_zyx is None:
            description.view_origin_zyx = numpy.array([0] * len(description.bounds_zyx))

        if description.view_shape_zyx is None:
            description.view_shape_zyx = description.bounds_zyx - description.view_origin_zyx

        if not description.output_axes:
            description.output_axes = "zyx"
        assert description.output_axes is None or set(description.output_axes) == set("zyx"), \
            "Axis order must include x,y,z (and nothing else)"

        if not description.extend_slices:
            description.extend_slices = []

        if description.cache_tiles is None:
            description.cache_tiles = False

    def __init__(self, descriptionFilePath):
        """
        Read the description file, sanity-check it, and precompute the
        output shape and the slice remapping table.

        :param descriptionFilePath: Path to the JSON description of the tiled volume.
        """
        self.description = TiledVolume.readDescription(descriptionFilePath)
        self._session = None  # Created lazily on the first remote tile request

        assert self.description.format in vigra.impex.listExtensions().split(), \
            "Unknown tile format: {}".format(self.description.format)

        assert self.description.tile_shape_2d_yx.shape == (2,)
        assert self.description.bounds_zyx.shape == (3,)
        assert self.description.view_shape_zyx.shape == (3,)

        # The view shape is stored as zyx; re-order it to match output_axes.
        shape_dict = dict(zip('zyx', self.description.view_shape_zyx))
        self.output_shape = tuple(shape_dict[k] for k in self.description.output_axes)

        # Invert extend_slices: map each destination slice to the slice that supplies its data.
        self._slice_remapping = {}
        for source, destinations in self.description.extend_slices:
            for dest in destinations:
                self._slice_remapping[dest] = source

    def close(self):
        """
        Close the http session, if one was created.
        """
        if self._session:
            self._session.close()

    def read(self, view_roi, result_out):
        """
        roi: (start, stop) tuples, ordered according to description.output_axes
             roi should be relative to the view

        result_out: The array to fill. It is written in place; nothing is returned.
        """
        output_axes = self.description.output_axes
        roi_transposed = zip(*view_roi)
        roi_dict = dict(zip(output_axes, roi_transposed))
        # Materialize the zip: numpy.array() needs a real sequence, and the
        # roi is also formatted into the assertion message below.
        view_roi = list(zip(*(roi_dict['z'], roi_dict['y'], roi_dict['x'])))

        # First, normalize roi and result to zyx order
        result_out = vigra.taggedView(result_out, output_axes)
        result_out = result_out.withAxes(*'zyx')

        assert numpy.array(view_roi).shape == (2, 3), "Invalid roi for 3D volume: {}".format(view_roi)
        view_roi = numpy.array(view_roi)
        assert (result_out.shape == (view_roi[1] - view_roi[0])).all()

        # User gave roi according to the view output.
        # Now offset it find global roi.
        roi = view_roi + self.description.view_origin_zyx

        tile_blockshape = (1,) + tuple(self.description.tile_shape_2d_yx)
        tile_starts = getIntersectingBlocks(tile_blockshape, roi)

        # Defined BEFORE the loop: it is also consulted after the loop, which
        # must work even when there are no intersecting tiles at all.
        PARALLEL_REQ = True

        pool = RequestPool()
        for tile_start in tile_starts:
            tile_roi_in = getBlockBounds(self.description.bounds_zyx, tile_blockshape, tile_start)
            tile_roi_in = numpy.array(tile_roi_in)

            # This tile's portion of the roi
            intersecting_roi = getIntersection(roi, tile_roi_in)
            intersecting_roi = numpy.array(intersecting_roi)

            # Compute slicing within destination array and slicing within this tile
            destination_relative_intersection = numpy.subtract(intersecting_roi, roi[0])
            tile_relative_intersection = intersecting_roi - tile_roi_in[0]

            # Get a view to the output slice
            result_region = result_out[roiToSlice(*destination_relative_intersection)]

            # Note: _get_rest_args() may modify tile_roi_in in place (slice remapping),
            #       so the intersections above must be computed first.
            rest_args = self._get_rest_args(tile_blockshape, tile_roi_in)
            if self.description.tile_url_format.startswith('http'):
                retrieval_fn = partial(self._retrieve_remote_tile, rest_args, tile_relative_intersection, result_region)
            else:
                retrieval_fn = partial(self._retrieve_local_tile, rest_args, tile_relative_intersection, result_region)

            if PARALLEL_REQ:
                pool.add(Request(retrieval_fn))
            else:
                # execute serially (leave the pool empty)
                retrieval_fn()

        if PARALLEL_REQ:
            with Timer() as timer:
                pool.wait()
            logger.info("Loading {} tiles took a total of {}".format(len(tile_starts), timer.seconds()))

    def _get_rest_args(self, tile_blockshape, tile_roi_in):
        """
        For a single tile, return a dict of all possible parameters that can be substituted
        into the tile_url_format string from the volume json description file.

        tile_blockshape: The 3D blockshape of the tile
                         (since tiles are only 1 slice thick, the blockshape always begins with 1).
        tile_roi_in: The ROI within the total volume for a particular tile.
                     (Note that the size of the ROI is usually, but not always, the same as tile_blockshape.
                     Near the volume borders, the tile_roi_in may be smaller.)
                     If the tile's slice is remapped via ``extend_slices``,
                     this array is modified in place.
        """
        assert sys.version_info.major == 2, "Alert! This function has not been tested "\
            "under python 3. Please remove this assetion and be wary of any strnage behavior you encounter"

        # Special feature:
        # Some slices are missing, in which case we provide fake data from a different slice.
        # Overwrite the rest args to pull data from an alternate source tile.
        z_start = tile_roi_in[0][0]
        if z_start in self._slice_remapping:
            new_source_slice = self._slice_remapping[z_start]
            tile_roi_in[0][0] = new_source_slice
            tile_roi_in[1][0] = new_source_slice + 1

        tile_index = numpy.array(tile_roi_in[0]) // tile_blockshape
        rest_args = {
            'z_start': tile_roi_in[0][0],
            'z_stop': tile_roi_in[1][0],
            'y_start': tile_roi_in[0][1],
            'y_stop': tile_roi_in[1][1],
            'x_start': tile_roi_in[0][2],
            'x_stop': tile_roi_in[1][2],
            'z_index': tile_index[0],
            'y_index': tile_index[1],
            'x_index': tile_index[2],
        }

        # Apply special z_translation_function
        if self.description.z_translation_function is not None:
            # SECURITY: eval() executes arbitrary code taken from the description file.
            #           Only load description files from trusted sources.
            z_update_func = eval(self.description.z_translation_function)
            rest_args['z_index'] = rest_args['z_start'] = z_update_func(rest_args['z_index'])
            rest_args['z_stop'] = 1 + rest_args['z_start']

        # Quick sanity check
        assert rest_args['z_index'] == rest_args['z_start']

        # Special arg for Raveler session directories:
        # For files with Z < 1000, no extra directory level
        # For files with Z >= 1000, there is an extra directory level,
        #  in which case the extra '/' is INCLUDED here in the rest arg.
        raveler_z_base = (rest_args['z_index'] // 1000) * 1000
        if raveler_z_base == 0:
            rest_args['raveler_z_base'] = ""
        else:
            rest_args['raveler_z_base'] = str(raveler_z_base) + '/'

        return rest_args

    def _retrieve_local_tile(self, rest_args, tile_relative_intersection, data_out):
        """
        Read one tile from the local filesystem and copy the requested part of it into ``data_out``.
        If the tile file does not exist, an error is logged and ``data_out`` is filled with zeros.

        rest_args: Substitution values for tile_url_format (see _get_rest_args()).
        tile_relative_intersection: The roi to copy, relative to the tile's own origin.
        data_out: Destination array (written in place).
        """
        tile_path = self.description.tile_url_format.format(**rest_args)
        logger.debug("Opening {}".format(tile_path))

        if not os.path.exists(tile_path):
            logger.error("Tile does not exist: {}".format(tile_path))
            data_out[...] = 0
            return

        # Read the image from the disk with vigra
        img = vigra.impex.readImage(tile_path, dtype='NATIVE')
        assert img.ndim == 3
        if self.description.is_rgb:
            # "Convert" to grayscale -- just take first channel.
            img = img[..., 0:1]
        assert img.shape[-1] == 1, "Image has more channels than expected. "\
            "If it is RGB, be sure to set the is_rgb flag in your description json."

        # img has axes xyc, but we want zyx
        img = img.transpose()[None, 0, :, :]

        if self.description.invert_y_axis:
            # More special Raveler support:
            # Raveler's conventions for the Y-axis are the reverse for everyone else's.
            img = img[:, ::-1, :]

        # Copy just the part we need into the destination array
        assert img[roiToSlice(*tile_relative_intersection)].shape == data_out.shape
        data_out[:] = img[roiToSlice(*tile_relative_intersection)]

        # If there's a special transform, apply it now.
        if self.description.data_transform_function is not None:
            # SECURITY: eval() executes arbitrary code taken from the description file.
            #           Only load description files from trusted sources.
            transform = eval(self.description.data_transform_function)
            data_out[:] = transform(data_out)

    # For late imports
    requests = None
    PIL = None

    TEST_MODE = False  # For testing purposes only. See below.

    def _retrieve_remote_tile(self, rest_args, tile_relative_intersection, data_out):
        """
        Download one tile via http and copy the requested part of it into ``data_out``.
        If the server answers "not found", a warning is logged and ``data_out`` is filled with zeros.

        rest_args: Substitution values for tile_url_format (see _get_rest_args()).
        tile_relative_intersection: The roi to copy, relative to the tile's own origin.
        data_out: Destination array (written in place).
        """
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        tile_url = self.description.tile_url_format.format(**rest_args)
        logger.debug("Retrieving {}".format(tile_url))
        try:
            if self._session is None:
                self._session = self._create_session()

                # Provide authentication if we have the details.
                if self.description.username and self.description.password:
                    self._session.auth = (self.description.username, self.description.password)

            success = False
            tries = 0
            while not success:
                try:
                    # Note: We give timeout as a tuple, which requires a recent version of requests.
                    #       If you get an exception about that, upgrade your requests module.
                    r = self._session.get(tile_url, timeout=(3.0, 20.0))
                    success = True
                except requests.ConnectionError:
                    # This special 'pass' is here because we keep running into exceptions like this:
                    #   ConnectionError: HTTPConnectionPool(host='neurocean.int.janelia.org', port=6081):
                    #   Max retries exceeded with url: /ssd-3-tiles/abd1.5/43/24_25_0.jpg
                    #   (Caused by <class 'httplib.BadStatusLine'>: '')
                    # So now we loop a few times and only give up if something is really wrong.
                    if tries == 5:
                        raise  # give up
                    tries += 1
        except Exception:
            # (Deliberately not a bare 'except:', so that KeyboardInterrupt
            #  and SystemExit are never answered with a retry.)
            # During testing, the server we're pulling from might be in our own process.
            # Apparently that means that it is not very responsive, leading to exceptions.
            # As a cheap workaround, just try one more time.
            if self.TEST_MODE:
                import time
                time.sleep(0.01)
                r = self._session.get(tile_url, timeout=(3.0, 20.0))
            else:
                raise

        if r.status_code == requests.codes.not_found:
            # Logger.warn() is a deprecated alias; Logger.warning() is the supported spelling.
            logger.warning("NOTFOUND: {}".format(tile_url))
            data_out[:] = 0
        else:
            # late import
            if not TiledVolume.PIL:
                import PIL
                import PIL.Image
                TiledVolume.PIL = PIL
            PIL = TiledVolume.PIL

            img = numpy.asarray(PIL.Image.open(StringIO(r.content)))
            if self.description.is_rgb:
                # "Convert" to grayscale -- just take first channel.
                assert img.ndim == 3
                img = img[..., 0]
            assert img.ndim == 2, "Image seems to be of the wrong dimension. "\
                "If it is RGB, be sure to set the is_rgb flag in your description json."

            # img has axes xy, but we want zyx
            img = img[None]

            if self.description.invert_y_axis:
                # More special Raveler support:
                # Raveler's conventions for the Y-axis are the reverse for everyone else's.
                img = img[:, ::-1, :]

            # Copy just the part we need into the destination array
            assert img[roiToSlice(*tile_relative_intersection)].shape == data_out.shape
            data_out[:] = img[roiToSlice(*tile_relative_intersection)]

            # If there's a special transform, apply it now.
            if self.description.data_transform_function is not None:
                # SECURITY: eval() executes arbitrary code taken from the description file.
                #           Only load description files from trusted sources.
                transform = eval(self.description.data_transform_function)
                data_out[:] = transform(data_out)

    @classmethod
    def _create_session(cls):
        """
        Generate a requests.Session object to use for this TiledVolume.
        Using a session allows us to benefit from a connection pool
        instead of establishing a new connection for every request.
        """
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        session = requests.Session()

        # Replace the session http adapters with ones that use larger connection pools
        n_threads = max(1, Request.global_thread_pool.num_workers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads)
        adapter2 = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads)
        session.mount('http://', adapter)
        session.mount('https://', adapter2)
        return session
class RESTfulVolume(object):
    """
    This class provides access to data obtained via a RESTful API (e.g. from http://openconnecto.me).
    A description of the remote volume must be provided via a JSON file,
    whose schema is specified by :py:data:`RESTfulVolume.DescriptionFields`.

    .. note:: This class does not keep track of the data you've already downloaded.
              Every call to :py:func:`downloadSubVolume()` results in a new download.
              For automatic blockwise local caching of remote datasets, see :py:class:`RESTfulBlockwiseFileset`.

    .. note:: See the unit tests in ``tests/testRESTfulVolume.py`` for example usage.
    """

    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = {
        "_schema_name": "RESTful-volume-description",
        "_schema_version": 1.0,
        "name": str,
        "format": str,
        "axes": str,
        "dtype": AutoEval(),
        "bounds": AutoEval(numpy.array),
        "shape": AutoEval(numpy.array),  # Provided for you. Computed as bounds - origin_offset
        "origin_offset": AutoEval(numpy.array),
        "url_format": FormattedField(
            requiredFields=["x_start", "x_stop", "y_start", "y_stop", "z_start", "z_stop"],
            optionalFields=["t_start", "t_stop", "c_start", "c_stop"],
        ),
        "hdf5_dataset": str,
    }
    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        """
        Parse the description file at the given path and return a
        :py:class:`jsonConfig.Namespace` object with the description parameters.
        The file will be parsed according to the schema given by :py:data:`RESTfulVolume.DescriptionFields`.
        Any optional parameters not provided by the user are filled in automatically.

        :param descriptionFilePath: The path to the description file to parse.
        """
        # Read file
        description = RESTfulVolume.DescriptionSchema.parseConfigFile(descriptionFilePath)
        cls.updateDescription(description)
        return description

    @classmethod
    def updateDescription(cls, description):
        """
        Some description fields are optional.
        If they aren't provided in the description JSON file, then this function provides
        them with default values, based on the other description fields.

        The ``shape`` field is always (re)computed as ``bounds - origin_offset``.
        """
        # Augment with default parameters.
        logger.debug(str(description))

        if description.origin_offset is None:
            description.origin_offset = numpy.array([0] * len(description.bounds))
        description.shape = description.bounds - description.origin_offset

    @classmethod
    def writeDescription(cls, descriptionFilePath, descriptionFields):
        """
        Write a :py:class:`jsonConfig.Namespace` object to the given path.

        :param descriptionFilePath: The path to overwrite with the description fields.
        :param descriptionFields: The fields to write.
        """
        RESTfulVolume.DescriptionSchema.writeConfigFile(descriptionFilePath, descriptionFields)

    def __init__(self, descriptionFilePath=None, preparsedDescription=None):
        """
        Constructor. Uses `readDescription` internally.

        :param descriptionFilePath: The path to the .json file that describes the remote volume.
        :param preparsedDescription: (Optional) Provide pre-parsed description fields, in which case the provided description file will not be parsed.
        """
        if preparsedDescription is not None:
            assert descriptionFilePath is None, "Can't provide BOTH description file and pre-parsed description fields."
            self.description = preparsedDescription
        else:
            assert (
                descriptionFilePath is not None
            ), "Must provide either a description file or pre-parsed description fields"
            self.description = RESTfulVolume.readDescription(descriptionFilePath)

        # Check for errors
        # (The message needs a '{}' placeholder; without one, format()
        #  silently drops the offending axes from the error text.)
        assert all(
            a in "txyzc" for a in self.description.axes
        ), "Unknown axis type. Known axes: txyzc Your axes: {}".format(self.description.axes)
        assert self.description.format == "hdf5", "Only hdf5 RESTful volumes are supported so far."
        assert (
            self.description.hdf5_dataset is not None
        ), "RESTful volume description file must specify the hdf5_dataset name"

        # Normalize the internal dataset name so it always starts with a slash.
        if self.description.hdf5_dataset[0] != "/":
            self.description.hdf5_dataset = "/" + self.description.hdf5_dataset

    def downloadSubVolume(self, roi, outputDatasetPath):
        """
        Download a cutout volume from the remote dataset.

        :param roi: The subset of the volume to download, specified as a tuple of coordinates: ``(start, stop)``
        :param outputDatasetPath: The path to overwrite with the downloaded hdf5 file.
        :raises RuntimeError: If the internal dataset name in ``outputDatasetPath``
                              differs from the description's ``hdf5_dataset``.
        """
        # The remote server is addressed in global coordinates: shift the roi by the origin offset.
        origin_offset = numpy.array(self.description.origin_offset)
        accessStart = numpy.array(roi[0])
        accessStart += origin_offset
        accessStop = numpy.array(roi[1])
        accessStop += origin_offset

        # Build the url substitution values, e.g. 'x_start' / 'x_stop' for axis 'x'.
        RESTArgs = {}
        for axisindex, axiskey in enumerate(self.description.axes):
            startKey = "{}_start".format(axiskey)
            stopKey = "{}_stop".format(axiskey)
            RESTArgs[startKey] = accessStart[axisindex]
            RESTArgs[stopKey] = accessStop[axisindex]

        # Download the ROI specified in the url to a HDF5 file
        url = self.description.url_format.format(**RESTArgs)
        logger.info("Opening url for region {}..{}: {}".format(roi[0], roi[1], url))

        pathComponents = PathComponents(outputDatasetPath)

        if pathComponents.internalPath != self.description.hdf5_dataset:
            # We could just open the file and rename the dataset to match what the user asked for, but that would probably be slow.
            # It's better just to force him to use the correct dataset name to begin with.
            raise RuntimeError(
                "The RESTful volume format uses internal dataset name '{}', but you seem to be expecting '{}'."
                .format(self.description.hdf5_dataset, pathComponents.internalPath))

        logger.info("Downloading RESTful subvolume to file: {}".format(pathComponents.externalPath))
        urllib.request.urlretrieve(url, pathComponents.externalPath)
        logger.info("Finished downloading file: {}".format(pathComponents.externalPath))
class RESTfulBlockwiseFileset(BlockwiseFileset): """ This class combines the functionality of :py:class:`RESTfulVolume` and :py:class:`BlockwiseFileset` to provide access to a remote dataset (e.g. from http://openconnecto.me), with all downloaded data cached locally as blocks stored in a directory tree of hdf5 files. This class must be constructed with a description of both the remote dataset and the local storage format, provided in a JSON file with a composite schema specified by :py:data:`RESTfulBlockwiseFileset.DescriptionFields`. .. note:: See the unit tests in ``tests/testRESTfulBlockwiseFileset.py`` for example usage. Here's an example description file. .. code-block:: json { "_schema_name" : "RESTful-blockwise-fileset-description", "_schema_version" : 1.0, "remote_description" : { "_schema_name" : "RESTful-volume-description", "_schema_version" : 1.0, "name" : "Bock11-level0", "format" : "hdf5", "axes" : "zyx", "## NOTE": "The origin offset determines how coordinates are translated when converted to a url.", "## NOTE": "The origin_offset for the bock11 dataset must be at least 2917, because for some reason that's where it starts.", "origin_offset" : [2917, 0, 0], "## NOTE": "The website says that the data goes up to plane 4156, but it actually errors out past 4150", "bounds" : [4150, 135424, 119808], "dtype" : "numpy.uint8", "url_format" : "http://openconnecto.me/emca/bock11/hdf5/0/{x_start},{x_stop}/{y_start},{y_stop}/{z_start},{z_stop}/", "hdf5_dataset" : "cube" }, "local_description" : { "_schema_name" : "blockwise-fileset-description", "_schema_version" : 1.0, "name" : "bock11-blocks", "format" : "hdf5", "axes" : "zyx", "shape" : "[ 4150-2917, 135424, 119808 ]", "dtype" : "numpy.uint8", "block_shape" : [32, 256, 256], "block_file_name_format" : "block-{roiString}.h5/cube", "dataset_root_dir" : "blocks-256x256x32", "## NOTE":"These optional parameters tell ilastik to view only a portion of the on-disk dataset.", "## NOTE":"view_origin MUST be aligned to 
a block start corner.", "## NOTE":"view_shape is optional, but recommended because volumina slows down when there are 1000s of tiles.", "view_origin" : "[0, 50*1024, 50*1024]", "view_shape" : "[4150-2917, 10*256, 10*256]" } } """ #: This member specifies the schema of the description file. #: It is merely a composite of two nested schemas: one that describes the remote volume, #: and another that describes the local storage format. See the source code to see the field names. DescriptionFields = \ { "_schema_name" : "RESTful-blockwise-fileset-description", "_schema_version" : 1.0, # Description of the RESTful Volume "remote_description" : JsonConfigParser( RESTfulVolume.DescriptionFields ), # Description of the local block layout "local_description" : JsonConfigParser( BlockwiseFileset.DescriptionFields ) } DescriptionSchema = JsonConfigParser(DescriptionFields) @classmethod def readDescription(cls, descriptionFilePath): """ Parse the description file at the given path and return a :py:class:`jsonConfig.Namespace` object with the description parameters. The file will be parsed according to the schema given by :py:data:`RESTfulBlockwiseFileset.DescriptionFields`. Any optional parameters not provided by the user are filled in automatically. :param descriptionFilePath: The path to the description file to parse. """ description = RESTfulBlockwiseFileset.DescriptionSchema.parseConfigFile( descriptionFilePath) RESTfulVolume.updateDescription(description.remote_description) return description @classmethod def writeDescription(cls, descriptionFilePath, descriptionFields): """ Write a :py:class:`jsonConfig.Namespace` object to the given path. :param descriptionFilePath: The path to overwrite with the description fields. :param descriptionFields: The fields to write. 
""" RESTfulBlockwiseFileset.DescriptionSchema.writeConfigFile( descriptionFilePath, descriptionFields) @classmethod def _createAndReturnBlockwiseFileset(self, descriptionFilePath, mode): try: rbfs = RESTfulBlockwiseFileset(descriptionFilePath) assert mode == 'r', "RESTfulBlockwiseFilesets may only be opened in read-only mode." except JsonConfigParser.SchemaError: rbfs = None return rbfs def __init__(self, compositeDescriptionPath): """ Constructor. Uses `readDescription` interally. :param compositeDescriptionPath: The path to a JSON file that describes both the remote volume and local storage structure. The JSON file schema is specified by :py:data:`RESTfulBlockwiseFileset.DescriptionFields`. """ # Parse the description file, which contains sub-configs for the blockwise description and RESTful description self.compositeDescription = RESTfulBlockwiseFileset.readDescription( compositeDescriptionPath) self.localDescription = self.compositeDescription.local_description self.remoteDescription = self.compositeDescription.remote_description super(RESTfulBlockwiseFileset, self).__init__(compositeDescriptionPath, 'r', preparsedDescription=self.localDescription) self._remoteVolume = RESTfulVolume( preparsedDescription=self.remoteDescription) try: if not self.localDescription.block_file_name_format.endswith( self.remoteDescription.hdf5_dataset): msg = "Your RESTful volume description file must specify an hdf5 internal dataset name that matches the one in your Blockwise Fileset description file!" msg += "RESTful volume dataset name is '{}', but blockwise fileset format is '{}'".format( self.remoteDescription.hdf5_dataset, self.localDescription.block_file_name_format) raise RuntimeError(msg) if self.localDescription.axes != self.remoteDescription.axes: raise RuntimeError( "Your RESTful volume's axes must match the blockwise dataset axes. 
('{}' does not match '{}')" .format(self.remoteDescription.axes, self.localDescription.axes)) if (numpy.array(self.localDescription.shape) > numpy.array( self.remoteDescription.shape)).any(): raise RuntimeError( "Your local blockwise volume shape must be smaller in all dimensions than the remote volume shape." ) except: logger.error("Error loading dataset from {}".format( compositeDescriptionPath)) raise def readData(self, roi, out_array=None): """ Read data from the fileset. If any of the requested data is not yet available locally, download it first. :param roi: The region of interest to read from the dataset. Must be a tuple of iterables: (start, stop). :param out_array: The location to store the read data. Must be the correct size for the given roi. If not provided, an array is created for you. :returns: The requested data. If out_array was provided, returns out_array. """ assert (numpy.array(roi[1]) <= numpy.array( self.localDescription.view_shape)).all( ), "Requested roi '{}' is out of dataset bounds '{}'".format( roi, self.localDescription.view_shape) # Before reading the data, make sure all the blocks we'll need to access are available on disk. block_starts = getIntersectingBlocks(self.localDescription.block_shape, roi) self._waitForBlocks(block_starts) return super(RESTfulBlockwiseFileset, self).readData(roi, out_array) def _waitForBlocks(self, block_starts): """ Initiate downloads for those blocks that need it. (Some blocks in the list may already be downloading.) Then wait for all necessary downloads to complete (including the ones that we didn't initiate). """ # Only wait for those that are missing. missing_blocks = [] for block_start in block_starts: if self.getBlockStatus( block_start) == BlockwiseFileset.BLOCK_NOT_AVAILABLE: missing_blocks.append(block_start) # Start by creating all necessary directories. self._ensureDirectoriesExist(missing_blocks) # Attempt to lock each path we need to create. 
# Locks we fail to obtain are already being fetched by other processes, which is okay. acquired_locks = [] unobtained_locks = [] for block_start in missing_blocks: entire_block_roi = self.getEntireBlockRoi( block_start ) # Roi of this whole block within the whole dataset blockFilePathComponents = self.getDatasetPathComponents( block_start) fileLock = FileLock(blockFilePathComponents.externalPath) if fileLock.acquire(False): acquired_locks.append((entire_block_roi, fileLock)) else: unobtained_locks.append((entire_block_roi, fileLock)) # We are now responsible for downloading the data for the file paths we were able to lock. # Start a separate thread for each. downloadThreads = [] for block_roi, fileLock in acquired_locks: blockFilePathComponents = self.getDatasetPathComponents( block_roi[0]) th = threading.Thread( target=functools.partial(self._downloadBlock, fileLock, block_roi, blockFilePathComponents)) downloadThreads.append(th) # Start all the threads for th in downloadThreads: th.start() # Wait for them all to complete for th in downloadThreads: th.join() # Finally, wait for the blocks that we COULDN'T lock (they must be downloading in other processes somewhere...) for block_roi, fileLock in unobtained_locks: while self.getBlockStatus( block_roi[0]) == BlockwiseFileset.BLOCK_NOT_AVAILABLE: time.sleep(5) def _downloadBlock(self, fileLock, entire_block_roi, blockFilePathComponents): """ Download the data for the given block, then release its file lock. :param fileLock: The lock for the file we are about to create. MUST BE LOCKED already. :param entire_block_roi: The roi for the block to download. :param blockFilePathComponents: A lazyflow.utility.PathComponents object describing the location of the block dataset file. """ try: # The blockFilePath has already been offset to accomodate any view offset, but the roi has not. # Offset the roi coordinates before requesting them from the remote volume. 
translated_roi = [] translated_roi.append( numpy.add(entire_block_roi[0], self.description.view_origin)) translated_roi.append( numpy.add(entire_block_roi[1], self.description.view_origin)) logger.debug("Downloading block: {}".format(entire_block_roi[0])) self._remoteVolume.downloadSubVolume( translated_roi, blockFilePathComponents.totalPath()) self.setBlockStatus(entire_block_roi[0], BlockwiseFileset.BLOCK_AVAILABLE) finally: fileLock.release() def downloadAllBlocks(self, max_parallel, skip_preparation=False): """ Download all blocks in the local view. This is used in utility scripts for downloading an entire volume at once. This function is NOT intended to be used by multiple threads in parallel (i.e. it doesn't protect against downloading the same block twice.) """ view_shape = self.localDescription.view_shape view_roi = ([0] * len(view_shape), view_shape) block_starts = list( getIntersectingBlocks(self.localDescription.block_shape, view_roi)) if not skip_preparation: self._ensureDirectoriesExist(block_starts) # Only wait for those that are missing. blockQueue = Queue.Queue() for block_start in block_starts: if self.getBlockStatus( block_start) == BlockwiseFileset.BLOCK_NOT_AVAILABLE: blockQueue.put(block_start) num_blocks = blockQueue.qsize() logger.debug("Preparing to download {} blocks".format(num_blocks)) failedBlockQueue = Queue.Queue() threads = [] for _ in range(max_parallel): th = threading.Thread( target=functools.partial(self._downloadFromQueue, num_blocks, blockQueue, failedBlockQueue)) threads.append(th) th.start() for th in threads: th.join() errors = not failedBlockQueue.empty() while not failedBlockQueue.empty(): logger.error( "Failed to download block {}. Does it have a leftover lockfile?" .format(failedBlockQueue.get())) logger.debug("FINISHED DOWNLOADING.") if errors: logger.error( "There were errors during the download process. Check error log output!" 
) def _downloadFromQueue(self, num_blocks, blockQueue, failedBlockQueue): """ Helper function for downloadAllBlocks(), above. """ try: while not blockQueue.empty(): block_start = blockQueue.get(block=False) entire_block_roi = self.getEntireBlockRoi( block_start ) # Roi of this whole block within the whole dataset blockFilePathComponents = self.getDatasetPathComponents( block_start) # Obtain lock fileLock = FileLock(blockFilePathComponents.externalPath) if not fileLock.acquire(False): failedBlockQueue.put(block_start) else: try: # Download the block # (This function releases the lock for us.) self._downloadBlock(fileLock, entire_block_roi, blockFilePathComponents) logger.debug("Finished downloading {}/{}".format( num_blocks - blockQueue.qsize(), num_blocks)) except: if fileLock.locked(): fileLock.release() self.setBlockStatus( entire_block_roi[0], BlockwiseFileset.BLOCK_NOT_AVAILABLE) failedBlockQueue.put(block_start) raise except Queue.Empty: return def _ensureDirectoriesExist(self, block_starts): """ Create all directories that the provided blocks will be stored in. """ # If the directory already exists, ignore the resulting error. for block_start in block_starts: blockDir = self.getDatasetDirectory(block_start) try: os.makedirs(blockDir) except OSError, e: if e.errno != errno.EEXIST: raise
def setup(self):
    """
    Prepare the on-disk fixture for the tiled-volume tests and start the tile server.

    A fixed directory under the system temp dir is reused between runs, so the
    (slow) tile export only happens when the volume description text changed
    or the description file is missing.

    The directory ends up containing:

    - reference_volume.h5
    - volume_description.json
    - local_volume_description.json
    - transposed_volume_description.json
    - translated_volume_description.json
    - special_z_volume_description.json
    - [lots of png tiles..]

    Side effects: may rebind the module-level ``port`` and substitutes it into
    the module-level ``volume_description_text``.
    """
    global volume_description_text
    global port

    def _bind_localhost(candidate_port, report_port):
        # Bind a throw-away socket to check that the port is usable.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            # allow the socket port to be reused if in TIME_WAIT state
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(('localhost', candidate_port))
            if report_port:
                return sock.getsockname()[1]
        return None

    try:
        # try default/previous port
        _bind_localhost(port, False)
    except Exception:
        # find free port
        port = _bind_localhost(0, True)

    volume_description_text = volume_description_text.replace(
        '{port}', str(port))

    self.TILE_DIRECTORY = os.path.join(tempfile.gettempdir(),
                                       'testTiledVolume_data')
    tile_dir = self.TILE_DIRECTORY
    logger.debug("Using test directory: {}".format(tile_dir))

    self.REFERENCE_VOL_PATH = os.path.join(tile_dir,
                                           'reference_volume.h5/data')
    ref_components = PathComponents(self.REFERENCE_VOL_PATH)
    self.REFERENCE_VOL_FILE = ref_components.externalPath

    self.VOLUME_DESCRIPTION_FILE = os.path.join(
        tile_dir, 'volume_description.json')
    self.LOCAL_VOLUME_DESCRIPTION_FILE = os.path.join(
        tile_dir, 'local_volume_description.json')
    self.TRANSPOSED_VOLUME_DESCRIPTION_FILE = os.path.join(
        tile_dir, 'transposed_volume_description.json')
    self.TRANSLATED_VOLUME_DESCRIPTION_FILE = os.path.join(
        tile_dir, 'translated_volume_description.json')
    self.SPECIAL_Z_VOLUME_DESCRIPTION_FILE = os.path.join(
        tile_dir, 'special_z_volume_description.json')

    if not os.path.exists(tile_dir):
        print("Creating new tile directory: {}".format(tile_dir))
        os.mkdir(tile_dir)

    # Load the reference volume, generating it on the first run.
    if os.path.exists(self.REFERENCE_VOL_FILE):
        with h5py.File(self.REFERENCE_VOL_FILE, 'r') as ref_file:
            ref_vol = ref_file[ref_components.internalPath][:]
    else:
        ref_vol = numpy.random.randint(0, 255,
                                       (100, 600, 600)).astype(numpy.uint8)
        with h5py.File(self.REFERENCE_VOL_FILE, 'w') as ref_file:
            ref_file[ref_components.internalPath] = ref_vol

    # The fixture is stale if the description file is absent or differs
    # from the text we are about to serve.
    need_rewrite = True
    if os.path.exists(self.VOLUME_DESCRIPTION_FILE):
        with open(self.VOLUME_DESCRIPTION_FILE, 'r') as f:
            need_rewrite = f.read() != volume_description_text

    if need_rewrite:
        with open(self.VOLUME_DESCRIPTION_FILE, 'w') as f:
            f.write(volume_description_text)

        # Read the volume description as a JsonConfig Namespace
        volume_description = TiledVolume.readDescription(
            self.VOLUME_DESCRIPTION_FILE)

        def _write_variant(target_path, overrides):
            # Write a copy of the description with the given fields replaced.
            writer = JsonConfigParser(TiledVolume.DescriptionFields)
            variant = copy.copy(volume_description)
            for field_name, field_value in overrides:
                setattr(variant, field_name, field_value)
            writer.writeConfigFile(target_path, variant)

        # A local tile path instead of a URL
        _write_variant(
            self.LOCAL_VOLUME_DESCRIPTION_FILE,
            [("tile_url_format",
              tile_dir + "/tile_z{z_start:05}_y{y_start:05}_x{x_start:05}.png")])

        # Custom output axes
        _write_variant(self.TRANSPOSED_VOLUME_DESCRIPTION_FILE,
                       [("output_axes", "xyz")])

        # An origin translation
        _write_variant(self.TRANSLATED_VOLUME_DESCRIPTION_FILE,
                       [("view_origin_zyx", [10, 20, 30]),
                        ("shape_zyx", None)])

        # A special function for translating z-coordinates
        _write_variant(self.SPECIAL_Z_VOLUME_DESCRIPTION_FILE,
                       [("z_translation_function", "lambda z: z+11")])

        # Remove all old image tiles in the tile directory
        tile_extension = '.' + volume_description.format
        for name in os.listdir(tile_dir):
            if os.path.splitext(name)[1] == tile_extension:
                os.remove(os.path.join(tile_dir, name))

        # Write the new tiles
        export_to_tiles(ref_vol,
                        volume_description.tile_shape_2d_yx[0],
                        tile_dir,
                        print_progress=False)

        # To support testMissingTiles (below), remove slice 2
        for name in os.listdir(tile_dir):
            if not name.startswith("tile_z00002"):
                continue
            p = os.path.join(tile_dir, name)
            print("removing:", p)
            os.remove(p)

    # lastly, start the server
    self._start_server()
class BlockwiseFileset(object):
    """
    This class handles writing and reading a 'blockwise file set'.
    A 'blockwise file set' is a directory with a particular structure, which contains the entire dataset broken up into blocks.
    Important parameters (e.g. shape, dtype, blockshape) are specified in a JSON file, which must match the schema given by :py:data:`BlockwiseFileset.DescriptionFields`.
    The parent directory of the description file is considered to be the top-most directory in the blockwise dataset hierarchy.

    - Simultaneous reads are threadsafe.
    - NOT threadsafe for reading and writing simultaneously (or writing and writing).
    - NOT threadsafe for closing.  Do not call close() while reading or writing.

    .. note:: See the unit tests in ``tests/testBlockwiseFileset.py`` for example usage.
    """

    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = {
        "_schema_name": "blockwise-fileset-description",
        "_schema_version": 1.1,
        "name": str,
        "format": str,
        "axes": str,
        "shape": AutoEval(numpy.array),  # This is the shape of the dataset on disk
        "dtype": AutoEval(),
        "drange": AutoEval(tuple),  # Optional. Data range, e.g. (0.0, 1.0)
        "chunks": AutoEval(numpy.array),  # Optional.  If null, no chunking. Only used when writing data.
        "compression": str,  # Optional.  Options include 'lzf' and 'gzip', among others.  Note: h5py automatically enables chunking on compressed datasets.
        "compression_opts": AutoEval(int),  # Optional. Hdf5-specific
        "block_shape": AutoEval(numpy.array),
        "view_origin": AutoEval(
            numpy.array
        ),  # Optional.  Defaults to zeros.  All requests will be translated before the data is accessed.
        #            For example, if the offset is [100, 200, 300], then a request for roi([0,0,0],[2,2,2])
        #            will pull from the dataset on disk as though the request was ([100,200,300],[102,202,302]).
        #            It is an error to specify an view_origin that is not a multiple of the block_shape.
        "view_shape": AutoEval(
            numpy.array
        ),  # Optional.  Defaults to (shape - view_origin) Limits the shape of the provided data.
        "block_file_name_format": FormattedField(
            requiredFields=["roiString"]
        ),  # For hdf5, include dataset name, e.g. myfile_block{roiString}.h5/volume/data
        "dataset_root_dir": str,  # Abs path or relative to the description file itself. Defaults to "." if left blank.
        "hash_id": str,  # Not user-defined (clients may use this)
        # Added in schema v1.1
        "sub_block_shape": AutoEval(numpy.array),  # Optional.  Must divide evenly into the block shape.
    }

    DescriptionSchema = JsonConfigParser(DescriptionFields)

    #: Status codes returned by :py:meth:`getBlockStatus` and accepted by :py:meth:`setBlockStatus`.
    BLOCK_NOT_AVAILABLE = 0
    BLOCK_AVAILABLE = 1

    @classmethod
    def readDescription(cls, descriptionFilePath):
        """
        Parse the description file at the given path and return a
        :py:class:`jsonConfig.Namespace` object with the description parameters.
        The file will be parsed according to the schema given by :py:data:`BlockwiseFileset.DescriptionFields`.

        :param descriptionFilePath: The path to the description file to parse.
        """
        return BlockwiseFileset.DescriptionSchema.parseConfigFile(descriptionFilePath)

    @classmethod
    def writeDescription(cls, descriptionFilePath, descriptionFields):
        """
        Write a :py:class:`jsonConfig.Namespace` object to the given path.

        :param descriptionFilePath: The path to overwrite with the description fields.
        :param descriptionFields: The fields to write.
        """
        BlockwiseFileset.DescriptionSchema.writeConfigFile(descriptionFilePath, descriptionFields)

    class BlockNotReadyError(Exception):
        """
        This exception is raised if `readData()` is called for data that isn't available on disk.
        The start coordinate of the offending block is available as ``block_start``.
        """

        def __init__(self, block_start):
            # Initialise the base class so that ``args``/``str()`` carry the block start
            # (and so the exception can be re-created from ``args``).
            Exception.__init__(self, block_start)
            self.block_start = block_start

    @property
    def description(self):
        """
        The :py:class:`jsonConfig.Namespace` object that describes this dataset.
        """
        return self._description

    @classmethod
    def _createAndReturnBlockwiseFileset(cls, descriptionFilePath, mode):
        """
        Try to open ``descriptionFilePath`` as a plain BlockwiseFileset.

        :returns: The opened fileset, or ``None`` if the file does not match the
                  blockwise-fileset schema (``JsonConfigParser.SchemaError``).
        """
        try:
            return BlockwiseFileset(descriptionFilePath, mode)
        except JsonConfigParser.SchemaError:
            return None

    @classmethod
    def _prepare_system(cls):
        """
        Check the platform and raise the soft limit on open file descriptors to at least 4096
        (as far as the hard limit permits).
        """
        # None of this code is tested on Windows.
        # It might work, but you'll need to improve the unit tests to know for sure.
        assert (
            platform.system() != "Windows"
        ), "This code is all untested on Windows, and probably needs some modification before it will work."

        # If you get a "Too many open files" error, this soft limit may need to be increased.
        # The way to set this limit in bash is via "ulimit -n 4096"
        # Fortunately, Python lets us increase the limit via the resource module.
        import resource

        softlimit, hardlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
        if softlimit == resource.RLIM_INFINITY:
            # Already unlimited; never lower it.
            return

        desired = max(4096, softlimit)
        if hardlimit != resource.RLIM_INFINITY:
            # Asking for more than the hard limit makes setrlimit() raise ValueError.
            desired = min(desired, hardlimit)
        if desired != softlimit:
            resource.setrlimit(resource.RLIMIT_NOFILE, (desired, hardlimit))

    def __init__(self, descriptionFilePath, mode="r", preparsedDescription=None):
        """
        Constructor.  Uses `readDescription` internally.

        :param descriptionFilePath: The path to the .json file that describes the dataset.
        :param mode: Set to ``'r'`` if the fileset should be read-only.  The only other valid mode is ``'a'``.
        :param preparsedDescription: (Optional) Provide pre-parsed description fields, in which case the provided description file will not be parsed.
        """
        self._prepare_system()

        assert mode == "r" or mode == "a", "Valid modes are 'r' or 'a', not '{}'".format(mode)
        self.mode = mode

        assert (
            descriptionFilePath is not None
        ), "Must provide a path to the description file, even if you are providing pre-parsed fields. (Path is used to find block directory)."
        self._descriptionFilePath = descriptionFilePath

        if preparsedDescription is not None:
            self._description = preparsedDescription
        else:
            self._description = BlockwiseFileset.readDescription(descriptionFilePath)
        desc = self._description

        # Check for errors
        assert desc.format == "hdf5", "Only hdf5 blockwise filesets are supported so far."
        if desc.compression_opts is not None:
            assert (
                desc.compression is not None
            ), "You specified compression_opts={} without specifying a compression type".format(
                desc.compression_opts
            )

        drange = desc.drange
        if drange is not None:
            assert len(drange) == 2, "Invalid drange: {}".format(drange)
            assert drange[0] <= drange[1], "Invalid drange: {}".format(drange)

        # default view_origin
        if desc.view_origin is None:
            desc.view_origin = numpy.array((0,) * len(desc.shape))
        assert (
            numpy.mod(desc.view_origin, desc.block_shape) == 0
        ).all(), "view_origin is not compatible with block_shape.  Must be a multiple!"

        # default view_shape
        if desc.view_shape is None:
            desc.view_shape = numpy.subtract(desc.shape, desc.view_origin)
        view_roi = (desc.view_origin, numpy.add(desc.view_origin, desc.view_shape))
        assert (
            numpy.subtract(desc.shape, view_roi[1]) >= 0
        ).all(), "View ROI must not exceed on-disk shape: View roi: {}, on-disk shape: {}".format(
            view_roi, desc.shape
        )

        # This check runs after the view_shape default has been filled in, so that the
        # "only one block wide" exemption also works when view_shape was left unspecified.
        sub_block_shape = desc.sub_block_shape
        if sub_block_shape is not None:
            block_shape = desc.block_shape
            block_shape_mods = numpy.mod(block_shape, sub_block_shape) != 0
            nonfull_block_shape_dims = block_shape != desc.view_shape
            invalid_sub_block_dims = numpy.logical_and(nonfull_block_shape_dims, block_shape_mods)
            assert not numpy.any(invalid_sub_block_dims), (
                "Each dimension of sub_block_shape must divide evenly into block_shape,"
                " unless the total dataset is only one block wide in that dimension."
            )

        if desc.dataset_root_dir is None:
            # Default to same directory as the description file
            desc.dataset_root_dir = "."

        self._lock = threading.Lock()  # Guards _openBlockFiles / _fileLocks
        self._openBlockFiles = {}  # hdf5 file path -> open h5py.File
        self._fileLocks = {}  # hdf5 file path -> FileLock held while open in mode 'a'
        self._closed = False

    def __del__(self):
        # __init__ may have failed before _closed was assigned.
        if hasattr(self, "_closed") and not self._closed:
            self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """
        Close all open block files (and, in mode ``'a'``, release their write locks).
        """
        with self._lock:
            assert not self._closed
            for path, blockFile in list(self._openBlockFiles.items()):
                blockFile.close()
                if self.mode == "a":
                    self._fileLocks[path].release()
            self._openBlockFiles = {}
            self._fileLocks = {}
            self._closed = True

    def reopen(self, mode):
        """
        Re-enable a closed fileset, using the given mode for subsequently opened block files.
        """
        assert self._closed, "Can't reopen a fileset that isn't closed."
        self.mode = mode
        self._closed = False

    def readData(self, roi, out_array=None):
        """
        Read data from the fileset.

        :param roi: The region of interest to read from the dataset.  Must be a tuple of iterables: (start, stop).
        :param out_array: The location to store the read data.  Must be the correct size for the given roi.  If not provided, an array is created for you.
        :returns: The requested data.  If out_array was provided, returns out_array.
        :raises BlockwiseFileset.BlockNotReadyError: if a needed block is not marked available.
        """
        roi_shape = numpy.subtract(roi[1], roi[0])
        if out_array is None:
            out_array = numpy.ndarray(shape=roi_shape, dtype=self._description.dtype)
        assert (roi_shape == out_array.shape).all(), "out_array must match roi shape"
        assert (roi_shape != 0).all(), "Requested roi {} has zero volume!".format(roi)
        self._transferData(roi, out_array, read=True)
        return out_array

    def writeData(self, roi, data):
        """
        Write data to the fileset.

        Note that writing clears the status of every touched block;
        the caller is responsible for marking blocks available again (see :py:meth:`setBlockStatus`).

        :param roi: The region of interest to write the data to.  Must be a tuple of iterables: (start, stop).
        :param data: The data to write.  Must be the correct size for the given roi.
        """
        assert self.mode != "r"
        assert (numpy.subtract(roi[1], roi[0]) != 0).all(), "Requested roi {} has zero volume!".format(roi)
        self._transferData(roi, data, read=False)

    def getDatasetDirectory(self, blockstart):
        """
        Return the directory that contains the block that starts at the given (view) coordinates.
        The path has one nested directory level per axis, named ``<axis>_<start>``.
        """
        # Add the view origin to find the on-disk block coordinates
        blockstart = numpy.add(blockstart, self._description.view_origin)
        descriptionFileDir = os.path.split(self._descriptionFilePath)[0]
        absPath, _ = getPathVariants(self._description.dataset_root_dir, descriptionFileDir)
        axis_dirs = ["{}_{:08d}".format(axis, start) for axis, start in zip(self._description.axes, blockstart)]
        return os.path.join(absPath, *axis_dirs)

    def _getBlockFileName(self, block_start):
        """
        Get the name of the block file (including internal dataset path) for the block
        that starts at the given (view) coordinate.
        """
        # Translate to find disk block start
        block_start = numpy.add(self._description.view_origin, block_start)
        # Get true (disk) block bounds (i.e. use on-disk shape, not view_shape)
        entire_block_roi = getBlockBounds(self._description.shape, self._description.block_shape, block_start)
        roiString = "{}".format((list(entire_block_roi[0]), list(entire_block_roi[1])))
        return self._description.block_file_name_format.format(roiString=roiString)

    def getDatasetPathComponents(self, block_start):
        """
        Return a PathComponents object for the block file that corresponds to the given block start coordinate.
        """
        datasetFilename = self._getBlockFileName(block_start)
        datasetDir = self.getDatasetDirectory(block_start)
        return PathComponents(os.path.join(datasetDir, datasetFilename))

    def getBlockStatus(self, blockstart):
        """
        Check a block's status.
        (Just because a block file exists doesn't mean that it has valid data.)
        Returns a status code of either ``BlockwiseFileset.BLOCK_AVAILABLE`` or ``BlockwiseFileset.BLOCK_NOT_AVAILABLE``.
        """
        statusFilePath = os.path.join(self.getDatasetDirectory(blockstart), "STATUS.txt")
        if os.path.exists(statusFilePath):
            return BlockwiseFileset.BLOCK_AVAILABLE
        return BlockwiseFileset.BLOCK_NOT_AVAILABLE

    def isBlockLocked(self, blockstart):
        """
        Return True if the block is locked for writing.
        Note that both 'available' and 'not available' blocks might be locked.
        """
        hdf5FilePath = self.getDatasetPathComponents(blockstart).externalPath
        return not FileLock(hdf5FilePath).available()

    def setBlockStatus(self, blockstart, status):
        """
        Set a block status on disk.
        We use a simple convention: If the status file exists, the block is available.  Otherwise, it ain't.

        :param status: Must be either ``BlockwiseFileset.BLOCK_AVAILABLE`` or ``BlockwiseFileset.BLOCK_NOT_AVAILABLE``.
        """
        statusFilePath = os.path.join(self.getDatasetDirectory(blockstart), "STATUS.txt")
        if status == BlockwiseFileset.BLOCK_AVAILABLE:
            # touch the status file.
            open(statusFilePath, "w").close()
        elif os.path.exists(statusFilePath):
            # Remove the status file
            os.remove(statusFilePath)

    def setBlockStatusesForRoi(self, roi, status):
        """
        Apply :py:meth:`setBlockStatus` to every block that intersects ``roi``.
        """
        for block_start in getIntersectingBlocks(self._description.block_shape, roi):
            self.setBlockStatus(block_start, status)

    def getEntireBlockRoi(self, block_start):
        """
        Return the roi for the entire block that starts at the given coordinate (clipped to the view shape).
        """
        return getBlockBounds(self._description.view_shape, self._description.block_shape, block_start)

    def getAllBlockRois(self):
        """
        Return the list of rois for all VIEWED blocks in the dataset.
        """
        view_shape = self._description.view_shape
        entire_dataset_roi = ([0] * len(view_shape), view_shape)
        block_starts = getIntersectingBlocks(self._description.block_shape, entire_dataset_roi)
        return [self.getEntireBlockRoi(block_start) for block_start in block_starts]

    def _transferData(self, roi, array_data, read):
        """
        Read or write data from/to the fileset.

        :param roi: The region of interest.  Must lie entirely within the view.
        :param array_data: If ``read`` is True, ``array_data`` is the destination array for the read data.  If ``read`` is False, array_data contains the data to write to disk.
        :param read: If True, read data from the fileset into ``array_data``.  Otherwise, write data from ``array_data`` into the fileset on disk.
        :type read: bool
        """
        view_shape = self._description.view_shape
        entire_dataset_roi = ([0] * len(view_shape), view_shape)
        clipped_roi = getIntersection(roi, entire_dataset_roi)
        assert (
            numpy.array(clipped_roi) == numpy.array(roi)
        ).all(), "Roi {} does not fit within dataset bounds: {}".format(roi, view_shape)

        block_starts = getIntersectingBlocks(self._description.block_shape, roi)

        # TODO: Parallelize this loop?
        for block_start in block_starts:
            # Roi of this whole block within the whole dataset
            entire_block_roi = self.getEntireBlockRoi(block_start)
            # Roi of data needed from this block within the whole dataset
            transfer_block_roi = getIntersection(entire_block_roi, roi)
            # Roi of needed data from this block, relative to the block itself
            block_relative_roi = (transfer_block_roi[0] - block_start, transfer_block_roi[1] - block_start)
            # Roi of data needed from this block within array_data
            array_data_roi = (transfer_block_roi[0] - roi[0], transfer_block_roi[1] - roi[0])
            array_slicing = roiToSlice(*array_data_roi)

            self._transferBlockData(entire_block_roi, block_relative_roi, array_data, array_slicing, read)

    def _transferBlockData(self, entire_block_roi, block_relative_roi, array_data, array_slicing, read):
        """
        Read or write data to a single block in the fileset.

        :param entire_block_roi: The roi of the entire block, relative to the whole dataset.
        :param block_relative_roi: The roi of the data being read/written, relative to the block itself (not the whole dataset).
        :param array_data: Either the source or the destination of the data being transferred to/from the fileset on disk.
        :param array_slicing: The slicing of ``array_data`` that corresponds to ``block_relative_roi``.
        :param read: If True, read data from the block into ``array_data``.  Otherwise, write data from ``array_data`` into the block on disk.
        :type read: bool
        """
        datasetPathComponents = self.getDatasetPathComponents(entire_block_roi[0])

        if self._description.format == "hdf5":
            self._transferBlockDataHdf5(
                entire_block_roi, block_relative_roi, array_data, array_slicing, read, datasetPathComponents
            )
        else:
            assert False, "Unknown format"

    def _transferBlockDataHdf5(
        self, entire_block_roi, block_relative_roi, array_data, array_slicing, read, datasetPathComponents
    ):
        """
        Transfer a block of data to/from an hdf5 dataset.
        See _transferBlockData() for details.

        We use separate parameters for array_data and array_slicing to allow users to pass an hdf5 dataset for array_data.

        :raises RuntimeError: if the block file name format has no internal (dataset) path.
        :raises BlockwiseFileset.BlockNotReadyError: when reading a block that is not marked available.
        """
        # For the hdf5 format, the full path format INCLUDES the dataset name, e.g. /path/to/myfile.h5/volume/data
        path_parts = datasetPathComponents
        datasetDir = path_parts.externalDirectory
        hdf5FilePath = path_parts.externalPath
        if len(path_parts.internalPath) == 0:
            raise RuntimeError(
                "Your hdf5 block filename format MUST specify an internal path, e.g. block{roiString}.h5/volume/blockdata"
            )

        block_start = entire_block_roi[0]
        block_slicing = roiToSlice(*block_relative_roi)
        if read:
            # Check for problems before reading.
            if self.getBlockStatus(block_start) != BlockwiseFileset.BLOCK_AVAILABLE:
                raise BlockwiseFileset.BlockNotReadyError(block_start)

            hdf5File = self._getOpenHdf5Blockfile(hdf5FilePath)
            stored_as_pickles = self._description.dtype == object

            if (
                not stored_as_pickles
                and isinstance(array_data, numpy.ndarray)
                and array_data.flags.c_contiguous
            ):
                # Fast path: let h5py write straight into the destination buffer.
                hdf5File[path_parts.internalPath].read_direct(array_data, block_slicing, array_slicing)
            elif stored_as_pickles:
                # We store arrays of dtype=object as arrays of pickle strings.
                # NOTE(review): unpickling is only safe if the fileset comes from a trusted source.
                array_pickled_data = hdf5File[path_parts.internalPath][block_slicing]
                array_data[array_slicing] = vectorized_pickle_loads(array_pickled_data)
            else:
                array_data[array_slicing] = hdf5File[path_parts.internalPath][block_slicing]

        else:
            # Create the directory
            if not os.path.exists(datasetDir):
                os.makedirs(datasetDir)
                # For debug purposes, output a copy of the settings
                #  that were active **when this block was created**
                descriptionFileName = os.path.split(self._descriptionFilePath)[1]
                debugDescriptionFileCopyPath = os.path.join(datasetDir, descriptionFileName)
                BlockwiseFileset.writeDescription(debugDescriptionFileCopyPath, self._description)

            # Clear the block status.
            # The CALLER is responsible for setting it again.
            self.setBlockStatus(block_start, BlockwiseFileset.BLOCK_NOT_AVAILABLE)

            # Write the block data file
            hdf5File = self._getOpenHdf5Blockfile(hdf5FilePath)
            if path_parts.internalPath not in hdf5File:
                self._createDatasetInFile(hdf5File, path_parts.internalPath, entire_block_roi)
            dataset = hdf5File[path_parts.internalPath]
            data = array_data[array_slicing]
            if data.dtype != object:
                dataset[block_slicing] = data
            else:
                # hdf5 can't handle datasets with dtype=object,
                # so we have to pickle each item first.
                pickled_data = vectorized_pickle_dumps(data)
                for index in numpy.ndindex(pickled_data.shape):
                    block_index = index + numpy.array(block_relative_roi[0])
                    dataset[tuple(block_index)] = list(pickled_data[index])

    def _createDatasetInFile(self, hdf5File, datasetName, roi):
        """
        Create an hdf5 dataset sized for ``roi`` in the given open file,
        using the chunking, compression and dtype settings from the description.
        """
        shape = tuple(roi[1] - roi[0])
        chunks = self._description.chunks
        if chunks is not None:
            # chunks must not be bigger than the data in any dim
            chunks = tuple(numpy.minimum(chunks, shape))

        dtype = self._description.dtype
        if dtype == object:
            # Object arrays are stored as variable-length byte sequences (pickles).
            dtype = h5py.special_dtype(vlen=numpy.uint8)

        dataset = hdf5File.create_dataset(
            datasetName,
            shape=shape,
            dtype=dtype,
            chunks=chunks,
            compression=self._description.compression,
            compression_opts=self._description.compression_opts,
        )

        # Set data attributes
        if self._description.drange is not None:
            dataset.attrs["drange"] = self._description.drange
        if _use_vigra:
            dataset.attrs["axistags"] = vigra.defaultAxistags(str(self._description.axes)).toJSON()

    def _getOpenHdf5Blockfile(self, blockFilePath):
        """
        Return a handle to the open hdf5File at the given path.
        If we haven't opened the file yet, open it first.

        In mode ``'a'`` an exclusive write lock on the file is acquired and kept until :py:meth:`close`.
        """
        # Try once without locking
        if blockFilePath in self._openBlockFiles:
            return self._openBlockFiles[blockFilePath]

        # Obtain the lock and try again
        with self._lock:
            if blockFilePath not in self._openBlockFiles:
                try:
                    writeLock = FileLock(blockFilePath, timeout=10)
                    if self.mode == "a":
                        acquired = writeLock.acquire(blocking=False)
                        assert acquired, "Couldn't obtain an exclusive lock for writing to file: {}".format(
                            blockFilePath
                        )
                        self._fileLocks[blockFilePath] = writeLock
                    elif self.mode == "r":
                        assert writeLock.available(), "Can't read from a file that is being written to elsewhere."
                    else:
                        assert False, "Unsupported mode"
                    self._openBlockFiles[blockFilePath] = h5py.File(blockFilePath, self.mode)
                except:
                    # Log for diagnosis only; the error is always re-raised unchanged.
                    log_exception(logger, "Couldn't open {}".format(blockFilePath))
                    raise
            return self._openBlockFiles[blockFilePath]

    def getOpenHdf5FileForBlock(self, block_start):
        """
        Returns a handle to a file in this dataset.
        """
        block_start = tuple(block_start)
        path_components = self.getDatasetPathComponents(block_start)
        return self._getOpenHdf5Blockfile(path_components.externalPath)

    def purgeAllLocks(self):
        """
        Clears all .lock files from the local blockwise fileset.
        This may be necessary if previous processes crashed or were killed while some blocks were downloading.
        You must ensure that this is NOT called while more than one process (or thread) has access to the fileset.
        For example, in a master/worker situation, call this only from the master, before the workers have been started.

        :returns: True if at least one lock was purged.
        """
        found_lock = False

        view_shape = self.description.view_shape
        view_roi = ([0] * len(view_shape), view_shape)
        block_starts = list(getIntersectingBlocks(self.description.block_shape, view_roi))
        for block_start in block_starts:
            blockFilePathComponents = self.getDatasetPathComponents(block_start)
            fileLock = FileLock(blockFilePathComponents.externalPath)
            # Only report the blocks whose own lock was purged,
            # not every block visited after the first purge.
            if fileLock.purge():
                found_lock = True
                logger.warning("Purged lock for block: {}".format(tuple(block_start)))

        return found_lock

    def exportRoiToHdf5(self, roi, exportDirectory, use_view_coordinates=True):
        """
        Export an arbitrary roi to a single hdf5 file.
        The file will be placed in the given exportDirectory,
        and will be named according to the exported roi.

        :param roi: The roi to export
        :param exportDirectory: The directory in which the result should be placed.
        :param use_view_coordinates: If True, assume the roi was given relative to the view start.
                                     Otherwise, assume it was given relative to the on-disk coordinates.
        :returns: The full path (including internal dataset path) of the exported dataset.
        """
        roi = list(map(TinyVector, roi))
        view_origin = self.description.view_origin
        if not use_view_coordinates:
            abs_roi = roi
            # Reduce the per-axis comparison explicitly; an unreduced sequence is always truthy.
            assert numpy.all(
                numpy.asarray(abs_roi[0]) >= numpy.asarray(view_origin)
            ), "Roi {} is out-of-bounds: must not span lower than the view origin: {}".format(roi, view_origin)
            view_roi = roi - view_origin
        else:
            view_roi = roi
            abs_roi = view_roi + view_origin

        # Always name the file according to the absolute roi
        roiString = "{}".format((list(abs_roi[0]), list(abs_roi[1])))
        datasetPath = self._description.block_file_name_format.format(roiString=roiString)
        fullDatasetPath = os.path.join(exportDirectory, datasetPath)
        path_parts = PathComponents(fullDatasetPath)

        with h5py.File(path_parts.externalPath, "w") as f:
            self._createDatasetInFile(f, path_parts.internalPath, view_roi)
            dataset = f[path_parts.internalPath]
            # Read straight into the new hdf5 dataset.
            self.readData(view_roi, dataset)

        return fullDatasetPath

    def exportSubset(self, roi, exportDirectory, use_view_coordinates=True):
        """
        Create a new blockwise fileset by copying a subset of this blockwise fileset.

        :param roi: The portion to export.  Must be along block boundaries (in on-disk coordinates).
        :param exportDirectory: The directory to copy the new blockwise fileset to.
        :param use_view_coordinates: If True, assume the roi was given relative to the view start.
                                     Otherwise, assume it was given relative to the on-disk coordinates.
        :returns: The path of the description file of the exported fileset.
        """
        # For now, this implementation assumes it can simply copy EVERYTHING in the block directories,
        #  including lock files.  Therefore, we require that the fileset be opened in read-only mode.
        # If that's a problem, change this function to ignore lock files when copying (or purge them afterwards).
        roi = list(map(TinyVector, roi))
        if not use_view_coordinates:
            abs_roi = roi
            # Reduce the per-axis comparison explicitly; an unreduced sequence is always truthy.
            assert numpy.all(
                numpy.asarray(abs_roi[0]) >= numpy.asarray(self.description.view_origin)
            ), "Roi {} is out-of-bounds: must not span lower than the view origin: {}".format(
                roi, self.description.view_origin
            )
        else:
            abs_roi = roi + self.description.view_origin

        assert self.mode == "r", "Can't export from a fileset that is open in read/write mode."

        block_shape = self._description.block_shape
        abs_shape = self._description.shape
        view_origin = self._description.view_origin

        assert (abs_roi[0] % block_shape == 0).all(), "exportSubset() requires roi to start on a block boundary"
        assert (
            (abs_roi[1] % block_shape == 0) | (abs_roi[1] == abs_shape)
        ).all(), "exported subset must end on block or dataset boundary."

        if not os.path.exists(exportDirectory):
            os.makedirs(exportDirectory)

        source_desc_path = self._descriptionFilePath
        source_desc_dir, source_desc_filename = os.path.split(source_desc_path)
        source_root_dir = self.description.dataset_root_dir

        # Copy/update description file
        dest_desc_path = os.path.join(exportDirectory, source_desc_filename)
        if os.path.exists(dest_desc_path):
            dest_description = BlockwiseFileset.readDescription(dest_desc_path)
        else:
            dest_description = copy.copy(self._description)
            dest_description.view_shape = abs_roi[1] - view_origin
            dest_description.hash_id = None

        BlockwiseFileset.writeDescription(dest_desc_path, dest_description)

        # Determine destination root block dir
        if os.path.isabs(source_root_dir):
            source_root_dir = os.path.normpath(source_root_dir)
            source_root_dir_name = os.path.split(source_root_dir)[1]
            dest_root_dir = os.path.join(exportDirectory, source_root_dir_name)
        else:
            dest_root_dir = os.path.join(exportDirectory, source_root_dir)

        source_root_dir, _ = getPathVariants(source_root_dir, source_desc_dir)

        view_roi = abs_roi - view_origin
        block_starts = getIntersectingBlocks(block_shape, view_roi)
        for block_start in block_starts:
            source_block_dir = self.getDatasetDirectory(block_start)
            rel_block_dir = os.path.relpath(source_block_dir, source_root_dir)
            dest_block_dir = os.path.join(dest_root_dir, rel_block_dir)

            if os.path.exists(dest_block_dir):
                logger.info("Skipping existing block directory: {}".format(dest_block_dir))
            elif not os.path.exists(source_block_dir):
                logger.info("Skipping missing block directory: {}".format(source_block_dir))
            else:
                # Copy the entire block directory
                assert dest_block_dir[-1] != "/"
                dest_block_dir_parent = os.path.split(dest_block_dir)[0]
                if not os.path.exists(dest_block_dir_parent):
                    os.makedirs(dest_block_dir_parent)
                shutil.copytree(source_block_dir, dest_block_dir)

        return dest_desc_path
class TestJsonConfig(object):
    """
    Tests for JsonConfigParser: reading a config file against a schema,
    writing it back out, and rejecting a config with a repeated field.
    """

    # Schema for the nested "subconfig" field of TestSchema.
    SubConfigSchema = {
        "_schema_name": "sub-schema",
        "_schema_version": 1.1,

        "sub_settingA": str,
        "sub_settingB": str,
    }

    # Schema exercising each field type used by these tests.
    TestSchema = {
        "_schema_name": "test-schema",
        "_schema_version": 1.1,

        "string_setting": str,
        "int_setting": int,
        "auto_int_setting": AutoEval(int),
        "another_auto_int_setting": AutoEval(int),
        "bool_setting": bool,
        "formatted_setting": FormattedField(requiredFields=["user_name", "user_home_town"]),
        "array_setting": numpy.array,
        "array_from_string_setting": AutoEval(numpy.array),
        "roi_setting": RoiTuple(),

        "subconfig": JsonConfigParser(SubConfigSchema),
    }

    @classmethod
    def setupClass(cls):
        """Write the shared test config into a fresh temporary directory."""
        testConfig = """
{
    "_schema_name" : "test-schema",
    "_schema_version" : 1.0,

    "string_setting" : "This is a sentence.",
    "int_setting" : 42,
    "auto_int_setting" : "7*6",
    "another_auto_int_setting" : 43,
    "bool_setting" : true,
    "formatted_setting" : "Greetings, {user_name} from {user_home_town}!",
    "array_setting" : [1,2,3,4],
    "array_from_string_setting" : "[1, 1*2, 1*3, 1*4]",
    "roi_setting" : [[1,2,3,4,5], [6,7,8,9,10]],

    "subconfig" : {
        "_schema_name" : "sub-schema",
        "_schema_version" : 1.0,

        "sub_settingA" : "yes",
        "sub_settingB" : "no"
    }
}
"""
        cls.tempDir = tempfile.mkdtemp()
        cls.configpath = os.path.join(cls.tempDir, "config.json")
        logger.debug("Using config file: " + cls.configpath)
        with open(cls.configpath, 'w') as f:
            f.write(testConfig)

    @classmethod
    def teardownClass(cls):
        """Remove the temporary directory created by setupClass."""
        # If the user is debugging, don't delete the test files.
        if logger.level > logging.DEBUG:
            shutil.rmtree(cls.tempDir)

    def testRead(self):
        """Parse the shared config and check every field's parsed value."""
        configFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(TestJsonConfig.configpath)

        assert configFields.string_setting == "This is a sentence."
        assert configFields.int_setting == 42
        assert configFields.auto_int_setting == 42
        assert configFields.another_auto_int_setting == 43
        assert configFields.bool_setting is True
        assert configFields.formatted_setting.format(
            user_name="Stuart", user_home_town="Washington, DC"
        ) == "Greetings, Stuart from Washington, DC!"
        assert configFields.roi_setting == ((1, 2, 3, 4, 5), (6, 7, 8, 9, 10))

        assert isinstance(configFields.array_setting, numpy.ndarray)
        assert (configFields.array_setting == [1, 2, 3, 4]).all()
        assert isinstance(configFields.array_from_string_setting, numpy.ndarray)
        assert (configFields.array_from_string_setting == [1, 2, 3, 4]).all()

        # Check sub-config settings
        assert configFields.subconfig.sub_settingA == "yes"
        assert configFields.subconfig.sub_settingB == "no"

    def testWrite(self):
        """Modify some fields, write the config, and check it reads back unchanged."""
        configFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(TestJsonConfig.configpath)
        configFields.string_setting = "This is a different sentence."
        configFields.int_setting = 100
        configFields.bool_setting = False

        # Write it.
        newConfigFilePath = TestJsonConfig.configpath + "_2"
        JsonConfigParser(TestJsonConfig.TestSchema).writeConfigFile(newConfigFilePath, configFields)

        # Read it back.
        newConfigFields = JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(newConfigFilePath)
        assert newConfigFields == configFields, "Config field content was not preserved after writing/reading"

        # Compare the field ORDER of the re-read config against the one we wrote.
        # Only the keys are compared here: the values were checked just above, and
        # comparing array-valued items through list equality would be ambiguous.
        assert list(newConfigFields.__dict__.keys()) == list(configFields.__dict__.keys()), \
            "Config field ORDER was not preserved after writing/reading"

    @nose.tools.raises(JsonConfigParser.ParsingError)
    def testExceptionIfRepeatedFields(self):
        """
        This test creates a config that has an error: A field has been repeated.
        We expect to see an exception from the parser telling us about the mistake.
        (See decorator above.)
        """
        testConfig = """
{
    "_schema_name" : "test-schema",
    "_schema_version" : 1.0,

    "string_setting" : "First instance",
    "string_setting" : "Repeated instance"
}
"""
        tempDir = tempfile.mkdtemp()
        configpath = os.path.join(tempDir, "config.json")
        logger.debug("Using config file: " + configpath)
        with open(configpath, 'w') as f:
            f.write(testConfig)

        try:
            # Expected to raise; the parsed result is not needed.
            JsonConfigParser(TestJsonConfig.TestSchema).parseConfigFile(configpath)
        finally:
            # Clean up temporary file
            shutil.rmtree(tempDir)
class TiledVolume(object):
    """
    Given a directory of image tiles that make up a volume, produces numpy array volumes for arbitrary roi requests.
    """
    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = \
    {
        "_schema_name" : "tiled-volume-description",
        "_schema_version" : 1.0,

        "name" : str,
        "format" : str,
        "dtype" : AutoEval(),
        "bounds_zyx" : AutoEval(numpy.array),
        "shape_zyx" : AutoEval(numpy.array), # synonym for bounds_zyx (until we support offset_origin)
        "resolution_zyx" : AutoEval(numpy.array),

        "tile_shape_2d_yx" : AutoEval(numpy.array),

        # This doesn't change how the data is read from the server,
        #  but instead specifies the indexing order of the numpy volumes produced.
        "output_axes" : str,

        "cache_tiles" : bool,

        # Offset not supported for now...
        #"origin_offset" : AutoEval(numpy.array),

        # For now, 3D-only, sliced across Z
        # TODO: support 5D.

        # Allow multiple url schemes: tiles might be addressed via pixel coordinates or row/column indexing
        # (z_index and z_start are synonyms here -- either is allowed)
        "tile_url_format" : FormattedField( requiredFields=[],
                                            optionalFields=["x_start", "y_start", "z_start",
                                                            "x_stop", "y_stop", "z_stop",
                                                            "x_index", "y_index", "z_index"] ),
        "extend_slices" : list
    }
    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        """Parse the description file and fill in defaults for omitted fields."""
        # Read file
        description = TiledVolume.DescriptionSchema.parseConfigFile(descriptionFilePath)
        cls.updateDescription(description)
        return description

    @classmethod
    def updateDescription(cls, description):
        """
        Some description fields are optional.
        If they aren't provided in the description JSON file, then this function provides
        them with default values, based on the other description fields.
        """
        # Augment with default parameters.
        logger.debug(str(description))

        # offset not supported yet...
        #if description.origin_offset is None:
        #    description.origin_offset = numpy.array( [0]*len(description.bounds_zyx) )
        #description.shape = description.bounds_zyx - description.origin_offset

        # for now, there's no difference between shape and bounds
        if description.shape_zyx is not None and description.bounds_zyx is not None:
            assert all(description.shape_zyx == description.bounds_zyx)
        if description.shape_zyx is None:
            description.shape_zyx = tuple(description.bounds_zyx)
        if description.bounds_zyx is None:
            description.bounds_zyx = tuple(description.shape_zyx)

        if not description.output_axes:
            description.output_axes = "zyx"
        assert description.output_axes is None or set(description.output_axes) == set("zyx"), \
            "Axis order must include x,y,z (and nothing else)"

        if not description.extend_slices:
            description.extend_slices = []

        if description.cache_tiles is None:
            description.cache_tiles = False

    def __init__(self, descriptionFilePath):
        """Load the description file and prepare the output shape and slice remapping."""
        self.description = TiledVolume.readDescription(descriptionFilePath)
        self._session = None

        assert self.description.format in vigra.impex.listExtensions().split(), \
            "Unknown tile format: {}".format(self.description.format)

        assert self.description.tile_shape_2d_yx.shape == (2,)
        # updateDescription() may have set bounds_zyx to a plain tuple (when only
        # shape_zyx was given), so don't rely on it having a .shape attribute.
        assert numpy.asarray(self.description.bounds_zyx).shape == (3,)

        shape_dict = dict(zip('zyx', self.description.bounds_zyx))
        self.output_shape = tuple(shape_dict[k] for k in self.description.output_axes)

        # Maps each destination z-slice to the source slice that stands in for it.
        self._slice_remapping = {}
        for source, destinations in self.description.extend_slices:
            for dest in destinations:
                self._slice_remapping[dest] = source

    def close(self):
        """Close the http session, if one was ever created."""
        # The session is created lazily on the first tile retrieval,
        # so it is still None if nothing has been read yet.
        if self._session is not None:
            self._session.close()
            self._session = None

    def read(self, roi, result_out):
        """
        roi: (start, stop) tuples, ordered according to description.output_axes
        result_out: array that receives the data, indexed according to description.output_axes
        """
        output_axes = self.description.output_axes
        roi_transposed = zip(*roi)
        roi_dict = dict(zip(output_axes, roi_transposed))
        # list() so the result is a real sequence even where zip() returns an iterator
        roi = list(zip(*(roi_dict['z'], roi_dict['y'], roi_dict['x'])))

        # First, normalize roi and result to zyx order
        result_out = vigra.taggedView(result_out, output_axes)
        result_out = result_out.withAxes(*'zyx')

        assert numpy.array(roi).shape == (2, 3), "Invalid roi for 3D volume: {}".format(roi)
        roi = numpy.array(roi)
        assert (result_out.shape == (roi[1] - roi[0])).all()

        tile_blockshape = (1,) + tuple(self.description.tile_shape_2d_yx)
        tile_starts = getIntersectingBlocks(tile_blockshape, roi)

        PARALLEL_REQ = True

        # We use a fresh tmp dir for each read to avoid conflicts between parallel reads
        tmpdir = tempfile.mkdtemp()
        try:
            pool = RequestPool()
            for tile_start in tile_starts:
                tile_roi_in = getBlockBounds(self.description.shape_zyx, tile_blockshape, tile_start)
                tile_roi_in = numpy.array(tile_roi_in)

                # This tile's portion of the roi
                intersecting_roi = getIntersection(roi, tile_roi_in)
                intersecting_roi = numpy.array(intersecting_roi)

                # Compute slicing within destination array and slicing within this tile
                destination_relative_intersection = numpy.subtract(intersecting_roi, roi[0])
                tile_relative_intersection = intersecting_roi - tile_roi_in[0]

                # Get a view to the output slice
                result_region = result_out[roiToSlice(*destination_relative_intersection)]

                # Special feature:
                # Some slices are missing, in which case we provide fake data from a different slice.
                # Overwrite the rest args to pull data from an alternate source tile.
                z_start = tile_roi_in[0][0]
                if z_start in self._slice_remapping:
                    new_source_slice = self._slice_remapping[z_start]
                    tile_roi_in[0][0] = new_source_slice
                    tile_roi_in[1][0] = new_source_slice + 1

                # Floor division: tile indexes must be integers regardless of
                # which division semantics are in effect.
                tile_index = numpy.array(tile_roi_in[0]) // tile_blockshape
                rest_args = {'z_start': tile_roi_in[0][0],
                             'z_stop': tile_roi_in[1][0],
                             'y_start': tile_roi_in[0][1],
                             'y_stop': tile_roi_in[1][1],
                             'x_start': tile_roi_in[0][2],
                             'x_stop': tile_roi_in[1][2],
                             'z_index': tile_index[0],
                             'y_index': tile_index[1],
                             'x_index': tile_index[2]}

                # Quick sanity check
                assert rest_args['z_index'] == rest_args['z_start']

                retrieval_fn = partial(self._retrieve_tile, tmpdir, rest_args,
                                       tile_relative_intersection, result_region)

                if PARALLEL_REQ:
                    pool.add(Request(retrieval_fn))
                else:
                    # execute serially (leave the pool empty)
                    retrieval_fn()

            pool.wait()
        finally:
            # Clean up our temp files, even if a retrieval failed.
            shutil.rmtree(tmpdir)

    # For late imports
    requests = None
    PIL = None

    TEST_MODE = False  # For testing purposes only. See below.

    def _retrieve_tile(self, tmpdir, rest_args, tile_relative_intersection, data_out):
        """
        Fetch one tile and copy the requested part of it into data_out.

        tmpdir: directory for the temporary tile file (only written on the non-PIL path)
        rest_args: values substituted into description.tile_url_format
        tile_relative_intersection: roi of the wanted region, relative to the tile start
        data_out: destination array view; zero-filled if the server reports the tile as not found
        """
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        tile_url = self.description.tile_url_format.format(**rest_args)

        tmp_filename = 'z{z_start}_y{y_start}_x{x_start}'.format(**rest_args)
        tmp_filename += '.' + self.description.format
        tmp_filepath = os.path.join(tmpdir, tmp_filename)

        logger.debug("Retrieving {}".format(tile_url))
        try:
            if self._session is None:
                self._session = self._create_session()

            success = False
            tries = 0
            while not success:
                try:
                    r = self._session.get(tile_url)
                    success = True
                except requests.ConnectionError:
                    # This special 'pass' is here because we keep running into exceptions like this:
                    #   ConnectionError: HTTPConnectionPool(host='neurocean.int.janelia.org', port=6081):
                    #   Max retries exceeded with url: /ssd-3-tiles/abd1.5/43/24_25_0.jpg
                    #   (Caused by <class 'httplib.BadStatusLine'>: '')
                    # So now we loop a few times and only give up if something is really wrong.
                    if tries == 5:
                        raise  # give up
                    tries += 1
        except:
            # During testing, the server we're pulling from might be in our own process.
            # Apparently that means that it is not very responsive, leading to exceptions.
            # As a cheap workaround, just try one more time.
            if self.TEST_MODE:
                import time
                time.sleep(0.01)
                r = self._session.get(tile_url)
            else:
                raise

        if r.status_code == requests.codes.not_found:
            logger.warning("NOTFOUND: {}".format(tile_url))
            data_out[:] = 0
        else:
            USE_PIL = True
            if USE_PIL:
                # late import
                if not TiledVolume.PIL:
                    import PIL
                    import PIL.Image
                    TiledVolume.PIL = PIL
                PIL = TiledVolume.PIL

                img = numpy.asarray(PIL.Image.open(StringIO(r.content)))
                assert img.ndim == 2
                # img has axes xy, but we want zyx
                img = img[None]
                #img = img.transpose()[None]
            else:
                logger.debug("saving to {}".format(tmp_filepath))
                with open(tmp_filepath, 'wb') as f:
                    CHUNK_SIZE = 10 * 1024
                    for chunk in r.iter_content(CHUNK_SIZE):
                        f.write(chunk)

                # Read the image from the disk with vigra
                img = vigra.impex.readImage(tmp_filepath, dtype='NATIVE')
                assert img.ndim == 3
                assert img.shape[-1] == 1

                # img has axes xyc, but we want zyx
                img = img.transpose()[None, 0, :, :]

            # Copy just the part we need into the destination array
            assert img[roiToSlice(*tile_relative_intersection)].shape == data_out.shape
            data_out[:] = img[roiToSlice(*tile_relative_intersection)]

    @classmethod
    def _create_session(cls):
        """
        Generate a requests.Session object to use for this TiledVolume.
        Using a session allows us to benefit from a connection pool
          instead of establishing a new connection for every request.
        """
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        session = requests.Session()

        # Replace the session http adapters with ones that use larger connection pools
        n_threads = Request.global_thread_pool.num_workers
        adapter = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads)
        adapter2 = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads)
        session.mount('http://', adapter)
        session.mount('https://', adapter2)
        return session
def setup(self):
    """
    Generate a directory with all the files needed for this test.
    We use the same temporary directory every time, so we don't
    waste time regenerating the data if the test has already been run recently.

    The directory consists of the following files:
    - reference_volume.h5
    - volume_description.json
    - local_volume_description.json
    - transposed_volume_description.json
    - translated_volume_description.json
    - special_z_volume_description.json
    - [lots of png tiles..]
    """
    tmp = tempfile.gettempdir()
    self.TILE_DIRECTORY = os.path.join(tmp, "testTiledVolume_data")
    logger.debug("Using test directory: {}".format(self.TILE_DIRECTORY))
    self.REFERENCE_VOL_PATH = os.path.join(self.TILE_DIRECTORY, "reference_volume.h5/data")
    ref_vol_path_comp = PathComponents(self.REFERENCE_VOL_PATH)
    self.REFERENCE_VOL_FILE = ref_vol_path_comp.externalPath
    self.VOLUME_DESCRIPTION_FILE = os.path.join(self.TILE_DIRECTORY, "volume_description.json")
    self.LOCAL_VOLUME_DESCRIPTION_FILE = os.path.join(self.TILE_DIRECTORY, "local_volume_description.json")
    self.TRANSPOSED_VOLUME_DESCRIPTION_FILE = os.path.join(
        self.TILE_DIRECTORY, "transposed_volume_description.json"
    )
    self.TRANSLATED_VOLUME_DESCRIPTION_FILE = os.path.join(
        self.TILE_DIRECTORY, "translated_volume_description.json"
    )
    self.SPECIAL_Z_VOLUME_DESCRIPTION_FILE = os.path.join(self.TILE_DIRECTORY, "special_z_volume_description.json")

    if not os.path.exists(self.TILE_DIRECTORY):
        # Single-argument print() emits the same text under Python 2 and 3.
        print("Creating new tile directory: {}".format(self.TILE_DIRECTORY))
        os.mkdir(self.TILE_DIRECTORY)

    # Create the reference volume once; afterwards just load it back.
    if not os.path.exists(self.REFERENCE_VOL_FILE):
        ref_vol = numpy.random.randint(0, 255, (100, 600, 600)).astype(numpy.uint8)
        with h5py.File(self.REFERENCE_VOL_FILE, "w") as ref_file:
            ref_file[ref_vol_path_comp.internalPath] = ref_vol
    else:
        with h5py.File(self.REFERENCE_VOL_FILE, "r") as ref_file:
            ref_vol = ref_file[ref_vol_path_comp.internalPath][:]

    # The derived files are regenerated only if the description on disk
    # is missing or differs from the text this test expects.
    need_rewrite = False
    if not os.path.exists(self.VOLUME_DESCRIPTION_FILE):
        need_rewrite = True
    else:
        with open(self.VOLUME_DESCRIPTION_FILE, "r") as f:
            if f.read() != volume_description_text:
                need_rewrite = True

    if need_rewrite:
        with open(self.VOLUME_DESCRIPTION_FILE, "w") as f:
            f.write(volume_description_text)

        # Read the volume description as a JsonConfig Namespace
        volume_description = TiledVolume.readDescription(self.VOLUME_DESCRIPTION_FILE)

        # Write out a copy of the description, but with a local tile path instead of a URL
        config_helper = JsonConfigParser(TiledVolume.DescriptionFields)
        local_description = copy.copy(volume_description)
        local_description.tile_url_format = (
            self.TILE_DIRECTORY + "/tile_z{z_start:05}_y{y_start:05}_x{x_start:05}.png"
        )
        config_helper.writeConfigFile(self.LOCAL_VOLUME_DESCRIPTION_FILE, local_description)

        # Write out a copy of the description, but with custom output axes
        config_helper = JsonConfigParser(TiledVolume.DescriptionFields)
        transposed_description = copy.copy(volume_description)
        transposed_description.output_axes = "xyz"
        config_helper.writeConfigFile(self.TRANSPOSED_VOLUME_DESCRIPTION_FILE, transposed_description)

        # Write out another copy of the description, but with an origin translation
        config_helper = JsonConfigParser(TiledVolume.DescriptionFields)
        translated_description = copy.copy(volume_description)
        translated_description.view_origin_zyx = [10, 20, 30]
        translated_description.shape_zyx = None
        config_helper.writeConfigFile(self.TRANSLATED_VOLUME_DESCRIPTION_FILE, translated_description)

        # Write out another copy of the description, but with a special function for translating z-coordinates.
        config_helper = JsonConfigParser(TiledVolume.DescriptionFields)
        special_z_description = copy.copy(volume_description)
        special_z_description.z_translation_function = "lambda z: z+11"
        config_helper.writeConfigFile(self.SPECIAL_Z_VOLUME_DESCRIPTION_FILE, special_z_description)

        # Remove all old image tiles in the tile directory
        files = os.listdir(self.TILE_DIRECTORY)
        for name in files:
            if os.path.splitext(name)[1] == "." + volume_description.format:
                os.remove(os.path.join(self.TILE_DIRECTORY, name))

        # Write the new tiles
        export_to_tiles(ref_vol, volume_description.tile_shape_2d_yx[0], self.TILE_DIRECTORY, print_progress=False)

        # To support testMissingTiles (below), remove slice 2
        files = os.listdir(self.TILE_DIRECTORY)
        for name in files:
            if name.startswith("tile_z00002"):
                p = os.path.join(self.TILE_DIRECTORY, name)
                print("removing: {}".format(p))
                os.remove(p)

    # lastly, start the server
    self._start_server()
def setup(self):
    """
    Generate a directory with all the files needed for this test.
    We use the same temporary directory every time, so we don't
    waste time regenerating the data if the test has already been run recently.

    The directory consists of the following files:
    - reference_volume.h5
    - volume_description.json
    - transposed_volume_description.json
    - [lots of png tiles..]
    """
    tile_dir = os.path.join(tempfile.gettempdir(), 'testTiledVolume_data')
    self.TILE_DIRECTORY = tile_dir
    logger.debug("Using test directory: {}".format(self.TILE_DIRECTORY))

    self.REFERENCE_VOL_PATH = os.path.join(tile_dir, 'reference_volume.h5/data')
    ref_vol_path_comp = PathComponents(self.REFERENCE_VOL_PATH)
    self.REFERENCE_VOL_FILE = ref_vol_path_comp.externalPath
    self.VOLUME_DESCRIPTION_FILE = os.path.join(tile_dir, 'volume_description.json')
    self.TRANSPOSED_VOLUME_DESCRIPTION_FILE = os.path.join(tile_dir, 'transposed_volume_description.json')

    if not os.path.exists(self.TILE_DIRECTORY):
        # print() with one pre-formatted argument behaves the same on Python 2 and 3.
        print("Creating new tile directory: {}".format(self.TILE_DIRECTORY))
        os.mkdir(self.TILE_DIRECTORY)

    # Load the reference volume if it already exists, otherwise create and store it.
    if os.path.exists(self.REFERENCE_VOL_FILE):
        with h5py.File(self.REFERENCE_VOL_FILE, 'r') as ref_file:
            ref_vol = ref_file[ref_vol_path_comp.internalPath][:]
    else:
        ref_vol = numpy.random.randint(0, 255, (100, 600, 600)).astype(numpy.uint8)
        with h5py.File(self.REFERENCE_VOL_FILE, 'w') as ref_file:
            ref_file[ref_vol_path_comp.internalPath] = ref_vol

    # Regenerate the derived files only when the stored description is
    # absent or no longer matches the expected description text.
    if os.path.exists(self.VOLUME_DESCRIPTION_FILE):
        with open(self.VOLUME_DESCRIPTION_FILE, 'r') as f:
            need_rewrite = (f.read() != volume_description_text)
    else:
        need_rewrite = True

    if need_rewrite:
        with open(self.VOLUME_DESCRIPTION_FILE, 'w') as f:
            f.write(volume_description_text)

        # Read the volume description as a JsonConfig Namespace
        volume_description = TiledVolume.readDescription(self.VOLUME_DESCRIPTION_FILE)

        # Write out a copy of the description, but with custom output axes
        config_helper = JsonConfigParser(TiledVolume.DescriptionFields)
        transposed_description = copy.copy(volume_description)
        transposed_description.output_axes = "xyz"
        config_helper.writeConfigFile(self.TRANSPOSED_VOLUME_DESCRIPTION_FILE, transposed_description)

        # Remove all old image tiles in the tile directory
        tile_extension = '.' + volume_description.format
        for name in os.listdir(self.TILE_DIRECTORY):
            if os.path.splitext(name)[1] == tile_extension:
                os.remove(os.path.join(self.TILE_DIRECTORY, name))

        # Write the new tiles
        export_to_tiles(ref_vol,
                        volume_description.tile_shape_2d_yx[0],
                        self.TILE_DIRECTORY,
                        print_progress=False)

        # To support testMissingTiles (below), remove slice 2
        for name in os.listdir(self.TILE_DIRECTORY):
            if name.startswith("tile_z00002"):
                p = os.path.join(self.TILE_DIRECTORY, name)
                print("removing: {}".format(p))
                os.remove(p)

    # lastly, start the server
    self._start_server()
def __init__(self,
             shell,
             headless,
             workflow_cmdline_args,
             project_creation_args,
             hintoverlayFile=None,
             pmapoverlayFile=None,
             *args,
             **kwargs):
    """
    Create the workflow's applets and parse the optional split-tool parameter file.

    The first four positional arguments, *args and **kwargs are forwarded to the
    base-class constructor together with a freshly created Graph.
    hintoverlayFile and pmapoverlayFile are accepted but not used in this constructor.

    If workflow_cmdline_args contains --split_tool_param_file, that file is parsed
    with SplitToolParamsSchema and stored in self._split_tool_params; otherwise
    self._split_tool_params stays None.
    """
    graph = Graph()

    super(SplitBodyCarvingWorkflow, self).__init__(shell,
                                                   headless,
                                                   workflow_cmdline_args,
                                                   project_creation_args,
                                                   *args,
                                                   graph=graph,
                                                   **kwargs)

    ## Create applets
    self.projectMetadataApplet = ProjectMetadataApplet()
    self.dataSelectionApplet = DataSelectionApplet(self,
                                                   "Input Data",
                                                   "Input Data",
                                                   supportIlastik05Import=True,
                                                   batchDataGui=False)
    opDataSelection = self.dataSelectionApplet.topLevelOperator
    opDataSelection.DatasetRoles.setValue(['Raw Data', 'Pixel Probabilities', 'Raveler Labels'])

    self.preprocessingApplet = PreprocessingApplet(workflow=self,
                                                   title="Preprocessing",
                                                   projectFileGroupName="preprocessing")
    self.splitBodyCarvingApplet = SplitBodyCarvingApplet(workflow=self,
                                                         projectFileGroupName="carving")
    self.splitBodyPostprocessingApplet = SplitBodyPostprocessingApplet(workflow=self)
    self.splitBodySupervoxelExportApplet = SplitBodySupervoxelExportApplet(workflow=self)

    # Expose to shell
    self._applets = [
        self.projectMetadataApplet,
        self.dataSelectionApplet,
        self.preprocessingApplet,
        self.splitBodyCarvingApplet,
        self.splitBodyPostprocessingApplet,
        self.splitBodySupervoxelExportApplet,
    ]

    self._split_tool_params = None
    if workflow_cmdline_args:
        arg_parser = argparse.ArgumentParser(
            description="Specify parameters for the split-body carving workflow")
        arg_parser.add_argument('--split_tool_param_file', required=False)
        parsed_args, unused_args = arg_parser.parse_known_args(workflow_cmdline_args)
        # Logger.warn() is a deprecated alias; warning() is the supported spelling.
        if unused_args:
            logger.warning("Unused command-line args: %s", unused_args)

        if parsed_args.split_tool_param_file is None:
            logger.warning("Missing cmd-line arg: --split_tool_param_file")
        else:
            logger.debug("Parsing split tool parameters: %s", parsed_args.split_tool_param_file)
            json_parser = JsonConfigParser(SplitToolParamsSchema)
            self._split_tool_params = json_parser.parseConfigFile(parsed_args.split_tool_param_file)
from lazyflow.operators.generic import OpSingleChannelSelector
from preprocessingApplet import PreprocessingApplet
from lazyflow.utility.jsonConfig import JsonConfigParser

import logging
logger = logging.getLogger(__name__)

# Field schema for the split-tool parameter file.
# The three "*_info" entries are nested configs that reuse DatasetInfo.DatasetInfoSchema;
# the two remaining entries are plain strings.
SplitToolParamsSchema = \
{
    "_schema_name" : "split-body workflow params",
    "_schema_version" : 0.1,

    # Input data
    "raw_data_info" : JsonConfigParser( DatasetInfo.DatasetInfoSchema ),
    "pixel_probabilities_info" : JsonConfigParser( DatasetInfo.DatasetInfoSchema ),
    "raveler_labels_info" : JsonConfigParser( DatasetInfo.DatasetInfoSchema ),

    # Annotation (bookmarks) file
    "raveler_bookmarks_file" : str,

    # Supervoxel export file (special tool for FlyEM)
    "supervoxelized_bodies_export_path" : str
}

class SplitBodyCarvingWorkflow(Workflow):
    """Workflow subclass named "Split Body Tool Workflow"."""
    workflowName = "Split Body Tool Workflow"
    defaultAppletIndex = 1 # show DataSelection by default
def setup(self):
    """
    Generate a directory with all the files needed for this test.
    We use the same temporary directory every time, so we don't
    waste time regenerating the data if the test has already been run recently.

    The directory consists of the following files:
    - reference_volume.h5
    - volume_description.json
    - transposed_volume_description.json
    - [lots of png tiles..]
    """
    tmp = tempfile.gettempdir()
    self.TILE_DIRECTORY = os.path.join(tmp, 'testTiledVolume_data')
    logger.debug("Using test directory: {}".format(self.TILE_DIRECTORY))
    self.REFERENCE_VOL_PATH = os.path.join(self.TILE_DIRECTORY, 'reference_volume.h5/data')
    ref_vol_path_comp = PathComponents(self.REFERENCE_VOL_PATH)
    self.REFERENCE_VOL_FILE = ref_vol_path_comp.externalPath
    self.VOLUME_DESCRIPTION_FILE = os.path.join(self.TILE_DIRECTORY, 'volume_description.json')
    self.TRANSPOSED_VOLUME_DESCRIPTION_FILE = os.path.join(self.TILE_DIRECTORY,
                                                           'transposed_volume_description.json')

    if not os.path.exists(self.TILE_DIRECTORY):
        # A print() call with a single string prints identically on Python 2 and 3.
        print("Creating new tile directory: {}".format(self.TILE_DIRECTORY))
        os.mkdir(self.TILE_DIRECTORY)

    # The random reference volume is generated on first use and reloaded afterwards.
    if not os.path.exists(self.REFERENCE_VOL_FILE):
        ref_vol = numpy.random.randint(0, 255, (100, 600, 600)).astype(numpy.uint8)
        with h5py.File(self.REFERENCE_VOL_FILE, 'w') as ref_file:
            ref_file[ref_vol_path_comp.internalPath] = ref_vol
    else:
        with h5py.File(self.REFERENCE_VOL_FILE, 'r') as ref_file:
            ref_vol = ref_file[ref_vol_path_comp.internalPath][:]

    # Everything derived from the description is rebuilt only when the
    # description file is missing or its contents have changed.
    need_rewrite = not os.path.exists(self.VOLUME_DESCRIPTION_FILE)
    if not need_rewrite:
        with open(self.VOLUME_DESCRIPTION_FILE, 'r') as f:
            if f.read() != volume_description_text:
                need_rewrite = True

    if need_rewrite:
        with open(self.VOLUME_DESCRIPTION_FILE, 'w') as f:
            f.write(volume_description_text)

        # Read the volume description as a JsonConfig Namespace
        volume_description = TiledVolume.readDescription(self.VOLUME_DESCRIPTION_FILE)

        # Write out a copy of the description, but with custom output axes
        config_helper = JsonConfigParser(TiledVolume.DescriptionFields)
        transposed_description = copy.copy(volume_description)
        transposed_description.output_axes = "xyz"
        config_helper.writeConfigFile(self.TRANSPOSED_VOLUME_DESCRIPTION_FILE, transposed_description)

        # Remove all old image tiles in the tile directory
        files = os.listdir(self.TILE_DIRECTORY)
        for name in files:
            if os.path.splitext(name)[1] == '.' + volume_description.format:
                os.remove(os.path.join(self.TILE_DIRECTORY, name))

        # Write the new tiles
        export_to_tiles(ref_vol, volume_description.tile_shape_2d_yx[0], self.TILE_DIRECTORY, print_progress=False)

        # To support testMissingTiles (below), remove slice 2
        files = os.listdir(self.TILE_DIRECTORY)
        for name in files:
            if name.startswith("tile_z00002"):
                p = os.path.join(self.TILE_DIRECTORY, name)
                print("removing: {}".format(p))
                os.remove(p)

    # lastly, start the server
    self._start_server()