Example #1
class RESTfulVolume(object):
    """
    This class provides access to data obtained via a RESTful API (e.g. from http://openconnecto.me).
    A description of the remote volume must be provided via a JSON file,
    whose schema is specified by :py:data:`RESTfulVolume.DescriptionFields`.

    See the unit tests in ``tests/testRESTfulVolume.py`` for example usage.

    .. note:: This class does not keep track of the data you've already downloaded.
              Every call to :py:func:`downloadSubVolume()` results in a new download.
              For automatic blockwise local caching of remote datasets, see :py:class:`RESTfulBlockwiseFileset`.

    """

    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = {
        "_schema_name": "RESTful-volume-description",
        "_schema_version": 1.0,
        "name": str,
        "format": str,
        "axes": str,
        "dtype": AutoEval(),
        "bounds": AutoEval(numpy.array),
        "shape": AutoEval(numpy.array),  # Provided for you. Computed as bounds - origin_offset
        "origin_offset": AutoEval(numpy.array),
        "url_format": FormattedField(
            requiredFields=["x_start", "x_stop", "y_start", "y_stop", "z_start", "z_stop"],
            optionalFields=["t_start", "t_stop", "c_start", "c_stop"],
        ),
        "hdf5_dataset": str,
    }
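    # A hypothetical description file matching this schema might look like the
    # following (values are illustrative only, not taken from a real server).
    # Note that "shape" is omitted because it is computed for you:
    #
    # {
    #     "_schema_name" : "RESTful-volume-description",
    #     "_schema_version" : 1.0,
    #     "name" : "Example remote volume",
    #     "format" : "hdf5",
    #     "axes" : "zyx",
    #     "dtype" : "numpy.uint8",
    #     "origin_offset" : [0, 0, 0],
    #     "bounds" : [100, 2048, 2048],
    #     "url_format" : "http://example.com/cutout/hdf5/{x_start},{x_stop}/{y_start},{y_stop}/{z_start},{z_stop}/",
    #     "hdf5_dataset" : "cube"
    # }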
    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        """
        Parse the description file at the given path and return a
        :py:class:`jsonConfig.Namespace` object with the description parameters.
        The file will be parsed according to the schema given by :py:data:`RESTfulVolume.DescriptionFields`.
        Any optional parameters not provided by the user are filled in automatically.

        :param descriptionFilePath: The path to the description file to parse.
        """
        # Read file
        description = RESTfulVolume.DescriptionSchema.parseConfigFile(
            descriptionFilePath)
        cls.updateDescription(description)
        return description

    @classmethod
    def updateDescription(cls, description):
        """
        Some description fields are optional.
        If they aren't provided in the description JSON file, then this function provides
        them with default values, based on the other description fields.
        """
        # Augment with default parameters.
        logger.debug(str(description))
        if description.origin_offset is None:
            description.origin_offset = numpy.array([0] *
                                                    len(description.bounds))
        description.shape = description.bounds - description.origin_offset

    @classmethod
    def writeDescription(cls, descriptionFilePath, descriptionFields):
        """
        Write a :py:class:`jsonConfig.Namespace` object to the given path.

        :param descriptionFilePath: The path to overwrite with the description fields.
        :param descriptionFields: The fields to write.
        """
        RESTfulVolume.DescriptionSchema.writeConfigFile(
            descriptionFilePath, descriptionFields)

    def __init__(self, descriptionFilePath=None, preparsedDescription=None):
        """
        Constructor.  Uses `readDescription` internally.

        :param descriptionFilePath: The path to the .json file that describes the remote volume.
        :param preparsedDescription: (Optional) Provide pre-parsed description fields, in which
                                     case the provided description file will not be parsed.
        """
        if preparsedDescription is not None:
            assert descriptionFilePath is None, "Can't provide BOTH description file and pre-parsed description fields."
            self.description = preparsedDescription
        else:
            assert (
                descriptionFilePath is not None
            ), "Must provide either a description file or pre-parsed description fields"
            self.description = RESTfulVolume.readDescription(
                descriptionFilePath)

        # Check for errors
        assert all(a in "txyzc" for a in self.description.axes), \
            "Unknown axis type.  Known axes: txyzc  Your axes: {}".format(self.description.axes)
        assert self.description.format == "hdf5", "Only hdf5 RESTful volumes are supported so far."
        assert (
            self.description.hdf5_dataset is not None
        ), "RESTful volume description file must specify the hdf5_dataset name"

        if self.description.hdf5_dataset[0] != "/":
            self.description.hdf5_dataset = "/" + self.description.hdf5_dataset

    def downloadSubVolume(self, roi, outputDatasetPath):
        """
        Download a cutout volume from the remote dataset.

        :param roi: The subset of the volume to download, specified as a tuple of coordinates: ``(start, stop)``
        :param outputDatasetPath: The path to overwrite with the downloaded hdf5 file.
        """
        origin_offset = numpy.array(self.description.origin_offset)
        accessStart = numpy.array(roi[0])
        accessStart += origin_offset
        accessStop = numpy.array(roi[1])
        accessStop += origin_offset

        RESTArgs = {}
        for axisindex, axiskey in enumerate(self.description.axes):
            startKey = "{}_start".format(axiskey)
            stopKey = "{}_stop".format(axiskey)
            RESTArgs[startKey] = accessStart[axisindex]
            RESTArgs[stopKey] = accessStop[axisindex]

        # Download the ROI specified in the url to a HDF5 file
        url = self.description.url_format.format(**RESTArgs)
        logger.info("Opening url for region {}..{}: {}".format(
            roi[0], roi[1], url))

        pathComponents = PathComponents(outputDatasetPath)

        if pathComponents.internalPath != self.description.hdf5_dataset:
            # We could just open the file and rename the dataset to match what the user asked for, but that would probably be slow.
            # It's better just to force the caller to use the correct dataset name to begin with.
            raise RuntimeError(
                "The RESTful volume format uses internal dataset name '{}', but you seem to be expecting '{}'."
                .format(self.description.hdf5_dataset,
                        pathComponents.internalPath))
        logger.info("Downloading RESTful subvolume to file: {}".format(
            pathComponents.externalPath))

        urllib.request.urlretrieve(url, pathComponents.externalPath)
        logger.info("Finished downloading file: {}".format(
            pathComponents.externalPath))
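

# --- Usage sketch (illustrative only) ---------------------------------------
# A hedged example of driving RESTfulVolume.  The description path, roi, and
# output path below are hypothetical placeholders; see the note in the class
# docstring about caching via RESTfulBlockwiseFileset.
def _example_restful_download():
    # Parse the JSON description file and construct the volume accessor.
    volume = RESTfulVolume("/path/to/remote_volume_description.json")

    # Request a (start, stop) cutout, ordered according to description.axes.
    roi = ([0, 0, 0], [10, 512, 512])

    # The output path must end with the same internal hdf5 dataset name that
    # the description file declares; otherwise downloadSubVolume() raises.
    output_path = "/tmp/cutout.h5" + volume.description.hdf5_dataset
    volume.downloadSubVolume(roi, output_path)
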
Example #2
class TiledVolume(object):
    """
    Given a directory of image tiles that make up a volume, produces numpy array volumes for arbitrary roi requests.
    """
    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = \
    {
        "_schema_name" : "tiled-volume-description",
        "_schema_version" : 1.0,

        "name" : str,
        "format" : str,
        "dtype" : AutoEval(),
        "bounds_zyx" : AutoEval(numpy.array), # Maximum coordinates (+1)

        "view_origin_zyx" : AutoEval(numpy.array), # Optional offset for output 'view'
        "view_shape_zyx" : AutoEval(numpy.array), # Shape of the output 'view'.  If not provided, defaults to bounds - origin

        "resolution_zyx" : AutoEval(numpy.array),
        "tile_shape_2d_yx" : AutoEval(numpy.array),

        "is_rgb" : bool, # Indicates that we must convert to grayscale

        "username" : str,
        "password" : str,

        # This doesn't change how the data is read from the server,
        #  but instead specifies the indexing order of the numpy volumes produced.
        "output_axes" : str,

        "cache_tiles" : bool,

        # Offset not supported for now...
        #"origin_offset" : AutoEval(numpy.array),

        # For now we support 3D-only, sliced across Z (TODO: Support 5D?)

        # We allow multiple url schemes: tiles might be addressed via pixel coordinates or row/column indexing
        # (z_index and z_start are synonyms here -- either is allowed)
        # Example: pixel-wise tile names:
        #   "tile_url_format" : "http://my.tiles.org/my_tiles/{z_start}-{z_stop}/{y_start}-{y_stop}/{x_start}-{x_stop}.jpg"
        # Example: row/column-wise tile names
        #   "tile_url_format" : "http://my.tiles.org/my_tiles/{z_index}/{y_index}/{x_index}.jpg"

        # Also, local tile sources (filesystem, not http) are okay:
        # "tile_url_format" : "/my_hard_disk/my_tiles/{z_index}/{y_index}/{x_index}.jpg"
        "tile_url_format" : FormattedField( requiredFields=[],
                                            optionalFields=["x_start", "y_start", "z_start",
                                                            "x_stop",  "y_stop",  "z_stop",
                                                            "x_index", "y_index", "z_index",
                                                            "raveler_z_base"] ), # Special keyword for Raveler session directories.  See notes below.

        "invert_y_axis" : bool, # For raveler volumes, the y-axis coordinate is inverted.

        # A list of lists, mapping src slices to destination slices (for "filling in" missing slices)
        # Example: If slices 101, 102, 103 are missing data, you might want to simply repeat the data from slice 100:
        # "extend_slices" : [ [100, [101, 102, 103]] ]
        "extend_slices" : list,

        # Some tiled volumes have complicated mappings from "real" or "global" coordinates to url/filepath coordinates.
        # This field will be eval()'d before the tile is retrieved
        # For example, if the slices were named according to their position in nanometers instead of pixels, this might do the trick:
        # "z_translation_function" : "lambda z: 40*z"
        "z_translation_function" : str,

        # Optional data transform.  For example:
        # "data_transform_function" : "lambda a: a == 0",
        "data_transform_function" : str
    }
    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        # Read file
        description = TiledVolume.DescriptionSchema.parseConfigFile(
            descriptionFilePath)
        cls.updateDescription(description)
        return description

    @classmethod
    def updateDescription(cls, description):
        """
        Some description fields are optional.
        If they aren't provided in the description JSON file, then this function provides 
        them with default values, based on the other description fields.
        """
        # Augment with default parameters.
        logger.debug(str(description))

        if description.view_origin_zyx is None:
            description.view_origin_zyx = numpy.array(
                [0] * len(description.bounds_zyx))

        if description.view_shape_zyx is None:
            description.view_shape_zyx = description.bounds_zyx - description.view_origin_zyx

        if not description.output_axes:
            description.output_axes = "zyx"
        assert description.output_axes is None or set(description.output_axes) == set("zyx"), \
            "Axis order must include x,y,z (and nothing else)"

        if not description.extend_slices:
            description.extend_slices = []

        if description.cache_tiles is None:
            description.cache_tiles = False

    def __init__(self, descriptionFilePath):
        self.description = TiledVolume.readDescription(descriptionFilePath)
        self._session = None

        assert self.description.format in vigra.impex.listExtensions().split(), \
            "Unknown tile format: {}".format( self.description.format )

        assert self.description.tile_shape_2d_yx.shape == (2, )
        assert self.description.bounds_zyx.shape == (3, )
        assert self.description.view_shape_zyx.shape == (3, )

        shape_dict = dict(zip('zyx', self.description.view_shape_zyx))
        self.output_shape = tuple(shape_dict[k]
                                  for k in self.description.output_axes)

        self._slice_remapping = {}
        for source, destinations in self.description.extend_slices:
            for dest in destinations:
                self._slice_remapping[dest] = source

    def close(self):
        if self._session:
            self._session.close()

    def read(self, view_roi, result_out):
        """
        roi: (start, stop) tuples, ordered according to description.output_axes
             roi should be relative to the view
        """
        output_axes = self.description.output_axes
        roi_transposed = list(zip(*view_roi))
        roi_dict = dict(zip(output_axes, roi_transposed))
        view_roi = list(zip(*(roi_dict['z'], roi_dict['y'], roi_dict['x'])))

        # First, normalize roi and result to zyx order
        result_out = vigra.taggedView(result_out, output_axes)
        result_out = result_out.withAxes(*'zyx')

        assert numpy.array(view_roi).shape == (
            2, 3), "Invalid roi for 3D volume: {}".format(view_roi)
        view_roi = numpy.array(view_roi)
        assert (result_out.shape == (view_roi[1] - view_roi[0])).all()

        # User gave roi according to the view output.
        # Now offset it find global roi.
        roi = view_roi + self.description.view_origin_zyx

        tile_blockshape = (1, ) + tuple(self.description.tile_shape_2d_yx)
        tile_starts = getIntersectingBlocks(tile_blockshape, roi)

        pool = RequestPool()
        PARALLEL_REQ = True  # If False, tiles are retrieved serially (useful for debugging)
        for tile_start in tile_starts:
            tile_roi_in = getBlockBounds(self.description.bounds_zyx,
                                         tile_blockshape, tile_start)
            tile_roi_in = numpy.array(tile_roi_in)

            # This tile's portion of the roi
            intersecting_roi = getIntersection(roi, tile_roi_in)
            intersecting_roi = numpy.array(intersecting_roi)

            # Compute slicing within destination array and slicing within this tile
            destination_relative_intersection = numpy.subtract(
                intersecting_roi, roi[0])
            tile_relative_intersection = intersecting_roi - tile_roi_in[0]

            # Get a view to the output slice
            result_region = result_out[roiToSlice(
                *destination_relative_intersection)]

            rest_args = self._get_rest_args(tile_blockshape, tile_roi_in)
            if self.description.tile_url_format.startswith('http'):
                retrieval_fn = partial(self._retrieve_remote_tile, rest_args,
                                       tile_relative_intersection,
                                       result_region)
            else:
                retrieval_fn = partial(self._retrieve_local_tile, rest_args,
                                       tile_relative_intersection,
                                       result_region)

            if PARALLEL_REQ:
                pool.add(Request(retrieval_fn))
            else:
                # Execute serially (leave the pool empty)
                retrieval_fn()

        if PARALLEL_REQ:
            with Timer() as timer:
                pool.wait()
            logger.info("Loading {} tiles took a total of {}".format(
                len(tile_starts), timer.seconds()))

    def _get_rest_args(self, tile_blockshape, tile_roi_in):
        """
        For a single tile, return a dict of all possible parameters that can be substituted 
        into the tile_url_format string from the volume json description file.
        
        tile_blockshape: The 3D blockshape of the tile 
                         (since tiles are only 1 slice thick, the blockshape always begins with 1).
        tile_roi_in: The ROI within the total volume for a particular tile.
                     (Note that the size of the ROI is usually, but not always, the same as tile_blockshape.
                     Near the volume borders, the tile_roi_in may be smaller.)
        """
        assert sys.version_info.major == 2, "Alert! This function has not been tested "\
            "under python 3. Please remove this assertion and be wary of any strange behavior you encounter"
        # Special feature:
        # Some slices are missing, in which case we provide fake data from a different slice.
        # Overwrite the rest args to pull data from an alternate source tile.
        z_start = tile_roi_in[0][0]
        if z_start in self._slice_remapping:
            new_source_slice = self._slice_remapping[z_start]
            tile_roi_in[0][0] = new_source_slice
            tile_roi_in[1][0] = new_source_slice + 1

        tile_index = numpy.array(tile_roi_in[0]) // tile_blockshape
        rest_args = {
            'z_start': tile_roi_in[0][0],
            'z_stop': tile_roi_in[1][0],
            'y_start': tile_roi_in[0][1],
            'y_stop': tile_roi_in[1][1],
            'x_start': tile_roi_in[0][2],
            'x_stop': tile_roi_in[1][2],
            'z_index': tile_index[0],
            'y_index': tile_index[1],
            'x_index': tile_index[2]
        }

        # Apply special z_translation_function
        if self.description.z_translation_function is not None:
            z_update_func = eval(self.description.z_translation_function)
            rest_args['z_index'] = rest_args['z_start'] = z_update_func(
                rest_args['z_index'])
            rest_args['z_stop'] = 1 + rest_args['z_start']

        # Quick sanity check
        assert rest_args['z_index'] == rest_args['z_start']

        # Special arg for Raveler session directories:
        # For files with Z < 1000, no extra directory level
        # For files with Z >= 1000, there is an extra directory level,
        #  in which case the extra '/' is INCLUDED here in the rest arg.
        raveler_z_base = (rest_args['z_index'] // 1000) * 1000
        if raveler_z_base == 0:
            rest_args['raveler_z_base'] = ""
        else:
            rest_args['raveler_z_base'] = str(raveler_z_base) + '/'

        return rest_args

    def _retrieve_local_tile(self, rest_args, tile_relative_intersection,
                             data_out):
        tile_path = self.description.tile_url_format.format(**rest_args)
        logger.debug("Opening {}".format(tile_path))

        if not os.path.exists(tile_path):
            logger.error("Tile does not exist: {}".format(tile_path))
            data_out[...] = 0
            return

        # Read the image from the disk with vigra
        img = vigra.impex.readImage(tile_path, dtype='NATIVE')
        assert img.ndim == 3
        if self.description.is_rgb:
            # "Convert" to grayscale -- just take first channel.
            img = img[..., 0:1]
        assert img.shape[-1] == 1, "Image has more channels than expected.  "\
                                   "If it is RGB, be sure to set the is_rgb flag in your description json."

        # img has axes xyc, but we want zyx
        img = img.transpose()[None, 0, :, :]

        if self.description.invert_y_axis:
            # More special Raveler support:
            # Raveler's Y-axis convention is the reverse of everyone else's.
            img = img[:, ::-1, :]

        # Copy just the part we need into the destination array
        assert img[roiToSlice(
            *tile_relative_intersection)].shape == data_out.shape
        data_out[:] = img[roiToSlice(*tile_relative_intersection)]

        # If there's a special transform, apply it now.
        if self.description.data_transform_function is not None:
            transform = eval(self.description.data_transform_function)
            data_out[:] = transform(data_out)

    # For late imports
    requests = None
    PIL = None

    TEST_MODE = False  # For testing purposes only. See below.

    def _retrieve_remote_tile(self, rest_args, tile_relative_intersection,
                              data_out):
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        tile_url = self.description.tile_url_format.format(**rest_args)
        logger.debug("Retrieving {}".format(tile_url))
        try:
            if self._session is None:
                self._session = self._create_session()

                # Provide authentication if we have the details.
                if self.description.username and self.description.password:
                    self._session.auth = (self.description.username,
                                          self.description.password)

            success = False
            tries = 0
            while not success:
                try:
                    # Note: We give timeout as a tuple, which requires a recent version of requests.
                    #       If you get an exception about that, upgrade your requests module.
                    r = self._session.get(tile_url, timeout=(3.0, 20.0))
                    success = True
                except requests.ConnectionError:
                    # This special 'pass' is here because we keep running into exceptions like this:
                    #   ConnectionError: HTTPConnectionPool(host='neurocean.int.janelia.org', port=6081):
                    #   Max retries exceeded with url: /ssd-3-tiles/abd1.5/43/24_25_0.jpg
                    #   (Caused by <class 'httplib.BadStatusLine'>: '')
                    # So now we loop a few times and only give up if something is really wrong.
                    if tries == 5:
                        raise  # give up
                    tries += 1
        except:
            # During testing, the server we're pulling from might be in our own process.
            # Apparently that means that it is not very responsive, leading to exceptions.
            # As a cheap workaround, just try one more time.
            if self.TEST_MODE:
                import time
                time.sleep(0.01)
                r = self._session.get(tile_url, timeout=(3.0, 20.0))
            else:
                raise

        if r.status_code == requests.codes.not_found:
            logger.warning("NOTFOUND: {}".format(tile_url))
            data_out[:] = 0
        else:
            # late import
            if not TiledVolume.PIL:
                import PIL
                import PIL.Image
                TiledVolume.PIL = PIL
            PIL = TiledVolume.PIL

            img = numpy.asarray(PIL.Image.open(StringIO(r.content)))
            if self.description.is_rgb:
                # "Convert" to grayscale -- just take first channel.
                assert img.ndim == 3
                img = img[..., 0]
            assert img.ndim == 2, "Image seems to be of the wrong dimension.  "\
                                  "If it is RGB, be sure to set the is_rgb flag in your description json."
            # img has axes xy, but we want zyx
            img = img[None]

            if self.description.invert_y_axis:
                # More special Raveler support:
                # Raveler's Y-axis convention is the reverse of everyone else's.
                img = img[:, ::-1, :]

            # Copy just the part we need into the destination array
            assert img[roiToSlice(
                *tile_relative_intersection)].shape == data_out.shape
            data_out[:] = img[roiToSlice(*tile_relative_intersection)]

            # If there's a special transform, apply it now.
            if self.description.data_transform_function is not None:
                transform = eval(self.description.data_transform_function)
                data_out[:] = transform(data_out)

    @classmethod
    def _create_session(cls):
        """
        Generate a requests.Session object to use for this TiledVolume.
        Using a session allows us to benefit from a connection pool 
          instead of establishing a new connection for every request.
        """
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        session = requests.Session()

        # Replace the session http adapters with ones that use larger connection pools
        n_threads = max(1, Request.global_thread_pool.num_workers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=n_threads,
                                                pool_maxsize=n_threads)
        adapter2 = requests.adapters.HTTPAdapter(pool_connections=n_threads,
                                                 pool_maxsize=n_threads)
        session.mount('http://', adapter)
        session.mount('https://', adapter2)
        return session
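

# --- Usage sketch (illustrative only) ---------------------------------------
# A hedged example of driving TiledVolume with a local (filesystem) tile source.
# The description contents and paths below are hypothetical; the url format
# follows the row/column-indexed scheme documented in DescriptionFields above.
#
# Example description file (json):
# {
#     "_schema_name" : "tiled-volume-description",
#     "_schema_version" : 1.0,
#     "name" : "Example tile stack",
#     "format" : "jpg",
#     "dtype" : "numpy.uint8",
#     "bounds_zyx" : [100, 4096, 4096],
#     "resolution_zyx" : [40, 4, 4],
#     "tile_shape_2d_yx" : [512, 512],
#     "tile_url_format" : "/my_hard_disk/my_tiles/{z_index}/{y_index}/{x_index}.jpg",
#     "output_axes" : "zyx"
# }
def _example_tiled_read():
    volume = TiledVolume("/path/to/tiled_volume_description.json")

    # Allocate a destination array in output_axes order and read a
    # (start, stop) roi that is relative to the view.
    roi = ([0, 0, 0], [1, 1024, 1024])
    result = numpy.zeros(numpy.subtract(roi[1], roi[0]), dtype=volume.description.dtype)
    volume.read(roi, result)
    volume.close()
    return result
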
Example #3
class BlockwiseFileset(object):
    """
    This class handles writing and reading a 'blockwise file set'.
    A 'blockwise file set' is a directory with a particular structure, which contains the entire dataset broken up into blocks.
    Important parameters (e.g. shape, dtype, blockshape) are specified in a JSON file, which must match the schema given by :py:data:`BlockwiseFileset.DescriptionFields`.
    The parent directory of the description file is considered to be the top-most directory in the blockwise dataset hierarchy.

    - Simultaneous reads are threadsafe.
    - NOT threadsafe for reading and writing simultaneously (or writing and writing).
    - NOT threadsafe for closing.  Do not call close() while reading or writing.

    .. note:: See the unit tests in ``tests/testBlockwiseFileset.py`` for example usage.
    """

    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = {
        "_schema_name": "blockwise-fileset-description",
        "_schema_version": 1.1,
        "name": str,
        "format": str,
        "axes": str,
        "shape": AutoEval(numpy.array),  # This is the shape of the dataset on disk
        "dtype": AutoEval(),
        "drange": AutoEval(tuple),  # Optional. Data range, e.g. (0.0, 1.0)
        "chunks": AutoEval(numpy.array),  # Optional.  If null, no chunking. Only used when writing data.
        "compression": str,  # Optional.  Options include 'lzf' and 'gzip', among others.  Note: h5py automatically enables chunking on compressed datasets.
        "compression_opts": AutoEval(int),  # Optional. Hdf5-specific
        "block_shape": AutoEval(numpy.array),
        "view_origin": AutoEval(
            numpy.array
        ),  # Optional.  Defaults to zeros.  All requests will be translated before the data is accessed.
        # For example, if the offset is [100, 200, 300], then a request for roi([0,0,0],[2,2,2])
        #  will pull from the dataset on disk as though the request was ([100,200,300],[102,202,302]).
        # It is an error to specify a view_origin that is not a multiple of the block_shape.
        "view_shape": AutoEval(
            numpy.array
        ),  # Optional.  Defaults to (shape - view_origin).  Limits the shape of the provided data.
        "block_file_name_format": FormattedField(
            requiredFields=["roiString"]
        ),  # For hdf5, include dataset name, e.g. myfile_block{roiString}.h5/volume/data
        "dataset_root_dir": str,  # Abs path or relative to the description file itself. Defaults to "." if left blank.
        "hash_id": str,  # Not user-defined (clients may use this)
        # Added in schema v1.1
        "sub_block_shape": AutoEval(numpy.array),  # Optional.  Must divide evenly into the block shape.
    }

    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        """
        Parse the description file at the given path and return a
        :py:class:`jsonConfig.Namespace` object with the description parameters.
        The file will be parsed according to the schema given by :py:data:`BlockwiseFileset.DescriptionFields`.

        :param descriptionFilePath: The path to the description file to parse.
        """
        return BlockwiseFileset.DescriptionSchema.parseConfigFile(descriptionFilePath)

    @classmethod
    def writeDescription(cls, descriptionFilePath, descriptionFields):
        """
        Write a :py:class:`jsonConfig.Namespace` object to the given path.

        :param descriptionFilePath: The path to overwrite with the description fields.
        :param descriptionFields: The fields to write.
        """
        BlockwiseFileset.DescriptionSchema.writeConfigFile(descriptionFilePath, descriptionFields)

    class BlockNotReadyError(Exception):
        """
        This exception is raised if `readData()` is called for data that isn't available on disk.
        """

        def __init__(self, block_start):
            self.block_start = block_start

    @property
    def description(self):
        """
        The :py:class:`jsonConfig.Namespace` object that describes this dataset.
        """
        return self._description

    @classmethod
    def _createAndReturnBlockwiseFileset(cls, descriptionFilePath, mode):
        try:
            bfs = BlockwiseFileset(descriptionFilePath, mode)
        except JsonConfigParser.SchemaError:
            bfs = None
        return bfs

    @classmethod
    def _prepare_system(cls):
        # None of this code is tested on Windows.
        # It might work, but you'll need to improve the unit tests to know for sure.
        assert (
            platform.system() != "Windows"
        ), "This code is all untested on Windows, and probably needs some modification before it will work."

        # If you get a "Too many open files" error, this soft limit may need to be increased.
        # The way to set this limit in bash is via "ulimit -n 4096"
        # Fortunately, Python lets us increase the limit via the resource module.
        import resource

        softlimit, hardlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
        softlimit = max(4096, softlimit)
        resource.setrlimit(resource.RLIMIT_NOFILE, (softlimit, hardlimit))

    def __init__(self, descriptionFilePath, mode="r", preparsedDescription=None):
        """
        Constructor.  Uses `readDescription` internally.

        :param descriptionFilePath: The path to the .json file that describes the dataset.
        :param mode: Set to ``'r'`` if the fileset should be read-only.
        :param preparsedDescription: (Optional) Provide pre-parsed description fields, in which case the provided description file will not be parsed.
        """
        self._prepare_system()

        assert mode == "r" or mode == "a", "Valid modes are 'r' or 'a', not '{}'".format(mode)
        self.mode = mode

        assert (
            descriptionFilePath is not None
        ), "Must provide a path to the description file, even if you are providing pre-parsed fields. (Path is used to find block directory)."
        self._descriptionFilePath = descriptionFilePath

        if preparsedDescription is not None:
            self._description = preparsedDescription
        else:
            self._description = BlockwiseFileset.readDescription(descriptionFilePath)

        # Check for errors
        assert self._description.format == "hdf5", "Only hdf5 blockwise filesets are supported so far."
        if self._description.compression_opts is not None:
            assert (
                self._description.compression is not None
            ), "You specified compression_opts={} without specifying a compression type".format(
                self._description.compression
            )
        drange = self._description.drange
        if drange is not None:
            assert len(drange) == 2, "Invalid drange: {}".format(drange)
            assert drange[0] <= drange[1], "Invalid drange: {}".format(drange)

        sub_block_shape = self._description.sub_block_shape
        if sub_block_shape is not None:
            block_shape = self._description.block_shape
            block_shape_mods = numpy.mod(block_shape, sub_block_shape) != 0
            nonfull_block_shape_dims = block_shape != self._description.view_shape
            invalid_sub_block_dims = numpy.logical_and(nonfull_block_shape_dims, block_shape_mods)
            assert (invalid_sub_block_dims == False).all(), (
                "Each dimension of sub_block_shape must divide evenly into block_shape,"
                " unless the total dataset is only one block wide in that dimension."
            )

        # default view_origin
        if self._description.view_origin is None:
            self._description.view_origin = numpy.array((0,) * len(self._description.shape))
        assert (
            numpy.mod(self._description.view_origin, self._description.block_shape) == 0
        ).all(), "view_origin is not compatible with block_shape.  Must be a multiple!"

        # default view_shape
        if self._description.view_shape is None:
            self._description.view_shape = numpy.subtract(self._description.shape, self._description.view_origin)
        view_roi = (
            self._description.view_origin,
            numpy.add(self._description.view_origin, self._description.view_shape),
        )
        assert (
            numpy.subtract(self._description.shape, view_roi[1]) >= 0
        ).all(), "View ROI must not exceed on-disk shape: View roi: {}, on-disk shape: {}".format(
            view_roi, self._description.shape
        )

        if self._description.dataset_root_dir is None:
            # Default to same directory as the description file
            self._description.dataset_root_dir = "."

        self._lock = threading.Lock()
        self._openBlockFiles = {}
        self._fileLocks = {}
        self._closed = False

    def __del__(self):
        if hasattr(self, "_closed") and not self._closed:
            self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """
        Close all open block files.
        """
        with self._lock:
            assert not self._closed
            paths = list(self._openBlockFiles.keys())
            for path in paths:
                blockFile = self._openBlockFiles[path]
                blockFile.close()
                if self.mode == "a":
                    fileLock = self._fileLocks[path]
                    fileLock.release()
            self._openBlockFiles = {}
            self._fileLocks = {}
            self._closed = True

    def reopen(self, mode):
        assert self._closed, "Can't reopen a fileset that isn't closed."
        self.mode = mode
        self._closed = False

    def readData(self, roi, out_array=None):
        """
        Read data from the fileset.

        :param roi: The region of interest to read from the dataset.  Must be a tuple of iterables: (start, stop).
        :param out_array: The location to store the read data.  Must be the correct size for the given roi.  If not provided, an array is created for you.
        :returns: The requested data.  If out_array was provided, returns out_array.
        """
        if out_array is None:
            out_array = numpy.ndarray(shape=numpy.subtract(roi[1], roi[0]), dtype=self._description.dtype)
        roi_shape = numpy.subtract(roi[1], roi[0])
        assert (roi_shape == out_array.shape).all(), "out_array must match roi shape"
        assert (roi_shape != 0).all(), "Requested roi {} has zero volume!".format(roi)
        self._transferData(roi, out_array, read=True)
        return out_array

    def writeData(self, roi, data):
        """
        Write data to the fileset.

        :param roi: The region of interest to write the data to.  Must be a tuple of iterables: (start, stop).
        :param data: The data to write.  Must be the correct size for the given roi.
        """
        assert self.mode != "r"
        assert (numpy.subtract(roi[1], roi[0]) != 0).all(), "Requested roi {} has zero volume!".format(roi)

        self._transferData(roi, data, read=False)

    def getDatasetDirectory(self, blockstart):
        """
        Return the directory that contains the block that starts at the given coordinates.
        """
        # Add the view origin to find the on-disk block coordinates
        blockstart = numpy.add(blockstart, self._description.view_origin)
        descriptionFileDir = os.path.split(self._descriptionFilePath)[0]
        absPath, _ = getPathVariants(self._description.dataset_root_dir, descriptionFileDir)
        blockFilePath = absPath

        for axis, start in zip(self._description.axes, blockstart):
            blockFilePath = os.path.join(blockFilePath, "{}_{:08d}".format(axis, start))
        return blockFilePath

    def _getBlockFileName(self, block_start):
        """
        Get the path to the block file that starts at the given coordinate.
        """
        # Translate to find disk block start
        block_start = numpy.add(self._description.view_origin, block_start)
        # Get true (disk) block bounds (i.e. use on-disk shape, not view_shape)
        entire_block_roi = getBlockBounds(self._description.shape, self._description.block_shape, block_start)
        roiString = "{}".format((list(entire_block_roi[0]), list(entire_block_roi[1])))
        datasetFilename = self._description.block_file_name_format.format(roiString=roiString)
        return datasetFilename

    def getDatasetPathComponents(self, block_start):
        """
        Return a PathComponents object for the block file that corresponds to the given block start coordinate.
        """
        datasetFilename = self._getBlockFileName(block_start)
        datasetDir = self.getDatasetDirectory(block_start)
        datasetPath = os.path.join(datasetDir, datasetFilename)

        return PathComponents(datasetPath)

    BLOCK_NOT_AVAILABLE = 0
    BLOCK_AVAILABLE = 1

    def getBlockStatus(self, blockstart):
        """
        Check a block's status.
        (Just because a block file exists doesn't mean that it has valid data.)
        Returns a status code of either ``BlockwiseFileset.BLOCK_AVAILABLE`` or ``BlockwiseFileset.BLOCK_NOT_AVAILABLE``.
        """
        blockDir = self.getDatasetDirectory(blockstart)
        statusFilePath = os.path.join(blockDir, "STATUS.txt")

        if not os.path.exists(statusFilePath):
            return BlockwiseFileset.BLOCK_NOT_AVAILABLE
        else:
            return BlockwiseFileset.BLOCK_AVAILABLE

    def isBlockLocked(self, blockstart):
        """
        Return True if the block is locked for writing.
        Note that both 'available' and 'not available' blocks might be locked.
        """
        datasetPathComponents = self.getDatasetPathComponents(blockstart)
        hdf5FilePath = datasetPathComponents.externalPath
        testLock = FileLock(hdf5FilePath)
        return not testLock.available()

    def setBlockStatus(self, blockstart, status):
        """
        Set a block status on disk.
        We use a simple convention: If the status file exists, the block is available.  Otherwise, it ain't.

        :param status: Must be either ``BlockwiseFileset.BLOCK_AVAILABLE`` or ``BlockwiseFileset.BLOCK_NOT_AVAILABLE``.
        """
        blockDir = self.getDatasetDirectory(blockstart)
        statusFilePath = os.path.join(blockDir, "STATUS.txt")

        if status == BlockwiseFileset.BLOCK_AVAILABLE:
            # touch the status file.
            open(statusFilePath, "w").close()
        elif os.path.exists(statusFilePath):
            # Remove the status file
            os.remove(statusFilePath)

    def setBlockStatusesForRoi(self, roi, status):
        block_starts = getIntersectingBlocks(self._description.block_shape, roi)
        for block_start in block_starts:
            self.setBlockStatus(block_start, status)

    def getEntireBlockRoi(self, block_start):
        """
        Return the roi for the entire block that starts at the given coordinate.
        """
        return getBlockBounds(self._description.view_shape, self._description.block_shape, block_start)

    def getAllBlockRois(self):
        """
        Return the list of rois for all VIEWED blocks in the dataset.
        """
        entire_dataset_roi = ([0] * len(self._description.view_shape), self._description.view_shape)
        block_starts = getIntersectingBlocks(self._description.block_shape, entire_dataset_roi)
        rois = []
        for block_start in block_starts:
            rois.append(self.getEntireBlockRoi(block_start))
        return rois

    def _transferData(self, roi, array_data, read):
        """
        Read or write data from/to the fileset.

        :param roi: The region of interest.
        :param array_data: If ``read`` is True, ``array_data`` is the destination array for the read data.  If ``read`` is False, array_data contains the data to write to disk.
        :param read: If True, read data from the fileset into ``array_data``.  Otherwise, write data from ``array_data`` into the fileset on disk.
        :type read: bool
        """
        entire_dataset_roi = ([0] * len(self._description.view_shape), self._description.view_shape)
        clipped_roi = getIntersection(roi, entire_dataset_roi)
        assert (
            numpy.array(clipped_roi) == numpy.array(roi)
        ).all(), "Roi {} does not fit within dataset bounds: {}".format(roi, self._description.view_shape)

        block_starts = getIntersectingBlocks(self._description.block_shape, roi)

        # TODO: Parallelize this loop?
        for block_start in block_starts:
            entire_block_roi = self.getEntireBlockRoi(block_start)  # Roi of this whole block within the whole dataset
            transfer_block_roi = getIntersection(
                entire_block_roi, roi
            )  # Roi of data needed from this block within the whole dataset
            block_relative_roi = (
                transfer_block_roi[0] - block_start,
                transfer_block_roi[1] - block_start,
            )  # Roi of needed data from this block, relative to the block itself
            array_data_roi = (
                transfer_block_roi[0] - roi[0],
                transfer_block_roi[1] - roi[0],
            )  # Roi of data needed from this block within array_data

            array_slicing = roiToSlice(*array_data_roi)
            self._transferBlockData(entire_block_roi, block_relative_roi, array_data, array_slicing, read)

    def _transferBlockData(self, entire_block_roi, block_relative_roi, array_data, array_slicing, read):
        """
        Read or write data to a single block in the fileset.

        :param entire_block_roi: The roi of the entire block, relative to the whole dataset.
        :param block_relative_roi: The roi of the data being read/written, relative to the block itself (not the whole dataset).
        :param array_data: Either the source or the destination of the data being transferred to/from the fileset on disk.
        :param read: If True, read data from the block into ``array_data``.  Otherwise, write data from ``array_data`` into the block on disk.
        :type read: bool
        """
        datasetPathComponents = self.getDatasetPathComponents(entire_block_roi[0])

        if self._description.format == "hdf5":
            self._transferBlockDataHdf5(
                entire_block_roi, block_relative_roi, array_data, array_slicing, read, datasetPathComponents
            )
        else:
            assert False, "Unknown format"

    def _transferBlockDataHdf5(
        self, entire_block_roi, block_relative_roi, array_data, array_slicing, read, datasetPathComponents
    ):
        """
        Transfer a block of data to/from an hdf5 dataset.
        See _transferBlockData() for details.

        We use separate parameters for array_data and array_slicing to allow users to pass an hdf5 dataset for array_data.
        """
        # For the hdf5 format, the full path format INCLUDES the dataset name, e.g. /path/to/myfile.h5/volume/data
        path_parts = datasetPathComponents
        datasetDir = path_parts.externalDirectory
        hdf5FilePath = path_parts.externalPath
        if len(path_parts.internalPath) == 0:
            raise RuntimeError(
                "Your hdf5 block filename format MUST specify an internal path, e.g. block{roiString}.h5/volume/blockdata"
            )

        block_start = entire_block_roi[0]
        if read:
            # Check for problems before reading.
            if self.getBlockStatus(block_start) != BlockwiseFileset.BLOCK_AVAILABLE:
                raise BlockwiseFileset.BlockNotReadyError(block_start)

            hdf5File = self._getOpenHdf5Blockfile(hdf5FilePath)

            if (
                self._description.dtype != object
                and isinstance(array_data, numpy.ndarray)
                and array_data.flags.c_contiguous
            ):
                hdf5File[path_parts.internalPath].read_direct(
                    array_data, roiToSlice(*block_relative_roi), array_slicing
                )
            elif self._description.dtype == object:
                # We store arrays of dtype=object as arrays of pickle strings.
                array_pickled_data = hdf5File[path_parts.internalPath][roiToSlice(*block_relative_roi)]
                array_data[array_slicing] = vectorized_pickle_loads(array_pickled_data)
            else:
                array_data[array_slicing] = hdf5File[path_parts.internalPath][roiToSlice(*block_relative_roi)]

        else:
            # Create the directory
            if not os.path.exists(datasetDir):
                os.makedirs(datasetDir)
                # For debug purposes, output a copy of the settings
                #  that were active **when this block was created**
                descriptionFileName = os.path.split(self._descriptionFilePath)[1]
                debugDescriptionFileCopyPath = os.path.join(datasetDir, descriptionFileName)
                BlockwiseFileset.writeDescription(debugDescriptionFileCopyPath, self._description)

            # Clear the block status.
            # The CALLER is responsible for setting it again.
            self.setBlockStatus(block_start, BlockwiseFileset.BLOCK_NOT_AVAILABLE)

            # Write the block data file
            hdf5File = self._getOpenHdf5Blockfile(hdf5FilePath)
            if path_parts.internalPath not in hdf5File:
                self._createDatasetInFile(hdf5File, path_parts.internalPath, entire_block_roi)
            dataset = hdf5File[path_parts.internalPath]
            data = array_data[array_slicing]
            if data.dtype != object:
                dataset[roiToSlice(*block_relative_roi)] = data
            else:
                # hdf5 can't handle datasets with dtype=object,
                #  so we have to pickle each item first.
                pickled_data = vectorized_pickle_dumps(data)
                for index in numpy.ndindex(pickled_data.shape):
                    block_index = index + numpy.array(block_relative_roi[0])
                    dataset[tuple(block_index)] = list(pickled_data[index])

    def _createDatasetInFile(self, hdf5File, datasetName, roi):
        shape = tuple(roi[1] - roi[0])
        chunks = self._description.chunks
        if chunks is not None:
            # chunks must not be bigger than the data in any dim
            chunks = numpy.minimum(chunks, shape)
            chunks = tuple(chunks)
        compression = self._description.compression
        compression_opts = self._description.compression_opts

        dtype = self._description.dtype
        if dtype == object:
            dtype = h5py.special_dtype(vlen=numpy.uint8)
        dataset = hdf5File.create_dataset(
            datasetName,
            shape=shape,
            dtype=dtype,
            chunks=chunks,
            compression=compression,
            compression_opts=compression_opts,
        )

        # Set data attributes
        if self._description.drange is not None:
            dataset.attrs["drange"] = self._description.drange
        if _use_vigra:
            dataset.attrs["axistags"] = vigra.defaultAxistags(str(self._description.axes)).toJSON()

    def _getOpenHdf5Blockfile(self, blockFilePath):
        """
        Return a handle to the open hdf5File at the given path.
        If we haven't opened the file yet, open it first.
        """
        # Try once without locking
        if blockFilePath in list(self._openBlockFiles.keys()):
            return self._openBlockFiles[blockFilePath]

        # Obtain the lock and try again
        with self._lock:
            if blockFilePath not in list(self._openBlockFiles.keys()):
                try:
                    writeLock = FileLock(blockFilePath, timeout=10)
                    if self.mode == "a":
                        acquired = writeLock.acquire(blocking=False)
                        assert acquired, "Couldn't obtain an exclusive lock for writing to file: {}".format(
                            blockFilePath
                        )
                        self._fileLocks[blockFilePath] = writeLock
                    elif self.mode == "r":
                        assert writeLock.available(), "Can't read from a file that is being written to elsewhere."
                    else:
                        assert False, "Unsupported mode"
                    self._openBlockFiles[blockFilePath] = h5py.File(blockFilePath, self.mode)
                except:
                    log_exception(logger, "Couldn't open {}".format(blockFilePath))
                    raise
            return self._openBlockFiles[blockFilePath]

    def getOpenHdf5FileForBlock(self, block_start):
        """
        Returns a handle to a file in this dataset.
        """
        block_start = tuple(block_start)
        path_components = self.getDatasetPathComponents(block_start)
        return self._getOpenHdf5Blockfile(path_components.externalPath)

    def purgeAllLocks(self):
        """
        Clears all .lock files from the local blockwise fileset.
        This may be necessary if previous processes crashed or were killed while some blocks were downloading.
        You must ensure that this is NOT called while more than one process (or thread) has access to the fileset.
        For example, in a master/worker situation, call this only from the master, before the workers have been started.
        """
        found_lock = False

        view_shape = self.description.view_shape
        view_roi = ([0] * len(view_shape), view_shape)
        block_starts = list(getIntersectingBlocks(self.description.block_shape, view_roi))
        for block_start in block_starts:
            blockFilePathComponents = self.getDatasetPathComponents(block_start)
            fileLock = FileLock(blockFilePathComponents.externalPath)
            purged = fileLock.purge()
            found_lock |= purged
            if purged:
                logger.warning("Purged lock for block: {}".format(tuple(block_start)))

        return found_lock

    def exportRoiToHdf5(self, roi, exportDirectory, use_view_coordinates=True):
        """
        Export an arbitrary roi to a single hdf5 file.
        The file will be placed in the given exportDirectory,
        and will be named according to the exported roi.

        :param roi: The roi to export
        :param exportDirectory: The directory in which the result should be placed.
        :param use_view_coordinates: If True, assume the roi was given relative to the view start.
                                     Otherwise, assume it was given relative to the on-disk coordinates.
        """
        roi = list(map(TinyVector, roi))
        if not use_view_coordinates:
            abs_roi = roi
            assert (
                abs_roi[0] >= self.description.view_origin
            ), "Roi {} is out-of-bounds: must not span lower than the view origin: {}".format(
                roi, self.description.view_origin
            )
            view_roi = roi - self.description.view_origin
        else:
            view_roi = roi
            abs_roi = view_roi + self.description.view_origin

        # Always name the file according to the absolute roi
        roiString = "{}".format((list(abs_roi[0]), list(abs_roi[1])))
        datasetPath = self._description.block_file_name_format.format(roiString=roiString)
        fullDatasetPath = os.path.join(exportDirectory, datasetPath)
        path_parts = PathComponents(fullDatasetPath)

        with h5py.File(path_parts.externalPath, "w") as f:
            self._createDatasetInFile(f, path_parts.internalPath, view_roi)
            dataset = f[path_parts.internalPath]
            self.readData(view_roi, dataset)

        return fullDatasetPath

    def exportSubset(self, roi, exportDirectory, use_view_coordinates=True):
        """
        Create a new blockwise fileset by copying a subset of this blockwise fileset.

        :param roi: The portion to export.  Must be along block boundaries, in ABSOLUTE coordinates.
        :param exportDirectory: The directory to copy the new blockwise fileset to.
        """
        # For now, this implementation assumes it can simply copy EVERYTHING in the block directories,
        #  including lock files.  Therefore, we require that the fileset be opened in read-only mode.
        # If that's a problem, change this function to ignore lock files when copying (or purge them afterwards).
        roi = list(map(TinyVector, roi))
        if not use_view_coordinates:
            abs_roi = roi
            assert (
                abs_roi[0] >= self.description.view_origin
            ), "Roi {} is out-of-bounds: must not span lower than the view origin: {}".format(
                roi, self.description.view_origin
            )
        else:
            abs_roi = roi + self.description.view_origin

        assert self.mode == "r", "Can't export from a fileset that is open in read/write mode."

        block_shape = self._description.block_shape
        abs_shape = self._description.shape
        view_origin = self._description.view_origin

        assert (abs_roi[0] % block_shape == 0).all(), "exportSubset() requires roi to start on a block boundary"
        assert (
            (abs_roi[1] % block_shape == 0) | (abs_roi[1] == abs_shape)
        ).all(), "exported subset must end on block or dataset boundary."

        if not os.path.exists(exportDirectory):
            os.makedirs(exportDirectory)

        source_desc_path = self._descriptionFilePath
        source_desc_dir, source_desc_filename = os.path.split(source_desc_path)
        source_root_dir = self.description.dataset_root_dir

        # Copy/update description file
        dest_desc_path = os.path.join(exportDirectory, source_desc_filename)
        if os.path.exists(dest_desc_path):
            dest_description = BlockwiseFileset.readDescription(dest_desc_path)
        else:
            dest_description = copy.copy(self._description)
            dest_description.view_shape = abs_roi[1] - view_origin
            dest_description.hash_id = None

        BlockwiseFileset.writeDescription(dest_desc_path, dest_description)

        # Determine destination root block dir
        if os.path.isabs(source_root_dir):
            source_root_dir = os.path.normpath(source_root_dir)
            source_root_dir_name = os.path.split(source_root_dir)[1]
            dest_root_dir = os.path.join(exportDirectory, source_root_dir_name)
        else:
            dest_root_dir = os.path.join(exportDirectory, source_root_dir)

        source_root_dir, _ = getPathVariants(source_root_dir, source_desc_dir)

        view_roi = abs_roi - view_origin
        block_starts = getIntersectingBlocks(block_shape, view_roi)
        for block_start in block_starts:
            source_block_dir = self.getDatasetDirectory(block_start)
            rel_block_dir = os.path.relpath(source_block_dir, source_root_dir)
            dest_block_dir = os.path.join(dest_root_dir, rel_block_dir)

            if os.path.exists(dest_block_dir):
                logger.info("Skipping existing block directory: {}".format(dest_block_dir))
            elif not os.path.exists(source_block_dir):
                logger.info("Skipping missing block directory: {}".format(source_block_dir))
            else:
                # Copy the entire block directory
                assert dest_block_dir[-1] != "/"
                dest_block_dir_parent = os.path.split(dest_block_dir)[0]
                if not os.path.exists(dest_block_dir_parent):
                    os.makedirs(dest_block_dir_parent)
                shutil.copytree(source_block_dir, dest_block_dir)

        return dest_desc_path
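
A minimal usage sketch for exportSubset(), with a hypothetical description file, export directory, and block shape; the exact import path for BlockwiseFileset may differ between lazyflow versions.  The fileset must be opened read-only, and the ROI must start on block boundaries (view coordinates by default):

from lazyflow.utility.io_util.blockwiseFileset import BlockwiseFileset  # import path may vary by version

description_path = "/data/volume/data_description.json"   # hypothetical description file
export_dir = "/data/volume_subset"                         # hypothetical destination directory

fileset = BlockwiseFileset(description_path, mode="r")     # export requires read-only mode
try:
    # Export the first 2x2x2 blocks, assuming block_shape is (100, 100, 100).
    roi = ((0, 0, 0), (200, 200, 200))
    new_description_path = fileset.exportSubset(roi, export_dir)
    print("Exported subset described by:", new_description_path)
finally:
    fileset.close()
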
class TestJsonConfig(object):
    
    SubConfigSchema = \
    {
        "_schema_name" : "sub-schema",
        "_schema_version" : 1.1,
        
        "sub_settingA" : str,
        "sub_settingB" : str
    }
    
    TestSchema = \
    {
        "_schema_name" : "test-schema",
        "_schema_version" : 1.1,

        "string_setting" : str,
        "int_setting" : int,
        "auto_int_setting" : AutoEval(int),
        "another_auto_int_setting" : AutoEval(int),
        "bool_setting" : bool,
        "formatted_setting" : FormattedField( requiredFields=["user_name", "user_home_town"]),
        "array_setting" : numpy.array,
        "array_from_string_setting" : AutoEval(numpy.array),
        "roi_setting" : RoiTuple(),
        
        "subconfig" : JsonConfigParser(SubConfigSchema)
    }
    
    @classmethod
    def setupClass(cls):
        testConfig = \
        """
        {
            "_schema_name" : "test-schema",
            "_schema_version" : 1.0,

            "string_setting" : "This is a sentence.",
            "int_setting" : 42,
            "auto_int_setting" : "7*6",
            "another_auto_int_setting" : 43,
            "bool_setting" : true,
            "formatted_setting" : "Greetings, {user_name} from {user_home_town}!",
            "array_setting" : [1,2,3,4],
            "array_from_string_setting" : "[1, 1*2, 1*3, 1*4]",
            "roi_setting" : [[1,2,3,4,5], [6,7,8,9,10]],
            
            "subconfig" :   {
                                "_schema_name" : "sub-schema",
                                "_schema_version" : 1.0,
                                
                                "sub_settingA" : "yes",
                                "sub_settingB" : "no"
                            }
        }
        """
        cls.tempDir = tempfile.mkdtemp()
        cls.configpath = os.path.join(cls.tempDir, "config.json")
        logger.debug("Using config file: " + cls.configpath)
        with open(cls.configpath, 'w') as f:
            f.write(testConfig)
    
    @classmethod
    def teardownClass(cls):
        # If the user is debugging, don't delete the test files.
        if logger.level > logging.DEBUG:
            shutil.rmtree(cls.tempDir)
    
    def testRead(self):
        configFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( TestJsonConfig.configpath )

        assert configFields.string_setting == "This is a sentence."
        assert configFields.int_setting == 42
        assert configFields.auto_int_setting == 42
        assert configFields.another_auto_int_setting == 43
        assert configFields.bool_setting is True
        assert configFields.formatted_setting.format( user_name="Stuart", user_home_town="Washington, DC" ) == "Greetings, Stuart from Washington, DC!"
        assert configFields.roi_setting == ((1,2,3,4,5), (6,7,8,9,10))
        
        assert isinstance(configFields.array_setting, numpy.ndarray)
        assert (configFields.array_setting == [1,2,3,4]).all()
        assert isinstance(configFields.array_from_string_setting, numpy.ndarray)
        assert (configFields.array_from_string_setting == [1,2,3,4]).all()
        
        # Check sub-config settings
        assert configFields.subconfig.sub_settingA == "yes"
        assert configFields.subconfig.sub_settingB == "no"

    def testWrite(self):
        configFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( TestJsonConfig.configpath )
        configFields.string_setting = "This is a different sentence."
        configFields.int_setting = 100
        configFields.bool_setting = False
        
        # Write it.
        newConfigFilePath = TestJsonConfig.configpath + "_2"
        JsonConfigParser( TestJsonConfig.TestSchema ).writeConfigFile( newConfigFilePath, configFields )
        
        # Read it back.
        newConfigFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( newConfigFilePath )
        assert newConfigFields == configFields, "Config field content was not preserved after writing/reading"
        assert list(newConfigFields.__dict__.items()) == list(configFields.__dict__.items()), "Config field ORDER was not preserved after writing/reading"

    @nose.tools.raises( JsonConfigParser.ParsingError )
    def testExceptionIfRepeatedFields(self):
        """
        This test creates a config that has an error: A field has been repeated.
        We expect to see an exception from the parser telling us that we screwed up.
        (See decorator above.)
        """

        testConfig = \
        """
        {
            "_schema_name" : "test-schema",
            "_schema_version" : 1.0,

            "string_setting" : "First instance",
            "string_setting" : "Repeated instance"
        }
        """
        tempDir = tempfile.mkdtemp()
        configpath = os.path.join(tempDir, "config.json")
        logger.debug("Using config file: " + configpath)
        with open(configpath, 'w') as f:
            f.write(testConfig)

        try:
            configFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( configpath )
        finally:
            # Clean up temporary file
            shutil.rmtree(tempDir)
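
The tests above exercise the main jsonConfig field types.  Below is a minimal, standalone sketch of the same coercions using a hypothetical mini-schema and config file (nothing here comes from the real test suite):

import json
import os
import tempfile

import numpy
from lazyflow.utility.jsonConfig import JsonConfigParser, AutoEval, FormattedField

# Hypothetical schema for illustration only.
MiniSchema = {
    "_schema_name": "mini-schema",
    "_schema_version": 1.0,
    "num_threads": AutoEval(int),                              # accepts 8 or "2*4"
    "greeting": FormattedField(requiredFields=["user_name"]),  # must contain {user_name}
    "shape": AutoEval(numpy.array),                            # accepts "[10, 20, 30]"
}

config = {
    "_schema_name": "mini-schema",
    "_schema_version": 1.0,
    "num_threads": "2*4",
    "greeting": "Hello, {user_name}!",
    "shape": "[10, 20, 30]",
}

tmpdir = tempfile.mkdtemp()
config_path = os.path.join(tmpdir, "mini.json")
with open(config_path, "w") as f:
    json.dump(config, f)

fields = JsonConfigParser(MiniSchema).parseConfigFile(config_path)
assert fields.num_threads == 8
assert fields.greeting.format(user_name="Ada") == "Hello, Ada!"
assert (fields.shape == numpy.array([10, 20, 30])).all()
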
Example #5
###############################################################################
from lazyflow.utility.jsonConfig import JsonConfigParser, AutoEval, FormattedField

#: Schema for all cluster config options
#: (Doesn't specify which are required and which aren't.)
ClusterConfigFields = \
{
    "_schema_name" : "cluster-execution-configuration",
    "_schema_version" : 1.0,

    "workflow_type" : str,
    "output_slot_id" : str,

    "sys_tmp_dir" : str,
    "task_subrequest_shape" : dict, # Optional.  Output description sub_block_shape overrides this now.
    "task_parallel_subrequests" : AutoEval(int),
    "task_threadpool_size" : AutoEval(int),
    "task_timeout_secs" : AutoEval(int),
    "use_node_local_scratch" : bool,
    "use_master_local_scratch" : bool,
    "node_output_compression_cmd" :   FormattedField( requiredFields=["compressed_file", "uncompressed_file"]),
    "node_output_decompression_cmd" : FormattedField( requiredFields=["compressed_file", "uncompressed_file"]),
    "task_progress_update_command" : FormattedField( requiredFields=["progress"] ),
    "task_launch_server" : str,
    "output_log_directory" : str,
    "server_working_directory" : str,
    "command_format" : FormattedField( requiredFields=["task_args"], optionalFields=["task_name"] ),
    "debug_option_use_previous_node_files" : bool
}
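
A hedged sketch of what a config file for this schema might look like and how it could be parsed.  The values below are invented for illustration, and it is assumed (per the comment above) that fields omitted from the file simply parse as None rather than raising:

import os
import tempfile

# Hypothetical cluster configuration; values are illustrative only.
example_cluster_config = """
{
    "_schema_name" : "cluster-execution-configuration",
    "_schema_version" : 1.0,

    "workflow_type" : "PixelClassificationWorkflow",
    "output_slot_id" : "CachedPredictionProbabilities",
    "sys_tmp_dir" : "/scratch/tmp",
    "task_threadpool_size" : "4",
    "task_timeout_secs" : "60*60",
    "use_node_local_scratch" : true,
    "use_master_local_scratch" : false,
    "command_format" : "qsub -N {task_name} run_task.sh {task_args}"
}
"""

tmpdir = tempfile.mkdtemp()
config_path = os.path.join(tmpdir, "cluster_config.json")
with open(config_path, "w") as f:
    f.write(example_cluster_config)

parsed = JsonConfigParser(ClusterConfigFields).parseConfigFile(config_path)
assert parsed.task_timeout_secs == 3600   # AutoEval(int) evaluates "60*60"
assert parsed.use_node_local_scratch is True
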

Example #6
#
# See the LICENSE file for details. License information is also available
# on the ilastik web site at:
# 		   http://ilastik.org/license.html
###############################################################################
from lazyflow.utility.jsonConfig import JsonConfigParser, AutoEval, FormattedField

#: Schema for all cluster config options
#: (Doesn't specify which are required and which aren't.)
ClusterConfigFields = {
    "_schema_name": "cluster-execution-configuration",
    "_schema_version": 1.0,
    "workflow_type": str,
    "output_slot_id": str,
    "sys_tmp_dir": str,
    "task_threadpool_size": AutoEval(int),
    "task_total_ram_mb": AutoEval(int),
    "task_timeout_secs": AutoEval(int),
    "use_node_local_scratch": bool,
    "use_master_local_scratch": bool,
    "node_output_compression_cmd": FormattedField(requiredFields=["compressed_file", "uncompressed_file"]),
    "node_output_decompression_cmd": FormattedField(requiredFields=["compressed_file", "uncompressed_file"]),
    "task_progress_update_command": FormattedField(requiredFields=["progress"]),
    "task_launch_server": str,
    "output_log_directory": str,
    "server_working_directory": str,
    "command_format": FormattedField(requiredFields=["task_args"], optionalFields=["task_name"]),
    "debug_option_use_previous_node_files": bool,
}
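
Once parsed, the FormattedField entries behave like ordinary Python format strings with the declared placeholders.  A small sketch (the command strings below are hypothetical, not from any real cluster setup):

# Hypothetical values of the kind command_format and task_progress_update_command might hold.
command_format = "bsub -J {task_name} ./run_task.sh {task_args}"
progress_update_command = "curl http://master:8000/progress?value={progress}"

task_cmd = command_format.format(task_name="block_0_0_0",
                                 task_args="--roi [[0,0,0],[100,100,100]]")
progress_cmd = progress_update_command.format(progress=42)

print(task_cmd)      # bsub -J block_0_0_0 ./run_task.sh --roi [[0,0,0],[100,100,100]]
print(progress_cmd)  # curl http://master:8000/progress?value=42
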

Example #7
class TiledVolume(object):
    """
    Given a directory of image tiles that make up a volume, produces numpy array volumes for arbitrary roi requests.
    """
    #: These fields describe the schema of the description file.
    #: See the source code comments for a description of each field.
    DescriptionFields = \
    {
        "_schema_name" : "tiled-volume-description",
        "_schema_version" : 1.0,

        "name" : str,
        "format" : str,
        "dtype" : AutoEval(),
        "bounds_zyx" : AutoEval(numpy.array),
        "shape_zyx" : AutoEval(numpy.array), # synonym for bounds_zyx (until we support offset_origin)
        "resolution_zyx" : AutoEval(numpy.array),

        "tile_shape_2d_yx" : AutoEval(numpy.array),

        # This doesn't change how the data is read from the server,
        #  but instead specifies the indexing order of the numpy volumes produced.
        "output_axes" : str,

        "cache_tiles" : bool,

        # Offset not supported for now...
        #"origin_offset" : AutoEval(numpy.array),

        # For now, 3D-only, sliced across Z
        # TODO: support 5D.
        # Allow multiple url schemes: tiles might be addressed via pixel coordinates or row/column indexing
        # (z_index and z_start are synonyms here -- either is allowed)
        "tile_url_format" : FormattedField( requiredFields=[],
                                            optionalFields=["x_start", "y_start", "z_start",
                                                            "x_stop",  "y_stop",  "z_stop",
                                                            "x_index", "y_index", "z_index"] ),
        "extend_slices" : list
    }
    DescriptionSchema = JsonConfigParser(DescriptionFields)

    @classmethod
    def readDescription(cls, descriptionFilePath):
        # Read file
        description = TiledVolume.DescriptionSchema.parseConfigFile(
            descriptionFilePath)
        cls.updateDescription(description)
        return description

    @classmethod
    def updateDescription(cls, description):
        """
        Some description fields are optional.
        If they aren't provided in the description JSON file, then this function provides 
        them with default values, based on the other description fields.
        """
        # Augment with default parameters.
        logger.debug(str(description))

        # offset not supported yet...
        #if description.origin_offset is None:
        #    description.origin_offset = numpy.array( [0]*len(description.bounds_zyx) )
        #description.shape = description.bounds_zyx - description.origin_offset

        # for now, there's no difference between shape and bounds
        if description.shape_zyx is not None and description.bounds_zyx is not None:
            assert all(description.shape_zyx == description.bounds_zyx)
        if description.shape_zyx is None:
            description.shape_zyx = tuple(description.bounds_zyx)
        if description.bounds_zyx is None:
            description.bounds_zyx = tuple(description.shape_zyx)

        if not description.output_axes:
            description.output_axes = "zyx"
        assert description.output_axes is None or set(description.output_axes) == set("zyx"), \
            "Axis order must include x,y,z (and nothing else)"

        if not description.extend_slices:
            description.extend_slices = []

        if description.cache_tiles is None:
            description.cache_tiles = False

    def __init__(self, descriptionFilePath):
        self.description = TiledVolume.readDescription(descriptionFilePath)
        self._session = None

        assert self.description.format in vigra.impex.listExtensions().split(), \
            "Unknown tile format: {}".format( self.description.format )

        assert self.description.tile_shape_2d_yx.shape == (2, )
        assert self.description.bounds_zyx.shape == (3, )

        shape_dict = dict(zip('zyx', self.description.bounds_zyx))
        self.output_shape = tuple(shape_dict[k]
                                  for k in self.description.output_axes)

        self._slice_remapping = {}
        for source, destinations in self.description.extend_slices:
            for dest in destinations:
                self._slice_remapping[dest] = source

    def close(self):
        if self._session is not None:
            self._session.close()

    def read(self, roi, result_out):
        """
        roi: (start, stop) tuples, ordered according to description.output_axes
        """
        output_axes = self.description.output_axes
        roi_transposed = list(zip(*roi))
        roi_dict = dict(zip(output_axes, roi_transposed))
        roi = list(zip(*(roi_dict['z'], roi_dict['y'], roi_dict['x'])))

        # First, normalize roi and result to zyx order
        result_out = vigra.taggedView(result_out, output_axes)
        result_out = result_out.withAxes(*'zyx')

        assert numpy.array(roi).shape == (
            2, 3), "Invalid roi for 3D volume: {}".format(roi)
        roi = numpy.array(roi)
        assert (result_out.shape == (roi[1] - roi[0])).all()

        tile_blockshape = (1, ) + tuple(self.description.tile_shape_2d_yx)
        tile_starts = getIntersectingBlocks(tile_blockshape, roi)

        # We use a fresh tmp dir for each read to avoid conflicts between parallel reads
        tmpdir = tempfile.mkdtemp()

        pool = RequestPool()
        for tile_start in tile_starts:
            tile_roi_in = getBlockBounds(self.description.shape_zyx,
                                         tile_blockshape, tile_start)
            tile_roi_in = numpy.array(tile_roi_in)

            # This tile's portion of the roi
            intersecting_roi = getIntersection(roi, tile_roi_in)
            intersecting_roi = numpy.array(intersecting_roi)

            # Compute slicing within destination array and slicing within this tile
            destination_relative_intersection = numpy.subtract(
                intersecting_roi, roi[0])
            tile_relative_intersection = intersecting_roi - tile_roi_in[0]

            # Get a view to the output slice
            result_region = result_out[roiToSlice(
                *destination_relative_intersection)]

            # Special feature:
            # Some slices are missing, in which case we provide fake data from a different slice.
            # Overwrite the rest args to pull data from an alternate source tile.
            z_start = tile_roi_in[0][0]
            if z_start in self._slice_remapping:
                new_source_slice = self._slice_remapping[z_start]
                tile_roi_in[0][0] = new_source_slice
                tile_roi_in[1][0] = new_source_slice + 1

            # Use integer division so the indices stay integers for URL formatting.
            tile_index = numpy.array(tile_roi_in[0]) // tile_blockshape
            rest_args = {
                'z_start': tile_roi_in[0][0],
                'z_stop': tile_roi_in[1][0],
                'y_start': tile_roi_in[0][1],
                'y_stop': tile_roi_in[1][1],
                'x_start': tile_roi_in[0][2],
                'x_stop': tile_roi_in[1][2],
                'z_index': tile_index[0],
                'y_index': tile_index[1],
                'x_index': tile_index[2]
            }

            # Quick sanity check
            assert rest_args['z_index'] == rest_args['z_start']

            retrieval_fn = partial(self._retrieve_tile, tmpdir, rest_args,
                                   tile_relative_intersection, result_region)

            PARALLEL_REQ = True
            if PARALLEL_REQ:
                pool.add(Request(retrieval_fn))
            else:
                # execute serially (leave the pool empty)
                retrieval_fn()

        pool.wait()

        # Clean up our temp files.
        shutil.rmtree(tmpdir)

    # For late imports
    requests = None
    PIL = None

    TEST_MODE = False  # For testing purposes only. See below.

    def _retrieve_tile(self, tmpdir, rest_args, tile_relative_intersection,
                       data_out):
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        tile_url = self.description.tile_url_format.format(**rest_args)

        tmp_filename = 'z{z_start}_y{y_start}_x{x_start}'.format(**rest_args)
        tmp_filename += '.' + self.description.format
        tmp_filepath = os.path.join(tmpdir, tmp_filename)

        logger.debug("Retrieving {}".format(tile_url))
        try:
            if self._session is None:
                self._session = self._create_session()

            success = False
            tries = 0
            while not success:
                try:
                    r = self._session.get(tile_url)
                    success = True
                except requests.ConnectionError:
                    # We retry here because we keep running into exceptions like this:
                    #   ConnectionError: HTTPConnectionPool(host='neurocean.int.janelia.org', port=6081):
                    #   Max retries exceeded with url: /ssd-3-tiles/abd1.5/43/24_25_0.jpg
                    #   (Caused by <class 'httplib.BadStatusLine'>: '')
                    # So we loop a few times and only give up if something is really wrong.
                    if tries == 5:
                        raise  # give up
                    tries += 1
        except Exception:
            # During testing, the server we're pulling from might be in our own process.
            # Apparently that means that it is not very responsive, leading to exceptions.
            # As a cheap workaround, just try one more time.
            if self.TEST_MODE:
                import time
                time.sleep(0.01)
                r = self._session.get(tile_url)
            else:
                raise

        if r.status_code == requests.codes.not_found:
            logger.warn("NOTFOUND: {}".format(tile_url, tmp_filepath))
            data_out[:] = 0
        else:
            USE_PIL = True
            if USE_PIL:
                # late import
                if not TiledVolume.PIL:
                    import PIL
                    import PIL.Image
                    TiledVolume.PIL = PIL
                PIL = TiledVolume.PIL

                # r.content is bytes, so wrap it in BytesIO for PIL
                from io import BytesIO
                img = numpy.asarray(PIL.Image.open(BytesIO(r.content)))
                assert img.ndim == 2
                # img has axes yx; prepend a singleton z axis to get zyx
                img = img[None]
            else:
                logger.debug("saving to {}".format(tmp_filepath))
                with open(tmp_filepath, 'wb') as f:
                    CHUNK_SIZE = 10 * 1024
                    for chunk in r.iter_content(CHUNK_SIZE):
                        f.write(chunk)

                # Read the image from the disk with vigra
                img = vigra.impex.readImage(tmp_filepath, dtype='NATIVE')
                assert img.ndim == 3
                assert img.shape[-1] == 1

                # img has axes xyc, but we want zyx
                img = img.transpose()[None, 0, :, :]

            # Copy just the part we need into the destination array
            assert img[roiToSlice(
                *tile_relative_intersection)].shape == data_out.shape
            data_out[:] = img[roiToSlice(*tile_relative_intersection)]

    @classmethod
    def _create_session(cls):
        """
        Generate a requests.Session object to use for this TiledVolume.
        Using a session allows us to benefit from a connection pool 
          instead of establishing a new connection for every request.
        """
        # Late import
        if not TiledVolume.requests:
            import requests
            TiledVolume.requests = requests
        requests = TiledVolume.requests

        session = requests.Session()

        # Replace the session http adapters with ones that use larger connection pools
        n_threads = Request.global_thread_pool.num_workers
        adapter = requests.adapters.HTTPAdapter(pool_connections=n_threads,
                                                pool_maxsize=n_threads)
        adapter2 = requests.adapters.HTTPAdapter(pool_connections=n_threads,
                                                 pool_maxsize=n_threads)
        session.mount('http://', adapter)
        session.mount('https://', adapter2)
        return session
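
A hedged end-to-end sketch of using TiledVolume, assuming a hypothetical description file whose JSON follows DescriptionFields above (tile_url_format, bounds_zyx, tile_shape_2d_yx, ...) and a volume large enough for the requested ROI:

import numpy

volume = TiledVolume("/data/my_tiled_volume/description.json")  # hypothetical path

# Read one z-slice of a 256x256 region into a preallocated array.
# The roi and result axes follow description.output_axes ("zyx" by default).
roi = ((0, 0, 0), (1, 256, 256))
result = numpy.zeros((1, 256, 256), dtype=volume.description.dtype)
volume.read(roi, result)
volume.close()
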