Example #1
class _WeightsEntryBase(_BioImageIOSchema):
    authors = fields.List(
        fields.Nested(Author()),
        bioimageio_description="A list of authors. If this is the root weight (it does not have a `parent` field): the "
        "person(s) that have trained this model. If this is a child weight (it has a `parent` field): the person(s) "
        "who have converted the weights to this format.",
    )  # todo: copy root authors if missing
    attachments = fields.Dict(
        fields.String(),
        fields.List(fields.Union([fields.URI(), fields.Raw()])),
        bioimageio_description="Dictionary of text keys and list values (that may contain any valid yaml) to "
        "additional, relevant files that are specific to the current weight format. A list of URIs can be listed under"
        " the `files` key to included additional files for generating the model package.",
    )
    parent = fields.String(
        bioimageio_description="The source weights used as input for converting the weights to this format. For "
        "example, if the weights were converted from the format `pytorch_state_dict` to `pytorch_script`, the parent "
        "is `pytorch_state_dict`. All weight entries except one (the initial set of weights resulting from training "
        "the model), need to have this field."
    )
    sha256 = fields.String(
        validate=field_validators.Length(equal=64),
        bioimageio_description="SHA256 checksum of the source file specified. " + _common_sha256_hint,
    )
    source = fields.Union(
        [fields.URI(), fields.RelativeLocalPath()],
        required=True,
        bioimageio_description="URI or path to the weights file. Preferably a url.",
    )
    weights_format = fields.String(
        validate=field_validators.OneOf(get_args(raw_nodes.WeightsFormat)), required=True, load_only=True
    )

    @post_load
    def make_object(self, data, **kwargs):
        data.pop("weights_format")  # weights_format was only used to identify correct WeightsEntry schema
        return super().make_object(data, **kwargs)

    @pre_dump
    def raise_on_weights_format_mismatch(self, raw_node, **kwargs):
        """
        ensures to serialize a raw_nodes.<Special>WeightsEntry with the corresponding schema.<Special>WeightsEntry

        This check is required, because no validation is performed by marshmallow on serialization,
        which disables the Union field to select the appropriate nested schema for serialization.
        """
        if self.__class__.__name__ != raw_node.__class__.__name__:
            raise TypeError(f"Cannot serialize {raw_node} with {self}")

        return raw_node
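
A minimal usage sketch of the pre_dump guard above, assuming concrete subclasses such as OnnxWeightsEntry and KerasHdf5WeightsEntry exist in both this schema module and raw_nodes (as the naming scheme suggests):

entry = raw_nodes.OnnxWeightsEntry(source="weights.onnx")
OnnxWeightsEntry().dump(entry)       # class names match: serializes normally
KerasHdf5WeightsEntry().dump(entry)  # class names differ: raises TypeError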
Example #2
class ParametrizedInputShape(SharedBioImageIOSchema):
    min = fields.List(
        fields.Integer(), required=True, bioimageio_description="The minimum input shape with same length as `axes`"
    )
    step = fields.List(
        fields.Integer(), required=True, bioimageio_description="The minimum shape change with same length as `axes`"
    )

    @validates_schema
    def matching_lengths(self, data, **kwargs):
        min_ = data["min"]
        step = data["step"]
        if min_ is None or step is None:
            return

        if len(min_) != len(step):
            raise ValidationError(f"'min' and 'step' have to have the same length! (min: {min_}, step: {step})")
Example #3
class InputTensor(_TensorBase):
    shape = fields.Union(
        [
            fields.ExplicitShape(
                bioimageio_description=
                "Exact shape with same length as `axes`, e.g. `shape: [1, 512, 512, 1]`"
            ),
            fields.Nested(
                ParametrizedInputShape(),
                bioimageio_description=
                "A sequence of valid shapes given by `shape = min + k * step for k in {0, 1, ...}`.",
            ),
        ],
        required=True,
        bioimageio_description="Specification of input tensor shape.",
    )
    preprocessing = fields.List(
        fields.Nested(Preprocessing()),
        bioimageio_description=
        "Description of how this input should be preprocessed.")
    processing_name = "preprocessing"

    @validates_schema
    def zero_batch_step_and_one_batch_size(self, data, **kwargs):
        axes = data.get("axes")
        shape = data.get("shape")

        if axes is None or shape is None:
            raise ValidationError(
                "Failed to validate batch_step=0 and batch_size=1 due to other validation errors"
            )

        axes = data["axes"]
        shape = data["shape"]

        bidx = axes.find("b")
        if bidx == -1:
            return

        if isinstance(shape, raw_nodes.ParametrizedInputShape):
            step = shape.step
            shape = shape.min

        elif isinstance(shape, list):
            step = [0] * len(shape)
        else:
            raise ValidationError(f"Unknown shape type {type(shape)}")

        if step[bidx] != 0:
            raise ValidationError(
                "Input shape step has to be zero in the batch dimension (the batch dimension can always be "
                "increased, but `step` should specify how to increase the minimal shape to find the largest "
                "single batch shape)")

        if shape[bidx] != 1:
            raise ValidationError(
                "Input shape has to be 1 in the batch dimension b.")
Example #4
class OutputTensor(_TensorBase):
    shape = fields.Union(
        [
            fields.ExplicitShape(),
            fields.Nested(
                ImplicitOutputShape(),
                bioimageio_description=
                "In reference to the shape of an input tensor, the shape of the output "
                "tensor is `shape = shape(input_tensor) * scale + 2 * offset`.",
            ),
        ],
        required=True,
        bioimageio_description="Specification of output tensor shape.",
    )
    halo = fields.List(
        fields.Integer(),
        bioimageio_description=
        "The halo to crop from the output tensor (for example to crop away boundary effects or "
        "for tiling). The halo should be cropped from both sides, i.e. `shape_after_crop = shape - 2 * halo`. The "
        "`halo` is not cropped by the bioimage.io model, but is left to be cropped by the consumer software. Use "
        "`shape:offset` if the model output itself is cropped and input and output shapes not fixed.",
    )
    postprocessing = fields.List(
        fields.Nested(Postprocessing()),
        bioimageio_description=
        "Description of how this output should be postprocessed.",
    )
    processing_name = "postprocessing"

    @validates_schema
    def matching_halo_length(self, data, **kwargs):
        shape = data.get("shape")
        halo = data.get("halo")
        if halo is None:
            return
        elif isinstance(shape, (list, raw_nodes.ImplicitOutputShape)):
            if len(halo) != len(shape):
                raise ValidationError(
                    f"halo {halo} has to have same length as shape {shape}!")
        else:
            raise NotImplementedError(type(shape))
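
The halo semantics above imply consumer-side cropping; a sketch of that computation (illustrative helper, not part of the spec package):

def crop_halo(shape, halo):
    """Shape after cropping `halo` from both sides of each axis."""
    return [s - 2 * h for s, h in zip(shape, halo)]

crop_halo([1, 512, 512, 1], [0, 32, 32, 0])  # -> [1, 448, 448, 1]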
Example #5
class ImplicitOutputShape(SharedBioImageIOSchema):
    reference_tensor = fields.String(required=True, bioimageio_description="Name of the reference tensor.")
    scale = fields.List(
        fields.Float(), required=True, bioimageio_description="'output_pix/input_pix' for each dimension."
    )
    offset = fields.List(
        fields.Float(), required=True, bioimageio_description="Position of origin wrt to input. Multiple of 0.5."
    )

    @validates_schema
    def matching_lengths(self, data, **kwargs):
        scale = data["scale"]
        offset = data["offset"]
        if len(scale) != len(offset):
            raise ValidationError(f"scale {scale} has to have same length as offset {offset}!")

    @validates("offset")
    def double_offset_is_int(self, value: List[float]):
        for v in value:
            if 2 * v != int(2 * v):
                raise ValidationError(f"offset {v} in {value} not a multiple of 0.5!")
Example #6
class OutputTensor(_TensorBase):
    shape = fields.Union(
        [
            fields.ExplicitShape(),
            fields.Nested(
                ImplicitOutputShape(),
                bioimageio_description="In reference to the shape of an input tensor, the shape of the output "
                "tensor is `shape = shape(input_tensor) * scale + 2 * offset`.",
            ),
        ],
        required=True,
        bioimageio_description="Specification of output tensor shape.",
    )
    halo = fields.List(
        fields.Integer(),
        bioimageio_description="Hint to describe the potentially corrupted edge region of the output tensor, due to "
        "boundary effects. "
        "The `halo` is not cropped by the bioimage.io model, but is left to be cropped by the consumer software. "
        f"An example implementation of prediction with tiling, accounting for the halo can be found [here]("
        f"{get_ref_url('function', '_predict_with_tiling_impl', 'https://github.com/bioimage-io/core-bioimage-io-python/blob/main/bioimageio/core/prediction.py')}). "
        "Use `shape:offset` if the model output itself is cropped and input and output shapes not fixed. ",
    )
    postprocessing = fields.List(
        fields.Nested(Postprocessing()),
        bioimageio_description="Description of how this output should be postprocessed.",
    )
    processing_name = "postprocessing"

    @validates_schema
    def matching_halo_length(self, data, **kwargs):
        shape = data["shape"]
        halo = data.get("halo")
        if halo is None:
            return
        elif isinstance(shape, (list, raw_nodes.ImplicitOutputShape)):
            if len(halo) != len(shape):
                raise ValidationError(f"halo {halo} has to have same length as shape {shape}!")
        else:
            raise NotImplementedError(type(shape))
Example #7
class Collection(_BioImageIOSchema, WithUnknown, RDF):
    bioimageio_description = f"""# BioImage.IO Collection Resource Description File Specification {get_args(raw_nodes.FormatVersion)[-1]}
This specification defines the fields used in a BioImage.IO-compliant resource description file (`RDF`) for describing collections of other resources.
These fields are typically stored in a YAML file which we call Collection Resource Description File or `collection RDF`.

The collection RDF YAML file contains mandatory and optional fields. In the following description, optional fields are indicated by _optional_.
_optional*_ with an asterisk indicates the field is optional depending on the value in another field.
"""
    collection = fields.List(
        fields.Nested(CollectionEntry()),
        bioimageio_description=
        "Collection entries. Each entry needs to specify a valid RDF with an id. "
        "Each collection entry RDF is based on the collection RDF itself, "
        "updated by rdf_source content if rdf_source is specified, "
        "and updated by any fields specified directly in the entry. "
        "In this context 'update' refers to overwriting RDF root fields by name."
        "Except for the `id` field, which appends to the collection RDF `id` "
        "such that full_collection_entry_id=<collection_id>/<entry_id>",
        required=True,
    )

    @validates("collection")
    def unique_ids(self, value: List[Union[dict, raw_nodes.CollectionEntry]]):
        ids = [(v.get("id", missing),
                v.get("rdf_source", missing)) if isinstance(v, dict) else
               (v.rdf_update.get("id", missing), v.rdf_source) for v in value]
        # skip check for id only specified in remote source
        ids = [
            vid for vid, vs in ids
            if not (vid is missing and vs is not missing)
        ]

        if missing in ids:
            raise ValidationError("Missing ids in collection entries")

        non_string_ids = [v for v in ids if not isinstance(v, str)]
        if non_string_ids:
            raise ValidationError(f"Non-string ids in collection: {non_string_ids}")

        seen = set()
        duplicates = []
        for v in ids:
            if v in seen:
                duplicates.append(v)
            else:
                seen.add(v)

        if duplicates:
            raise ValidationError(f"Duplicate ids in collection: {duplicates}")
Example #8
def test_cite_field_option1():
    """only way we allow to specify listed, nested schemas.

    Limitation to allow better exception and warning messages and make the code in general more concise.
    """
    from bioimageio.spec.rdf.schema import CiteEntry

    data = [{
        "text": "Title",
        "doi": "https://doi.org/10.1109/5.771073",
        "url": "https://ieeexplore.ieee.org/document/771073",
    }] * 2

    cite_field = fields.List(fields.Nested(CiteEntry()), required=True)
    cite_field.deserialize(data)
Example #9
class Model(rdf.schema.RDF):
    raw_nodes = raw_nodes

    class Meta:
        unknown = RAISE

    bioimageio_description = f"""# BioImage.IO Model Resource Description File Specification {get_args(raw_nodes.FormatVersion)[-1]}
This specification defines the fields used in a BioImage.IO-compliant resource description file (`RDF`) for describing AI models with pretrained weights.
These fields are typically stored in YAML files which we call Model Resource Description Files or `model RDF`.
The model RDFs can be downloaded or uploaded to the bioimage.io website, produced or consumed by BioImage.IO-compatible consumers (e.g. image analysis software or other websites).

The model RDF YAML file contains mandatory and optional fields. In the following description, optional fields are indicated by _optional_.
_optional*_ with an asterisk indicates the field is optional depending on the value in another field.
"""
    # todo: unify authors with RDF (optional or required?)
    authors = fields.List(
        fields.Nested(Author()), required=True, bioimageio_description=rdf.schema.RDF.authors_bioimageio_description
    )

    badges = missing_
    cite = fields.List(
        fields.Nested(CiteEntry()),
        required=True,  # todo: unify authors with RDF (optional or required?)
        bioimageio_description=rdf.schema.RDF.cite_bioimageio_description,
    )

    documentation = fields.Union(
        [
            fields.URL(),
            fields.RelativeLocalPath(
                validate=field_validators.Attribute(
                    "suffix",
                    field_validators.Equal(
                        ".md", error="{!r} is invalid; expected markdown file with '.md' extension."
                    ),
                )
            ),
        ],
        required=True,
        bioimageio_description="Relative path to file with additional documentation in markdown. This means: 1) only "
        "relative file path is allowed 2) the file must be in markdown format with `.md` file name extension 3) URL is "
        "not allowed. It is recommended to use `README.md` as the documentation name.",
    )

    download_url = missing_

    dependencies = fields.Dependencies(  # todo: add validation (0.4.0?)
        bioimageio_description="Dependency manager and dependency file, specified as `<dependency manager>:<relative "
        "path to file>`. For example: 'conda:./environment.yaml', 'maven:./pom.xml', or 'pip:./requirements.txt'"
    )

    format_version = fields.String(
        validate=field_validators.OneOf(get_args_flat(raw_nodes.FormatVersion)),
        required=True,
        bioimageio_description_order=0,
        bioimageio_description=f"""Version of the BioImage.IO Model Resource Description File Specification used.
This is mandatory, and important for the consumer software to verify before parsing the fields.
The recommended behavior for the implementation is to keep backward compatibility and throw an error if the model yaml
is in an unsupported format version. The current format version described here is
{get_args(raw_nodes.FormatVersion)[-1]}""",
    )

    framework = fields.String(
        validate=field_validators.OneOf(get_args(raw_nodes.Framework)),
        bioimageio_description=f"The deep learning framework of the source code. One of: "
        f"{', '.join(get_args(raw_nodes.Framework))}. This field is only required if the field `source` is present.",
    )

    git_repo = fields.String(
        validate=field_validators.URL(schemes=["http", "https"]),
        bioimageio_description=rdf.schema.RDF.git_repo_bioimageio_description
        + "If the model is contained in a subfolder of a git repository, then a url to the exact folder"
        + "(which contains the configuration yaml file) should be used.",
    )

    icon = missing_

    kwargs = fields.Kwargs(
        bioimageio_description="Keyword arguments for the implementation specified by `source`. "
        "This field is only required if the field `source` is present."
    )

    language = fields.String(
        validate=field_validators.OneOf(get_args(raw_nodes.Language)),
        bioimageio_maybe_required=True,
        bioimageio_description=f"Programming language of the source code. One of: "
        f"{', '.join(get_args(raw_nodes.Language))}. This field is only required if the field `source` is present.",
    )

    license = fields.String(
        required=True,  # todo: unify license with RDF (optional or required?)
        bioimageio_description=rdf.schema.RDF.license_bioimageio_description,
    )

    name = fields.String(
        # validate=field_validators.Length(max=36),  # todo: enforce in future version (0.4.0?)
        required=True,
        bioimageio_description="Name of this model. It should be human-readable and only contain letters, numbers, "
        "underscore '_',  minus '-' or spaces and not be longer than 36 characters.",
    )

    packaged_by = fields.List(
        fields.Nested(Author()),
        bioimageio_description=f"The persons that have packaged and uploaded this model. Only needs to be specified if "
        f"different from `authors` in root or any entry in `weights`.",
    )

    parent = fields.Nested(
        ModelParent(),
        bioimageio_description="Parent model from which the trained weights of this model have been derived, e.g. by "
        "finetuning the weights of this model on a different dataset. For format changes of the same trained model "
        "checkpoint, see `weights`.",
    )

    run_mode = fields.Nested(
        RunMode(),
        bioimageio_description="Custom run mode for this model: for more complex prediction procedures like test time "
        "data augmentation that currently cannot be expressed in the specification. "
        "No standard run modes are defined yet.",
    )

    sha256 = fields.String(
        validate=field_validators.Length(equal=64),
        bioimageio_description="SHA256 checksum of the model source code file."
        + _common_sha256_hint
        + " This field is only required if the field source is present.",
    )

    source = fields.ImportableSource(
        bioimageio_maybe_required=True,
        bioimageio_description="Language and framework specific implementation. As some weights contain the model "
        "architecture, the source is optional depending on the present weight formats. `source` can either point to a "
        "local implementation: `<relative path to file>:<identifier of implementation within the source file>` or the "
        "implementation in an available dependency: `<root-dependency>.<sub-dependency>.<identifier>`.\nFor example: "
        "`my_function.py:MyImplementation` or `core_library.some_module.some_function`.",
    )

    timestamp = fields.DateTime(
        required=True,
        bioimageio_description="Timestamp of the initial creation of this model in [ISO 8601]"
        "(#https://en.wikipedia.org/wiki/ISO_8601) format.",
    )

    weights = fields.Dict(
        fields.String(
            validate=field_validators.OneOf(get_args(raw_nodes.WeightsFormat)),
            required=True,
            bioimageio_description=f"Format of this set of weights. Weight formats can define additional (optional or "
            f"required) fields. See [weight_formats_spec_0_3.md]"
            f"(https://github.com/bioimage-io/spec-bioimage-io/blob/gh-pages/weight_formats_spec_0_3.md). "
            f"One of: {', '.join(get_args(raw_nodes.WeightsFormat))}",
        ),
        fields.Union([fields.Nested(we()) for we in get_args(WeightsEntry)]),
        required=True,
        bioimageio_description="The weights for this model. Weights can be given for different formats, but should "
        "otherwise be equivalent. The available weight formats determine which consumers can use this model.",
    )

    @pre_load
    def add_weights_format_key_to_weights_entry_value(self, data: dict, many=False, partial=False, **kwargs):
        data = deepcopy(data)  # Schema.validate() calls pre_load methods, thus we should not modify the input data
        if many or partial:
            raise NotImplementedError

        for weights_format, weights_entry in data.get("weights", {}).items():
            if "weights_format" in weights_entry:
                raise ValidationError(f"Got unexpected key 'weights_format' in weights entry {weights_format}")

            weights_entry["weights_format"] = weights_format

        return data
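
    # A sketch of the transformation performed by the pre_load hook above
    # (illustrative data): it turns
    #     {"weights": {"onnx": {"source": "weights.onnx"}}}
    # into
    #     {"weights": {"onnx": {"source": "weights.onnx", "weights_format": "onnx"}}}
    # so that the Union field can pick the matching WeightsEntry schema; the
    # key is popped again in make_object (see Example #1).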

    inputs = fields.List(
        fields.Nested(InputTensor()), bioimageio_description="Describes the input tensors expected by this model."
    )
    outputs = fields.List(
        fields.Nested(OutputTensor()), bioimageio_description="Describes the output tensors from this model."
    )

    test_inputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        required=True,
        bioimageio_description="List of URIs or local relative paths to test inputs as described in inputs for "
        "**a single test case**. "
        "This means if your model has more than one input, you should provide one URI for each input. "
        "Each test input should be a file with a ndarray in "
        "[numpy.lib file format](https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html#module-numpy.lib.format)."
        "The extension must be '.npy'.",
    )
    test_outputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        required=True,
        bioimageio_description="Analog to to test_inputs.",
    )

    sample_inputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        bioimageio_description="List of URIs/local relative paths to sample inputs to illustrate possible inputs for "
        "the model, for example stored as png or tif images. "
        "The model is not tested with these sample files that serve to inform a human user about an example use case.",
    )
    sample_outputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        bioimageio_description="List of URIs/local relative paths to sample outputs corresponding to the "
        "`sample_inputs`.",
    )

    config = fields.YamlDict(
        bioimageio_description=rdf.schema.RDF.config_bioimageio_description
        + """

    For example:
    ```yaml
    config:
      # custom config for DeepImageJ, see https://github.com/bioimage-io/configuration/issues/23
      deepimagej:
        model_keys:
          # In principle the tag "SERVING" is used in almost every tf model
          model_tag: tf.saved_model.tag_constants.SERVING
          # Signature definition to call the model. Again "SERVING" is the most general
          signature_definition: tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        test_information:
          input_size: [2048x2048] # Size of the input images
          output_size: [1264x1264] # Size of all the outputs
          device: cpu # Device used. In principle either cpu or GPU
          memory_peak: 257.7 Mb # Maximum memory consumed by the model in the device
          runtime: 78.8s # Time it took to run the model
          pixel_size: [9.658E-4µmx9.658E-4µm] # Size of the pixels of the input
    ```
"""
    )

    @validates_schema
    def language_and_framework_match(self, data, **kwargs):
        field_names = ("language", "framework")
        valid_combinations = [
            ("python", "scikit-learn"),  # todo: remove
            ("python", "pytorch"),
            ("python", "tensorflow"),
            ("java", "tensorflow"),
        ]
        if "source" not in data:
            valid_combinations.append((missing_, missing_))
            valid_combinations.append(("python", missing_))
            valid_combinations.append(("java", missing_))

        combination = tuple(data.get(name, missing_) for name in field_names)
        if combination not in valid_combinations:
            raise ValidationError(f"invalid combination of {dict(zip(field_names, combination))}")

    @validates_schema
    def source_specified_if_required(self, data, **kwargs):
        if "source" in data:
            return

        weights_format_requires_source = {
            "pytorch_state_dict": True,
            "pytorch_script": False,
            "keras_hdf5": False,
            "tensorflow_js": False,
            "tensorflow_saved_model_bundle": False,
            "onnx": False,
        }
        require_source = {wf for wf in data["weights"] if weights_format_requires_source[wf]}
        if require_source:
            raise ValidationError(
                f"These specified weight formats require source code to be specified: {require_source}"
            )
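
    # Example: a model shipping only `pytorch_state_dict` weights must specify
    # `source` (a state dict does not contain the model architecture), while a
    # model shipping only `onnx` weights needs no `source`.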

    @validates_schema
    def validate_reference_tensor_names(self, data, **kwargs):
        valid_input_tensor_references = [ipt.name for ipt in data.get("inputs", [])]
        for out in data.get("outputs", []):
            if out.postprocessing is missing_:
                continue

            for postpr in out.postprocessing:
                if postpr.kwargs is missing_:
                    continue

                ref_tensor = postpr.kwargs.get("reference_tensor", missing_)
                if ref_tensor is not missing_ and ref_tensor not in valid_input_tensor_references:
                    raise ValidationError(f"{ref_tensor} not found in inputs")

    @validates_schema
    def weights_entries_match_weights_formats(self, data, **kwargs):
        weights: typing.Dict[str, _WeightsEntryBase] = data["weights"]
        for weights_format, weights_entry in weights.items():
            if weights_format in ["keras_hdf5", "tensorflow_js", "tensorflow_saved_model_bundle"]:
                assert isinstance(
                    weights_entry,
                    (
                        raw_nodes.KerasHdf5WeightsEntry,
                        raw_nodes.TensorflowJsWeightsEntry,
                        raw_nodes.TensorflowSavedModelBundleWeightsEntry,
                    ),
                )
                if weights_entry.tensorflow_version is missing_:
                    # todo: raise ValidationError (allow -> require)?
                    warnings.warn(f"missing 'tensorflow_version' entry for weights format {weights_format}")

            if weights_format == "onnx":
                assert isinstance(weights_entry, raw_nodes.OnnxWeightsEntry)
                if weights_entry.opset_version is missing_:
                    # todo: raise ValidationError?
                    warnings.warn(f"missing 'opset_version' entry for weights format {weights_format}")
Example #10
class RDF(_BioImageIOSchema):
    class Meta:
        unknown = EXCLUDE

    bioimageio_description = f"""# BioImage.IO Resource Description File Specification {get_args(FormatVersion)[-1]}
This specification defines the fields used in a general BioImage.IO-compliant resource description file (`RDF`).
An RDF is stored as a YAML file and describes resources such as models, datasets, applications and notebooks. 
Note that models are described with an extended Model RDF specification.

The RDF contains mandatory and optional fields. In the following description, optional fields are indicated by 
_optional_. _optional*_ with an asterisk indicates the field is optional depending on the value in another field.
If no specialized RDF exists for the specified type (like model RDF for type='model') additional fields may be 
specified.
"""

    attachments = fields.Nested(
        Attachments(),
        bioimageio_description="Additional unknown keys are allowed.")

    authors_bioimageio_description = (
        "A list of authors. The authors are the creators of the specifications and the primary points of contact."
    )
    authors = fields.List(
        fields.Nested(Author()),
        bioimageio_description=authors_bioimageio_description)

    badges = fields.List(fields.Nested(Badge()),
                         bioimageio_description="a list of badges")

    cite_bioimageio_description = """A list of citation entries.
Each entry contains a mandatory `text` field and either one or both of `doi` and `url`.
E.g. the citation for the model architecture and/or the training data used."""
    cite = fields.List(fields.Nested(CiteEntry()),
                       bioimageio_description=cite_bioimageio_description)

    config_bioimageio_description = (
        "A custom configuration field that can contain any keys not present in the RDF spec. "
        "This means you should not store, for example, github repo URL in `config` since we already have the "
        "`git_repo` key defined in the spec.\n"
        "Keys in `config` may be very specific to a tool or consumer software. To avoid conflicted definitions, "
        "it is recommended to wrap configuration into a sub-field named with the specific domain or tool name, "
        """for example:

```yaml
   config:
      bioimage_io:  # here is the domain name
        my_custom_key: 3837283
        another_key:
           nested: value
      imagej:
        macro_dir: /path/to/macro/file
```
"""
        "If possible, please use [`snake_case`](https://en.wikipedia.org/wiki/Snake_case) for keys in `config`."
    )
    config = fields.YamlDict(
        bioimageio_description=config_bioimageio_description)

    covers = fields.List(
        fields.Union([fields.URL(), fields.RelativeLocalPath()]),
        bioimageio_description=
        "A list of cover images provided by either a relative path to the model folder, or a "
        "hyperlink starting with 'http[s]'. Please use an image smaller than 500KB and an aspect ratio width to height "
        "of 2:1. The supported image formats are: 'jpg', 'png', 'gif'.",  # todo: field_validators image format
    )

    description = fields.String(
        required=True,
        bioimageio_description="A string containing a brief description.")

    documentation = fields.Union(
        [
            fields.URL(),
            fields.RelativeLocalPath(validate=field_validators.Attribute(
                "suffix",
                field_validators.Equal(
                    ".md",
                    error=
                    "{!r} is invalid; expected markdown file with '.md' extension."
                ),
            )),
        ],
        bioimageio_description=
        "URL or relative path to markdown file with additional documentation. "
        "For markdown files the recommended documentation file name is `README.md`.",
    )

    download_url = fields.URL(
        bioimageio_description="optional url to download the resource from")

    format_version = fields.String(
        required=True,
        bioimageio_description_order=0,
        bioimageio_description=
        ("Version of the BioImage.IO Resource Description File Specification used."
         f"The current general format version described here is {get_args(FormatVersion)[-1]}. "
         "Note: The general RDF format is not to be confused with specialized RDF format like the Model RDF format."
         ),
    )

    @validates_schema
    def format_version_matches_type(self, data, **kwargs):
        format_version = data.get("format_version")
        type_ = data.get("type")
        try:
            patched_format_version = get_patched_format_version(
                type_, format_version)
            if format_version.split(".") > patched_format_version.split("."):
                raise ValueError(
                    f"Unknown format_version {format_version} (latest patch: {patched_format_version}; latest format version: )"
                )
        except Exception as e:
            raise ValidationError(
                f"Invalid format_version {format_version} for RDF type {type_}. (error: {e})"
            )

    git_repo_bioimageio_description = "A url to the git repository, e.g. to Github or Gitlab."
    git_repo = fields.URL(
        bioimageio_description=git_repo_bioimageio_description)

    icon = fields.String(
        bioimageio_description="an icon for the resource"
    )  # todo: limit length? validate=field_validators.Length(max=1)

    id = fields.String(
        bioimageio_description="Unique id within a collection of resources.")
    license_bioimageio_description = (
        "A [SPDX license identifier](https://spdx.org/licenses/)(e.g. `CC-BY-4.0`, `MIT`, "
        "`BSD-2-Clause`). We don't support custom license beyond the SPDX license list, if you need that please send "
        "an Github issue to discuss your intentions with the community.")
    license = fields.String(  # todo: make mandatory?
        # validate=field_validators.OneOf(LICENSES),  # enforce license id
        bioimageio_description=license_bioimageio_description)

    @validates("license")
    def warn_about_deprecated_spdx_license(self, value: str):
        license_info = LICENSES.get(value)
        if license_info is None:
            self.warn(
                "license",
                f"{value} is not a recognized SPDX license identifier. See https://spdx.org/licenses/"
            )
        else:
            if license_info.get("isDeprecatedLicenseId", False):
                self.warn("license",
                          f"{value} ({license_info['name']}) is deprecated.")

            if not license_info.get("isFsfLibre", False):
                self.warn(
                    "license",
                    f"{value} ({license_info['name']}) is not FSF Free/libre.")

    links = fields.List(
        fields.String(),
        bioimageio_description="links to other bioimage.io resources")

    maintainers = fields.List(
        fields.Nested(Maintainer()),
        bioimageio_description="Maintainers of this resource.")

    name = fields.String(
        required=True,
        bioimageio_description="name of the resource, a human-friendly name")

    @validates("name")
    def warn_about_long_name(self, value: str):
        if isinstance(value, str):
            if len(value) > 64:
                self.warn(
                    "name",
                    f"Length of name ({len(value)}) exceeds the recommended maximum length of 64 characters."
                )
        else:
            self.warn("name", f"Could not check length of name {value}.")

    rdf_source = fields.Union(
        [fields.URL(), fields.DOI()],
        bioimageio_description=
        "url or doi to the source of the resource definition")
    source = fields.Union(
        [fields.URI(), fields.RelativeLocalPath()],
        bioimageio_description=
        "url or local relative path to the source of the resource",
    )

    tags = fields.List(fields.String(),
                       bioimageio_description="A list of tags.")

    @validates("tags")
    def warn_about_tag_categories(self, value):
        if BIOIMAGEIO_SITE_CONFIG is None:
            error = BIOIMAGEIO_SITE_CONFIG_ERROR
        else:
            missing_categories = []
            try:
                categories = {
                    c["type"]: c.get("tag_categories", {})
                    for c in BIOIMAGEIO_SITE_CONFIG["resource_categories"]
                }.get(self.__class__.__name__.lower(), {})
                for cat, entries in categories.items():
                    if not any(e in value for e in entries):
                        missing_categories.append({cat: entries})
            except Exception as e:
                error = str(e)
            else:
                error = None
                if missing_categories:
                    self.warn(
                        "tags",
                        f"Missing tags for categories: {missing_categories}")

        if error is not None:
            self.warn("tags", f"could not check tag categories ({error})")

    type = fields.String(required=True)

    # todo: restrict valid RDF types?
    @validates("type")
    def validate_type(self, value):
        schema_type = self.__class__.__name__.lower()
        if value != schema_type:
            self.warn(
                "type",
                f"Unrecognized type '{value}'. Validating as {schema_type}.")

    version = fields.Version(
        bioimageio_description=
        "The version number of the model. The version number format must be a string in "
        "`MAJOR.MINOR.PATCH` format following the guidelines in Semantic Versioning 2.0.0 (see https://semver.org/), "
        "e.g. the initial version number should be `0.1.0`.")
Example #11
class Attachments(_BioImageIOSchema, WithUnknown):
    files = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        bioimageio_description=
        "File attachments; included when packaging the resource.",
    )
Example #12
class Model(rdf.schema.RDF):
    raw_nodes = raw_nodes

    class Meta:
        unknown = RAISE
        exclude = ("source",
                   )  # while RDF does have a source field, Model does not

    bioimageio_description = f"""# BioImage.IO Model Resource Description File Specification {get_args(raw_nodes.FormatVersion)[-1]}
This specification defines the fields used in a BioImage.IO-compliant resource description file (`RDF`) for describing AI models with pretrained weights.
These fields are typically stored in YAML files which we call Model Resource Description Files or `model RDF`.
The model RDFs can be downloaded or uploaded to the bioimage.io website, produced or consumed by BioImage.IO-compatible consumers (e.g. image analysis software or other websites).

The model RDF YAML file contains mandatory and optional fields. In the following description, optional fields are indicated by _optional_.
_optional*_ with an asterisk indicates the field is optional depending on the value in another field.
"""
    # todo: sync authors with RDF
    authors = fields.List(
        fields.Nested(rdf.schema.Author()),
        validate=field_validators.Length(min=1),
        required=True,
        bioimageio_description=rdf.schema.RDF.authors_bioimageio_description,
    )

    badges = missing  # todo: allow badges for Model (RDF has it)
    cite = fields.List(
        fields.Nested(rdf.schema.CiteEntry()),
        required=True,  # todo: unify authors with RDF (optional or required?)
        validate=field_validators.Length(min=1),
        bioimageio_description=rdf.schema.RDF.cite_bioimageio_description,
    )

    config = fields.YamlDict(
        bioimageio_description=rdf.schema.RDF.config_bioimageio_description +
        """
For example:
```yaml
config:
  # custom config for DeepImageJ, see https://github.com/bioimage-io/configuration/issues/23
  deepimagej:
    model_keys:
      # In principle the tag "SERVING" is used in almost every tf model
      model_tag: tf.saved_model.tag_constants.SERVING
      # Signature definition to call the model. Again "SERVING" is the most general
      signature_definition: tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    test_information:
      input_size: [2048x2048] # Size of the input images
      output_size: [1264x1264] # Size of all the outputs
      device: cpu # Device used. In principle either cpu or GPU
      memory_peak: 257.7 Mb # Maximum memory consumed by the model in the device
      runtime: 78.8s # Time it took to run the model
      pixel_size: [9.658E-4µmx9.658E-4µm] # Size of the pixels of the input
```
""")

    documentation = fields.Union(
        [
            fields.URL(),
            fields.RelativeLocalPath(validate=field_validators.Attribute(
                "suffix",
                field_validators.Equal(
                    ".md",
                    error=
                    "{!r} is invalid; expected markdown file with '.md' extension."
                ),
            )),
        ],
        required=True,
        bioimageio_description=
        "Relative path or URL to file with additional documentation in markdown. "
        "The file must be in markdown format with `.md` file name extension"
        "It is recommended to use `README.md` as the documentation name.",
    )

    format_version = fields.String(
        validate=field_validators.OneOf(get_args_flat(
            raw_nodes.FormatVersion)),
        required=True,
        bioimageio_description_order=0,
        bioimageio_description=
        f"""Version of the BioImage.IO Model Resource Description File Specification used.
This is mandatory, and important for the consumer software to verify before parsing the fields.
The recommended behavior for the implementation is to keep backward compatibility and throw an error if the model yaml
is in an unsupported format version. The current format version described here is
{get_args(raw_nodes.FormatVersion)[-1]}""",
    )

    git_repo = fields.URL(
        bioimageio_description=rdf.schema.RDF.git_repo_bioimageio_description +
        "If the model is contained in a subfolder of a git repository, then a url to the exact folder"
        + "(which contains the configuration yaml file) should be used.")

    inputs = fields.List(
        fields.Nested(InputTensor()),
        validate=field_validators.Length(min=1),
        required=True,
        bioimageio_description=
        "Describes the input tensors expected by this model.",
    )

    @validates("inputs")
    def no_duplicate_input_tensor_names(
            self, value: typing.List[raw_nodes.InputTensor]):
        if not isinstance(value, list) or not all(
                isinstance(v, raw_nodes.InputTensor) for v in value):
            raise ValidationError(
                "Could not check for duplicate input tensor names due to another validation error."
            )

        names = [t.name for t in value]
        if len(names) > len(set(names)):
            raise ValidationError(
                "Duplicate input tensor names are not allowed.")

    license = fields.String(
        validate=field_validators.OneOf(LICENSES),
        required=True,
        bioimageio_description=rdf.schema.RDF.license_bioimageio_description,
    )

    name = fields.String(
        required=True,
        bioimageio_description=
        "Name of this model. It should be human-readable and only contain letters, numbers, "
        "underscore '_', minus '-' or spaces and not be longer than 64 characters.",
    )

    outputs = fields.List(
        fields.Nested(OutputTensor()),
        validate=field_validators.Length(min=1),
        bioimageio_description="Describes the output tensors from this model.",
    )

    @validates("outputs")
    def no_duplicate_output_tensor_names(
            self, value: typing.List[raw_nodes.OutputTensor]):
        if not isinstance(value, list) or not all(
                isinstance(v, raw_nodes.OutputTensor) for v in value):
            raise ValidationError(
                "Could not check for duplicate output tensor names due to another validation error."
            )

        names = [t["name"] if isinstance(t, dict) else t.name for t in value]
        if len(names) > len(set(names)):
            raise ValidationError(
                "Duplicate output tensor names are not allowed.")

    @validates_schema
    def inputs_and_outputs(self, data, **kwargs):
        ipts: typing.List[raw_nodes.InputTensor] = data.get("inputs")
        outs: typing.List[raw_nodes.OutputTensor] = data.get("outputs")
        if (
                not isinstance(ipts, list)
                or not isinstance(outs, list)
                or not all(isinstance(v, raw_nodes.InputTensor) for v in ipts)
                or not all(isinstance(v, raw_nodes.OutputTensor) for v in outs)
        ):
            raise ValidationError(
                "Could not check for duplicate tensor names due to another validation error."
            )

        # no duplicate tensor names
        names = [t.name for t in ipts + outs]  # type: ignore
        if len(names) > len(set(names)):
            raise ValidationError("Duplicate tensor names are not allowed.")

        tensors_by_name: typing.Dict[
            str, typing.Union[raw_nodes.InputTensor, raw_nodes.OutputTensor]
        ] = {t.name: t for t in ipts + outs}  # type: ignore

        # minimum shape leads to valid output:
        # output with subtracted halo has to result in meaningful output even for the minimal input
        # see https://github.com/bioimage-io/spec-bioimage-io/issues/392
        def get_min_shape(t) -> numpy.ndarray:
            if isinstance(t.shape, raw_nodes.ParametrizedInputShape):
                shape = numpy.array(t.shape.min)
            elif isinstance(t.shape, raw_nodes.ImplicitOutputShape):
                ref_shape = get_min_shape(tensors_by_name[t.shape.reference_tensor])
                shape = ref_shape * t.shape.scale + 2 * numpy.array(t.shape.offset)
            else:
                shape = numpy.array(t.shape)

            return shape

        for out in outs:
            if isinstance(out.shape, raw_nodes.ImplicitOutputShape):
                ref_shape = tensors_by_name[out.shape.reference_tensor].shape
                if len(out.shape) != len(ref_shape):
                    raise ValidationError(
                        f"Referenced tensor {out.shape.reference_tensor} "
                        f"with {len(ref_shape)} dimensions does not match "
                        f"output tensor {out.name} with {len(out.shape)} dimensions."
                    )

            min_out_shape = get_min_shape(out)
            if out.halo:
                halo = out.halo
                halo_msg = f" for halo {out.halo}"
            else:
                halo = [0] * len(min_out_shape)
                halo_msg = ""

            if any([s - 2 * h < 1 for s, h in zip(min_out_shape, halo)]):
                raise ValidationError(
                    f"Minimal shape {min_out_shape} of output {out.name} is too small{halo_msg}."
                )
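
    # Worked example for the check above (illustrative numbers): an output with
    # an implicit shape referencing an input whose minimal shape is
    # [1, 32, 32, 1], with scale [1.0, 0.5, 0.5, 1.0] and offset [0, 0, 0, 0],
    # has minimal shape [1, 16, 16, 1]; a halo of [0, 8, 8, 0] would leave
    # 16 - 2 * 8 = 0 pixels on the spatial axes, which this validator rejects.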

    packaged_by = fields.List(
        fields.Nested(rdf.schema.Author()),
        bioimageio_description=
        f"The persons that have packaged and uploaded this model. Only needs to be specified if "
        f"different from `authors` in root or any entry in `weights`.",
    )

    parent = fields.Nested(
        ModelParent(),
        bioimageio_description=
        "The model from which this model is derived, e.g. by fine-tuning the weights.",
    )

    run_mode = fields.Nested(
        RunMode(),
        bioimageio_description=
        "Custom run mode for this model: for more complex prediction procedures like test time "
        "data augmentation that currently cannot be expressed in the specification. "
        "No standard run modes are defined yet.",
    )

    sample_inputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        validate=field_validators.Length(min=1),
        bioimageio_description=
        "List of URIs/local relative paths to sample inputs to illustrate possible inputs for "
        "the model, for example stored as png or tif images. "
        "The model is not tested with these sample files that serve to inform a human user about an example use case.",
    )
    sample_outputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        validate=field_validators.Length(min=1),
        bioimageio_description=
        "List of URIs/local relative paths to sample outputs corresponding to the "
        "`sample_inputs`.",
    )

    test_inputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        validate=field_validators.Length(min=1),
        required=True,
        bioimageio_description=
        "List of URIs or local relative paths to test inputs as described in inputs for "
        "**a single test case**. "
        "This means if your model has more than one input, you should provide one URI for each input."
        "Each test input should be a file with a ndarray in "
        "[numpy.lib file format](https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html#module-numpy.lib.format)."
        "The extension must be '.npy'.",
    )
    test_outputs = fields.List(
        fields.Union([fields.URI(), fields.RelativeLocalPath()]),
        validate=field_validators.Length(min=1),
        required=True,
        bioimageio_description="Analog to test_inputs.",
    )

    timestamp = fields.DateTime(
        required=True,
        bioimageio_description=
        "Timestamp of the initial creation of this model in [ISO 8601]"
        "(#https://en.wikipedia.org/wiki/ISO_8601) format.",
    )

    training_data = fields.Union(
        [fields.Nested(Dataset()),
         fields.Nested(LinkedDataset())])

    weights = fields.Dict(
        fields.String(
            validate=field_validators.OneOf(get_args(raw_nodes.WeightsFormat)),
            required=True,
            bioimageio_description="Format of this set of weights. "
            f"One of: {', '.join(get_args(raw_nodes.WeightsFormat))}",
        ),
        fields.Union(
            [fields.Nested(we()) for we in get_args(WeightsEntry)],
            short_bioimageio_description=
            ("The weights for this model. Weights can be given for different formats, but should "
             "otherwise be equivalent. "
             "See [weight_formats_spec_0_4.md]"
             "(https://github.com/bioimage-io/spec-bioimage-io/blob/gh-pages/weight_formats_spec_0_4.md) "
             "for the required and optional fields per weight format. "
             "The available weight formats determine which consumers can use this model."
             ),
        ),
        required=True,
    )

    @pre_load
    def add_weights_format_key_to_weights_entry_value(self,
                                                      data: dict,
                                                      many=False,
                                                      partial=False,
                                                      **kwargs):
        # Schema.validate() calls pre_load methods, thus we should not modify the input data
        data = deepcopy(data)
        if many or partial:
            raise NotImplementedError

        for weights_format, weights_entry in data.get("weights", {}).items():
            if "weights_format" in weights_entry:
                raise ValidationError(
                    f"Got unexpected key 'weights_format' in weights entry {weights_format}"
                )

            weights_entry["weights_format"] = weights_format

        return data

    @validates_schema
    def validate_reference_tensor_names(self, data, **kwargs):
        def get_tnames(tname: str):
            return [
                t.get("name") if isinstance(t, dict) else t.name
                for t in data.get(tname, [])
            ]

        valid_input_tensor_references = get_tnames("inputs")
        ins = data.get("inputs", [])
        outs = data.get("outputs", [])
        if not isinstance(ins, list) or not isinstance(outs, list):
            raise ValidationError(
                "Failed to validate reference tensor names due to other validation errors in inputs/outputs."
            )

        for t in outs:
            if not isinstance(t, raw_nodes.OutputTensor):
                raise ValidationError(
                    "Failed to validate reference tensor names due to validation errors in outputs"
                )

            if t.postprocessing is missing:
                continue

            for postpr in t.postprocessing:
                if postpr.kwargs is missing:
                    continue

                ref_tensor = postpr.kwargs.get("reference_tensor", missing)
                if ref_tensor is not missing and ref_tensor not in valid_input_tensor_references:
                    raise ValidationError(f"{ref_tensor} not found in inputs")

        for t in ins:
            if not isinstance(t, raw_nodes.InputTensor):
                raise ValidationError(
                    "Failed to validate reference tensor names due to validation errors in inputs"
                )

            if t.preprocessing is missing:
                continue

            for prep in t.preprocessing:
                if prep.kwargs is missing:
                    continue

                ref_tensor = prep.kwargs.get("reference_tensor", missing)
                if ref_tensor is not missing and ref_tensor not in valid_input_tensor_references:
                    raise ValidationError(f"{ref_tensor} not found in inputs")

                if ref_tensor == t.name:
                    raise ValidationError(
                        f"invalid self reference for preprocessing of tensor {t.name}"
                    )

    @validates_schema
    def weights_entries_match_weights_formats(self, data, **kwargs):
        weights: typing.Dict[str, WeightsEntry] = data.get("weights", {})
        for weights_format, weights_entry in weights.items():
            if not isinstance(weights_entry, get_args(raw_nodes.WeightsEntry)):
                raise ValidationError(
                    "Cannot validate keys in weights field due to other validation errors."
                )

            if weights_format in ["pytorch_state_dict", "torchscript"]:
                if weights_format == "pytorch_state_dict":
                    assert isinstance(weights_entry,
                                      raw_nodes.PytorchStateDictWeightsEntry)
                elif weights_format == "torchscript":
                    assert isinstance(weights_entry,
                                      raw_nodes.TorchscriptWeightsEntry)
                else:
                    raise NotImplementedError

                if weights_entry.dependencies is missing and weights_entry.pytorch_version is missing:
                    self.warn(f"weights:{weights_format}",
                              "missing 'pytorch_version'")

            if weights_format in [
                    "keras_hdf5", "tensorflow_js",
                    "tensorflow_saved_model_bundle"
            ]:
                if weights_format == "keras_hdf5":
                    assert isinstance(weights_entry,
                                      raw_nodes.KerasHdf5WeightsEntry)
                elif weights_format == "tensorflow_js":
                    assert isinstance(weights_entry,
                                      raw_nodes.TensorflowJsWeightsEntry)
                elif weights_format == "tensorflow_saved_model_bundle":
                    assert isinstance(
                        weights_entry,
                        raw_nodes.TensorflowSavedModelBundleWeightsEntry)
                else:
                    raise NotImplementedError

                if weights_entry.dependencies is missing and weights_entry.tensorflow_version is missing:
                    self.warn(f"weights:{weights_format}",
                              "missing 'tensorflow_version'")

            if weights_format == "onnx":
                assert isinstance(weights_entry, raw_nodes.OnnxWeightsEntry)
                if weights_entry.dependencies is missing and weights_entry.opset_version is missing:
                    self.warn(f"weights:{weights_format}",
                              "missing 'opset_version'")