コード例 #1
0
ファイル: metadata.py プロジェクト: nf-osi/schematic
class MetadataModel(object):
    """Metadata model wrapper around schema.org specification graph.

    Provides basic utilities to:

    1) manipulate the metadata model
    2) generate metadata model views:
        - generate manifest view of the metadata model
        - generate validation schema view of the metadata model
    """

    def __init__(self,
                 inputMModelLocation: str,
                 inputMModelLocationType: str,
                 ) -> None:
        """Instantiates a MetadataModel object.

        Both arguments are validated before the schema is loaded, so an
        unsupported input is rejected without building a SchemaGenerator.

        Args:
            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)

        Raises:
            TypeError: inputMModelLocation does not end in '.jsonld'.
            ValueError: inputMModelLocationType is anything other than 'local'.
        """
        # the location must carry a real '.jsonld' extension; a bare name such
        # as 'jsonld' (no dot) is not a JSON-LD file path
        if not inputMModelLocation.endswith(".jsonld"):
            raise TypeError(f"Please make sure {inputMModelLocation} is a .jsonld file.")

        # currently, the application only supports reading from local JSON-LD files
        if inputMModelLocationType != "local":
            raise ValueError(f"The type '{inputMModelLocationType}' is currently not supported.")

        logger.debug(f"Initializing SchemaGenerator object from {inputMModelLocation} schema.")
        self.inputMModelLocation = inputMModelLocation
        self.inputMModelLocationType = inputMModelLocationType

        self.sg = SchemaGenerator(inputMModelLocation)

    # business logic: expose metadata model "views" depending on "controller" logic
    # (somewhat analogous to Model View Controller pattern for GUI/web applications)
    # i.e. jsonschemas, annotation manifests, metadata/annotation dictionary web explorer
    # are all "views" of the metadata model.
    # The "business logic" in this MetadataModel class provides functions exposing relevant parts
    # of the metadata model needed so that these views can be generated by user facing components;
    # controller components are (loosely speaking) responsible for handling the interaction between views and the model
    # some of these components right now reside in the Bundle class

    def getModelSubgraph(self, rootNode: str,
                         subgraphType: str) -> nx.DiGraph:
        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

        NOTE: this method is a placeholder; it has no implementation yet and
        currently returns None.

        Args:
            rootNode: a schema node label (i.e. term).
            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).

        Returns:
            Intended: a directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
        """
        pass

    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

        The descendants are requested from the schema generator with
        ``connected=True, ordered=True`` and the resulting list is reversed
        before it is returned.

        Args:
            rootNode: a schema object/node label (i.e. term)
            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)

        Returns:
            An ordered list of objects, that are all descendants of rootNode.
        """
        ordered_nodes = self.sg.get_descendants_by_edge_type(rootNode, relationshipType, connected=True, ordered=True)

        ordered_nodes.reverse()

        return ordered_nodes

    def getModelManifest(self, title: str, rootNode: str, jsonSchema: str = None, filenames: list = None) -> str:
        """Gets data from the annotations manifest file.

        TBD: Does this method belong here or in manifest generator?

        Args:
            title: title handed to the manifest generator.
            rootNode: a schema node label (i.e. term).
            jsonSchema: optional validation schema; forwarded to the manifest generator only when provided.
            filenames: optional list of file names; when provided it is passed
                to the generator as the "Filename" entry of the additional metadata.

        Returns:
            A manifest URI (assume Google doc for now).
        """
        additionalMetadata = {"Filename": filenames} if filenames else {}

        mg = ManifestGenerator(path_to_json_ld=self.inputMModelLocation,
                               title=title,
                               root=rootNode,
                               additional_metadata=additionalMetadata)

        if jsonSchema:
            return mg.get_manifest(json_schema=jsonSchema)

        return mg.get_manifest()

    def get_component_requirements(self, source_component: str) -> List[str]:
        """Given a source model component (see https://w3id.org/biolink/vocab/category for definition of component), return all components required by it.
        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
        Can be utilized to track metadata completion progress across multiple categories of attributes.

        Args:
            source_component: an attribute label indicating the source component.

        Returns:
            A list of required components associated with the source component.
        """
        # the schema generator owns the dependency logic; simply delegate
        return self.sg.get_component_requirements(source_component)

    # TODO: abstract validation in its own module
    def validateModelManifest(self, manifestPath: str, rootNode: str, jsonSchema: str = None) -> List[list]:
        """Check if provided annotations manifest dataframe satisfies all model requirements.

        Args:
            manifestPath: a path to the manifest csv file containing annotations.
            rootNode: a schema node label (i.e. term).
            jsonSchema: validation schema to use; when omitted, one is generated
                for rootNode by the schema generator.

        Returns:
            A list of error records, empty when the manifest is valid. Each record is
            ``[row, column, message, value]`` where ``row`` is the record's index in
            the manifest plus 2.
            If the 'Component' column does not match rootNode, only the component
            mismatch records are returned and schema validation is skipped.
        """
        # get validation schema for a given node in the data model, if the user has not provided input validation schema
        if not jsonSchema:
            jsonSchema = self.sg.get_json_schema_requirements(rootNode, rootNode + "_validation")

        errors = []

        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = pd.read_csv(manifestPath)    # read manifest csv file as is from manifest path
        manifest = trim_commas_df(manifest).fillna("")  # apply cleaning logic as part of pre-processing step

        # handler for mismatched components/data types:
        # report every row whose "Component" value differs from the selected template type
        if "Component" in manifest.columns:
            components = manifest["Component"].unique()

            # an empty manifest has no component values and therefore nothing to mismatch
            if len(components) > 1 or (len(components) == 1 and components[0] != rootNode):
                logger.error(f"The 'Component' column value(s) {components} do not match the "
                             f"selected template type '{rootNode}'.")

                # 'Component' values of all rows where 'Component' is NOT rootNode
                mismatched_ser = manifest.loc[manifest["Component"] != rootNode, "Component"]
                for index, component in mismatched_ser.items():
                    errors.append([
                        index + 2,
                        'Component',
                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",

                        # tuple of the component in the manifest and selected template type
                        # check: R/Reticulate cannot handle dicts? So returning tuple
                        (component, rootNode)
                    ])

                return errors

        # check if each of the provided annotation columns has validation rule 'list'
        # if so, assume annotation for this column are comma separated list of multi-value annotations
        # convert multi-valued annotations to list
        for col in manifest.columns:

            # convert manifest values to string and remove trailing/leading whitespaces
            # TODO: when validation handles annotation types as validation rules
            # would have to avoid converting everything to string
            manifest[col] = manifest[col].astype(str).str.strip()

            # if the validation rule is set to list, convert items in the
            # annotations manifest to a list and strip each value from leading/trailing spaces
            if "list" in self.sg.get_node_validation_rules(col):
                manifest[col] = manifest[col].apply(lambda x: [s.strip() for s in str(x).split(",")])

        annotations = json.loads(manifest.to_json(orient='records'))

        # the schema is the same for every row, so build the validator once
        validator = Draft7Validator(jsonSchema)
        for i, annotation in enumerate(annotations):
            for error in sorted(validator.iter_errors(annotation), key=exceptions.relevance):
                errorRow = i + 2
                # an error with an empty path is not tied to a single column
                errorCol = error.path[-1] if len(error.path) > 0 else "Wrong schema"
                errorMsg = error.message[0:500]
                errorVal = error.instance if len(error.path) > 0 else "Wrong schema"

                errors.append([errorRow, errorCol, errorMsg, errorVal])

        return errors

    def populateModelManifest(self, title, manifestPath: str, rootNode: str) -> str:
        """Populate an existing annotations manifest based on a dataframe.

        An empty manifest is generated first and then populated from the
        manifest file at manifestPath.

        Args:
            title: title handed to the manifest generator.
            manifestPath: a path to the manifest csv file containing annotations.
            rootNode: a schema node label (i.e. term).

        Returns:
            A link to the filled in model manifest (e.g. google sheet).
        """
        mg = ManifestGenerator(path_to_json_ld=self.inputMModelLocation,
                               title=title,
                               root=rootNode)

        emptyManifestURL = mg.get_manifest()

        return mg.populate_manifest_spreadsheet(manifestPath, emptyManifestURL)

    def submit_metadata_manifest(self, manifest_path: str, dataset_id: str, validate_component: str = None) -> bool:
        """Wrap methods that are responsible for validation of manifests for a given component, and association of the
        same manifest file with a specified dataset.

        Args:
            manifest_path: Path to the manifest file, which contains the metadata.
            dataset_id: Synapse ID of the dataset on Synapse containing the metadata manifest file.
            validate_component: Component from the schema.org schema based on which the manifest template has been generated.
                When None, the manifest is associated without being validated.

        Returns:
            True: If association (and validation, when requested) was successful.

        Raises:
            ValueError: When validate_component is provided, but it cannot be found in the schema.
            ValidationError: If validation against data model was not successful.
        """
        syn_store = SynapseStorage()

        # no need to perform validation, just submit/associate the metadata manifest file
        if validate_component is None:
            syn_store.associateMetadataWithFiles(metadataManifestPath=manifest_path, datasetId=dataset_id)

            logger.debug("Optional validation was not performed on manifest before association.")

            return True

        try:
            # check if the component ("class" in schema) passed as argument is valid (present in schema) or not
            self.sg.se.is_class_in_schema(validate_component)
        except Exception as err:
            # the lookup failure (a KeyError, per the original author's note) is replaced
            # with a more descriptive ValueError; the cause is chained for debugging and
            # KeyboardInterrupt/SystemExit are no longer swallowed
            raise ValueError("The component {} could not be found "
                             "in the schema.".format(validate_component)) from err

        # automatic JSON schema generation and validation with that JSON schema
        val_errors = self.validateModelManifest(manifestPath=manifest_path, rootNode=validate_component)

        if val_errors:
            raise ValidationError("Manifest could not be validated under provided data model. "
                                  f"Validation failed with the following errors: {val_errors}")

        # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
        syn_store.associateMetadataWithFiles(metadataManifestPath=manifest_path, datasetId=dataset_id)

        logger.info("No validation errors occured during validation.")
        return True
コード例 #2
0
class MetadataModel(object):
    """Metadata model wrapper around schema.org specification graph.

    Provides basic utilities to:

    1) manipulate the metadata model
    2) generate metadata model views:
        - generate manifest view of the metadata model
        - generate validation schema view of the metadata model
    """
    def __init__(
        self,
        inputMModelLocation: str,
        inputMModelLocationType: str,
    ) -> None:
        """Instantiates a MetadataModel object.

        Both arguments are validated before the schema is loaded, so an
        unsupported input is rejected without building a SchemaGenerator.

        Args:
            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)

        Raises:
            TypeError: inputMModelLocation does not end in '.jsonld'.
            ValueError: inputMModelLocationType is anything other than 'local'.
        """
        # the location must carry a real '.jsonld' extension; a bare name such
        # as 'jsonld' (no dot) is not a JSON-LD file path
        if not inputMModelLocation.endswith(".jsonld"):
            raise TypeError(
                f"Please make sure {inputMModelLocation} is a .jsonld file.")

        # currently, the application only supports reading from local JSON-LD files
        if inputMModelLocationType != "local":
            raise ValueError(
                f"The type '{inputMModelLocationType}' is currently not supported."
            )

        logger.debug(
            f"Initializing SchemaGenerator object from {inputMModelLocation} schema."
        )
        self.inputMModelLocation = inputMModelLocation
        self.inputMModelLocationType = inputMModelLocationType

        self.sg = SchemaGenerator(inputMModelLocation)

    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

        NOTE: this method is a placeholder; it has no implementation yet and
        currently returns None.

        Args:
            rootNode: a schema node label (i.e. term).
            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).

        Returns:
            Intended: a directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
        """
        pass

    def getOrderedModelNodes(self, rootNode: str,
                             relationshipType: str) -> List[str]:
        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

        The descendants are requested from the schema generator with
        ``connected=True, ordered=True`` and the resulting list is reversed
        before it is returned.

        Args:
            rootNode: a schema object/node label (i.e. term)
            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)

        Returns:
            An ordered list of objects, that are all descendants of rootNode.
        """
        ordered_nodes = self.sg.get_descendants_by_edge_type(rootNode,
                                                             relationshipType,
                                                             connected=True,
                                                             ordered=True)

        ordered_nodes.reverse()

        return ordered_nodes

    def getModelManifest(
        self,
        title: str,
        rootNode: str,
        datasetId: str = None,
        jsonSchema: str = None,
        filenames: list = None,
        useAnnotations: bool = False,
        sheetUrl: bool = True,
    ) -> str:
        """Gets data from the annotations manifest file.

        TBD: Does this method belong here or in manifest generator?

        Args:
            title: title handed to the manifest generator.
            rootNode: a schema node label (i.e. term).
            datasetId: optional dataset id; forwarded to the manifest generator when provided.
            jsonSchema: optional validation schema; forwarded to the manifest generator
                when provided, with or without a dataset id.
            filenames: optional list of file names; when provided it is passed
                to the generator as the "Filename" entry of the additional metadata.
            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
            sheetUrl: forwarded to the manifest generator as ``sheet_url``.

        Returns:
            A manifest URI (assume Google doc for now).
        """
        additionalMetadata = {"Filename": filenames} if filenames else {}

        mg = ManifestGenerator(
            path_to_json_ld=self.inputMModelLocation,
            title=title,
            root=rootNode,
            additional_metadata=additionalMetadata,
            use_annotations=useAnnotations,
        )

        if datasetId:
            return mg.get_manifest(dataset_id=datasetId,
                                   json_schema=jsonSchema,
                                   sheet_url=sheetUrl)

        # a caller-supplied schema must not be dropped just because no
        # dataset was given
        if jsonSchema:
            return mg.get_manifest(json_schema=jsonSchema,
                                   sheet_url=sheetUrl)

        return mg.get_manifest(sheet_url=sheetUrl)

    def get_component_requirements(self,
                                   source_component: str,
                                   as_graph: bool = False) -> List:
        """Given a source model component (see https://w3id.org/biolink/vocab/category for definition of component), return all components required by it.
        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
        Can be utilized to track metadata completion progress across multiple categories of attributes.

        Args:
            source_component: an attribute label indicating the source component.
            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)

        Returns:
            When as_graph is False, a list of required components associated with the source component.
            When as_graph is True, the edge list (node tuples) of the component dependency graph.
        """
        if as_graph:
            # retrieve components as graph and serialize the dependency DAG
            # to an edge list of node tuples
            req_components_graph = self.sg.get_component_requirements_graph(
                source_component)

            return list(req_components_graph.edges())

        # get required components for the input/source component
        return self.sg.get_component_requirements(source_component)

    # TODO: abstract validation in its own module
    def validateModelManifest(
        self,
        manifestPath: str,
        rootNode: str,
        restrict_rules: bool = False,
        jsonSchema: str = None,
        project_scope: List = None,
    ) -> tuple:
        """Check if provided annotations manifest dataframe satisfies all model requirements.

        Args:
            manifestPath: a path to the manifest csv file containing annotations.
            rootNode: a schema node label (i.e. term).
            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
            jsonSchema: validation schema to use; when omitted, one is generated
                for rootNode by the schema generator.
            project_scope: forwarded unchanged to ``validate_all``.

        Returns:
            A tuple ``(errors, warnings)`` of two lists.
            If the 'Component' column does not match rootNode, ``errors`` holds one
            ``[row, 'Component', message, (component, rootNode)]`` record per
            mismatching row (``row`` is the record's index plus 2), ``warnings`` is
            empty and the remaining validation is skipped. Otherwise both lists are
            whatever ``validate_all`` produces.
        """
        # get validation schema for a given node in the data model, if the user has not provided input validation schema
        if not jsonSchema:
            jsonSchema = self.sg.get_json_schema_requirements(
                rootNode, rootNode + "_validation")

        errors = []
        warnings = []

        load_args = {"dtype": "string"}
        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = load_df(
            manifestPath,
            preserve_raw_input=False,
            **load_args,
        )  # read manifest csv file as is from manifest path

        # handler for mismatched components/data types:
        # report every row whose "Component" value differs from the selected template type
        if "Component" in manifest.columns:
            components = manifest["Component"].unique()

            # an empty manifest has no component values and therefore nothing to mismatch
            if len(components) > 1 or (len(components) == 1
                                       and components[0] != rootNode):
                logger.error(
                    f"The 'Component' column value(s) {components} do not match the "
                    f"selected template type '{rootNode}'.")

                # row indexes for all rows where 'Component' is NOT rootNode
                row_idxs = manifest.index[
                    manifest["Component"] != rootNode].tolist()
                # column index value for the 'Component' column
                col_idx = manifest.columns.get_loc("Component")
                # Series with index and 'Component' values from manifest
                mismatched_ser = manifest.iloc[row_idxs, col_idx]
                for index, component in mismatched_ser.items():
                    errors.append([
                        index + 2,
                        "Component",
                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
                        # tuple of the component in the manifest and selected template type
                        # check: R/Reticulate cannot handle dicts? So returning tuple
                        (component, rootNode),
                    ])

                return errors, warnings

        errors, warnings, manifest = validate_all(self, errors, warnings,
                                                  manifest, manifestPath,
                                                  self.sg, jsonSchema,
                                                  restrict_rules,
                                                  project_scope)
        return errors, warnings

    def populateModelManifest(self, title, manifestPath: str,
                              rootNode: str) -> str:
        """Populate an existing annotations manifest based on a dataframe.
            TODO: Remove this method; always use getModelManifest instead

        An empty manifest is generated first and then populated from the
        manifest file at manifestPath.

        Args:
            title: title handed to the manifest generator.
            manifestPath: a path to the manifest csv file containing annotations.
            rootNode: a schema node label (i.e. term).

        Returns:
            A link to the filled in model manifest (e.g. google sheet).
        """
        mg = ManifestGenerator(path_to_json_ld=self.inputMModelLocation,
                               title=title,
                               root=rootNode)

        emptyManifestURL = mg.get_manifest()

        return mg.populate_manifest_spreadsheet(manifestPath, emptyManifestURL)

    def submit_metadata_manifest(
        self,
        manifest_path: str,
        path_to_json_ld: str,
        dataset_id: str,
        manifest_record_type: str,
        restrict_rules: bool,
        validate_component: str = None,
        use_schema_label: bool = True,
        hide_blanks: bool = False,
        input_token: str = None,
        project_scope: List = None,
    ) -> str:
        """Wrap methods that are responsible for validation of manifests for a given component, and association of the
        same manifest file with a specified dataset.

        If a sibling file named like the manifest but ending in '_censored.csv'
        exists, it is associated first and the main manifest is then associated
        with ``restrict_manifest=True``.

        Args:
            manifest_path: Path to the manifest file, which contains the metadata.
            path_to_json_ld: not used by this method; kept so existing callers keep working.
            dataset_id: Synapse ID of the dataset on Synapse containing the metadata manifest file.
            manifest_record_type: forwarded to the store's ``associateMetadataWithFiles``.
            restrict_rules: forwarded to ``validateModelManifest``.
            validate_component: Component from the schema.org schema based on which the manifest template has been generated.
                When None, the manifest is associated without being validated.
            use_schema_label: forwarded to the store as ``useSchemaLabel``.
            hide_blanks: forwarded to the store as ``hideBlanks``.
            input_token: forwarded to the ``SynapseStorage`` constructor.
            project_scope: forwarded to ``validateModelManifest``.

        Returns:
            Manifest ID: the value returned by the store for the main manifest, if
            association (and validation, when requested) was successful.

        Raises:
            ValueError: When validate_component is provided, but it cannot be found in the schema.
            ValidationError: If validation against data model was not successful.
        """

        #TODO: avoid explicitly exposing Synapse store functionality
        # just instantiate a Store class and let it decide at runtime/config
        # the store type
        syn_store = SynapseStorage(input_token=input_token)

        validation_requested = validate_component is not None

        # check if user wants to perform validation or not
        if validation_requested:

            try:
                # check if the component ("class" in schema) passed as argument is valid (present in schema) or not
                self.sg.se.is_class_in_schema(validate_component)
            except Exception as err:
                # the lookup failure (a KeyError, per the original author's note) is replaced
                # with a more descriptive ValueError; the cause is chained for debugging and
                # KeyboardInterrupt/SystemExit are no longer swallowed
                raise ValueError("The component {} could not be found "
                                 "in the schema.".format(validate_component)) from err

            # automatic JSON schema generation and validation with that JSON schema
            val_errors, _ = self.validateModelManifest(
                manifestPath=manifest_path,
                rootNode=validate_component,
                restrict_rules=restrict_rules,
                project_scope=project_scope,
            )

            if val_errors:
                raise ValidationError(
                    "Manifest could not be validated under provided data model. "
                    f"Validation failed with the following errors: {val_errors}"
                )

        # arguments shared by every association call below
        association_args = dict(
            schemaGenerator=self.sg,
            datasetId=dataset_id,
            manifest_record_type=manifest_record_type,
            useSchemaLabel=use_schema_label,
            hideBlanks=hide_blanks,
        )

        # the existence check is made only now, after any validation has run
        censored_manifest_path = manifest_path.replace('.csv', '_censored.csv')
        restrict_manifest = False

        # when manifest_path contains no '.csv' the replacement is a no-op; without
        # the inequality guard the manifest itself would be uploaded as its own
        # censored copy and then flagged as restricted
        if censored_manifest_path != manifest_path and exists(
                censored_manifest_path):
            syn_store.associateMetadataWithFiles(
                metadataManifestPath=censored_manifest_path,
                **association_args,
            )
            restrict_manifest = True

        # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
        manifest_id = syn_store.associateMetadataWithFiles(
            metadataManifestPath=manifest_path,
            restrict_manifest=restrict_manifest,
            **association_args,
        )

        if validation_requested:
            logger.info("No validation errors occured during validation.")
        else:
            logger.debug(
                "Optional validation was not performed on manifest before association."
            )

        return manifest_id