Example #1
    def __init__(self,
                inputMModelLocation: str,
                inputMModelLocationType: str,
                ) -> None:

        """Instantiates a MetadataModel object.

        Args:
            inputMModelLocation: local path, URI, or Synapse entity ID pointing to the current location of the metadata model (e.g. /User/x/…, gs://…, syn123)
            inputMModelLocationType: specifier indicating where the metadata model resource can be found (e.g. 'local' if the JSON-LD file is on the local machine)
        """
        # extract the extension of 'inputMModelLocation'
        # and ensure that it points to a '.jsonld' file
        if inputMModelLocation.rpartition('.')[-1] == "jsonld":
            logger.debug(f"Initializing SchemaGenerator object from {inputMModelLocation} schema.")
            self.inputMModelLocation = inputMModelLocation

            self.sg = SchemaGenerator(inputMModelLocation)
        else:
            raise TypeError(f"Please make sure {inputMModelLocation} is a .jsonld file.")

        # check if the type of MModel file is "local"
        # currently, the application only supports reading from local JSON-LD files
        if inputMModelLocationType == "local":
            self.inputMModelLocationType = inputMModelLocationType
        else:
            raise ValueError(f"The type '{inputMModelLocationType}' is currently not supported.")
Example #2
    def __init__(
        self,
        path_to_json_ld: str,  # JSON-LD file to be used for generating the manifest
        title: str = None,  # manifest sheet title
        root: str = None,
        additional_metadata: Dict = None,
        oauth: bool = True,
        use_annotations: bool = False,
    ) -> None:
        """TODO: read in a config file instead of hardcoding paths to credential files...
        """

        if oauth:
            # if user wants to use OAuth for Google authentication
            # use credentials.json and create token.pickle file
            services_creds = build_credentials()
        else:
            # if not oauth then use service account credentials
            services_creds = build_service_account_creds()

        # google service for Sheet API
        self.sheet_service = services_creds["sheet_service"]

        # google service for Drive API
        self.drive_service = services_creds["drive_service"]

        # google service credentials object
        self.creds = services_creds["creds"]

        # schema root
        self.root = root

        # manifest title
        self.title = title
        if self.title is None:
            self.title = f"{self.root} - Manifest"

        # Whether to use existing annotations during manifest generation
        self.use_annotations = use_annotations

        # Warn about limited feature support for `use_annotations`
        if self.use_annotations:
            logger.warning(
                "The `use_annotations` option is currently only supported "
                "when there is no manifest file for the dataset in question.")

        # SchemaGenerator() object
        self.sg = SchemaGenerator(path_to_json_ld)

        # additional metadata to add to manifest
        self.additional_metadata = additional_metadata

        # Determine whether current data type is file-based
        is_file_based = False
        if self.root:
            is_file_based = "Filename" in self.sg.get_node_dependencies(
                self.root)
        self.is_file_based = is_file_based
def main():
    args = get_args()
    schemas = []

    # get all required data types from data model jsonld
    sg = SchemaGenerator(path_to_json_ld=args.jsonld_path)
    component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
    components = component_digraph.nodes()

    # save display_name, schema_name, assay type to list
    for schema in components:
        cp = bool(re.search(r'clinical|biospecimen', schema, re.IGNORECASE))
        assay_type = 'record' if cp else 'file'
        schemas.append({
            'display_name': unCamel(schema),
            'schema_name': schema,
            'type': assay_type
        })

    # write out the config.json including some versions
    config = {
        'manifest_schemas': schemas,
        'community': args.project_name,
        'schema-version': args.schema_version,
        'model-version': args.model_version
    }
    with open(f'{args.out_dir}/config.json', 'w') as o:
        o.write(json.dumps(config, indent=2, separators=(',', ': ')))
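
# A hedged sketch of the resulting config.json, assuming a hypothetical model with a
# record-based 'Biospecimen' component and a file-based 'ScRNAseq' component
# (display names come from unCamel, so the exact spacing may differ):
#
# {
#   "manifest_schemas": [
#     {"display_name": "Biospecimen", "schema_name": "Biospecimen", "type": "record"},
#     {"display_name": "Sc RNA seq", "schema_name": "ScRNAseq", "type": "file"}
#   ],
#   "community": "example-project",
#   "schema-version": "1.0.0",
#   "model-version": "1.0.0"
# }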
    def get_message_level(
        val_rule: str,
        sg: SchemaGenerator,
        attribute_name: str,
    ) -> str:
        """
        Purpose:
            Determine whether an error or warning message should be logged and displayed

            Types of error/warning included:
                - recommended - Raised when an attribute is empty and recommended but not required.
                - unique - Raised when attribute values are not unique.
                - protectAges - Raised when an attribute contains ages below 18YO or over 90YO that should be censored.
        Input:
                val_rule: str, defined in the schema.
                sg: schemaGenerator object
                attribute_name: str, attribute being validated
        Returns:
            'error' or 'warning'
        """

        rule_parts = val_rule.split(" ")

        # see if the node is required: if it is and the column is missing, a requirement
        # error will be raised later; no error or warning is logged here when the rule is
        # 'recommended' but the attribute is required and missing
        if val_rule.startswith('recommended') and sg.is_node_required(
                node_display_name=attribute_name):
            level = None

        #if not required, use the message level specified in the rule
        elif rule_parts[-1].lower() == 'error':
            level = 'error'

        elif rule_parts[-1].lower() == 'warning':
            level = 'warning'

        #if no level specified, the default level is warning
        else:
            level = 'warning'

        return level
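
    # Hedged illustration of the mapping above (sg is a SchemaGenerator instance;
    # attribute names are hypothetical):
    #   get_message_level("unique error", sg, "PatientID")    -> 'error'
    #   get_message_level("unique warning", sg, "PatientID")  -> 'warning'
    #   get_message_level("unique", sg, "PatientID")          -> 'warning' (default)
    #   get_message_level("recommended", sg, "RequiredAttr")  -> None (required attribute; handled later)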
Example #5
class MetadataModel(object):
    """Metadata model wrapper around schema.org specification graph.

    Provides basic utilities to:

    1) manipulate the metadata model
    2) generate metadata model views:
        - generate manifest view of the metadata model
        - generate validation schema view of the metadata model
    """

    def __init__(self,
                inputMModelLocation: str,
                inputMModelLocationType: str,
                ) -> None:

        """Instantiates a MetadataModel object.

        Args:
            inputMModelLocation: local path, URI, or Synapse entity ID pointing to the current location of the metadata model (e.g. /User/x/…, gs://…, syn123)
            inputMModelLocationType: specifier indicating where the metadata model resource can be found (e.g. 'local' if the JSON-LD file is on the local machine)
        """
        # extract the extension of 'inputMModelLocation'
        # and ensure that it points to a '.jsonld' file
        if inputMModelLocation.rpartition('.')[-1] == "jsonld":
            logger.debug(f"Initializing SchemaGenerator object from {inputMModelLocation} schema.")
            self.inputMModelLocation = inputMModelLocation

            self.sg = SchemaGenerator(inputMModelLocation)
        else:
            raise TypeError(f"Please make sure {inputMModelLocation} is a .jsonld file.")

        # check if the type of MModel file is "local"
        # currently, the application only supports reading from local JSON-LD files
        if inputMModelLocationType == "local":
            self.inputMModelLocationType = inputMModelLocationType
        else:
            raise ValueError(f"The type '{inputMModelLocationType}' is currently not supported.")

    # business logic: expose metadata model "views" depending on "controller" logic
    # (somewhat analogous to the Model-View-Controller pattern for GUI/web applications);
    # i.e. JSON schemas, annotation manifests, and the metadata/annotation dictionary web explorer
    # are all "views" of the metadata model.
    # The "business logic" in this MetadataModel class provides functions exposing the relevant parts
    # of the metadata model so that these views can be generated by user-facing components;
    # controller components are (loosely speaking) responsible for handling the interaction between views and the model.
    # Some of these components currently reside in the Bundle class.

    def getModelSubgraph(self, rootNode: str,
                        subgraphType: str) -> nx.DiGraph:
        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

        Args:
            rootNode: a schema node label (i.e. term).
            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).

        Returns:
            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        pass

    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

        Args:
            rootNode: a schema object/node label (i.e. term)
            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)

        Returns:
            An ordered list of objects that are all descendants of rootNode.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        ordered_nodes = self.sg.get_descendants_by_edge_type(rootNode, relationshipType, connected=True, ordered=True)

        ordered_nodes.reverse()

        return ordered_nodes
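
    # Hedged illustration, assuming a model where 'Patient' has 'requiresDependency'
    # edges to 'PatientID' and 'Diagnosis' (names are hypothetical):
    #   self.getOrderedModelNodes("Patient", "requiresDependency")
    # returns the subgraph nodes in reversed topological order, e.g.
    #   ['PatientID', 'Diagnosis', ...]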


    def getModelManifest(self, title: str, rootNode: str, jsonSchema: str = None, filenames: list = None) -> str:
        """Gets data from the annotations manifest file.

        TBD: Does this method belong here or in manifest generator?

        Args:
            rootNode: a schema node label (i.e. term).

        Returns:
            A manifest URI (assume Google doc for now).

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        additionalMetadata = {}
        if filenames:
            additionalMetadata["Filename"] = filenames

        mg = ManifestGenerator(path_to_json_ld=self.inputMModelLocation, 
                               title=title, 
                               root=rootNode, 
                               additional_metadata=additionalMetadata)

        if jsonSchema:
            return mg.get_manifest(json_schema=jsonSchema)

        return mg.get_manifest()


    def get_component_requirements(self, source_component: str) -> List[str]:
        """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it.
        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
        Can be utilized to track metadata completion progress across multiple categories of attributes.

        Args:
            source_component: an attribute label indicating the source component.

        Returns:
            A list of required components associated with the source component.
        """
        # get required components for the input/source component
        req_components = self.sg.get_component_requirements(source_component)

        return req_components
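
    # Hedged illustration: if component 'ScRNA-seq' requires 'Biospecimen', which in
    # turn requires 'Patient' (component names are hypothetical), then
    #   self.get_component_requirements("ScRNA-seq")
    # returns a list such as ['Biospecimen', 'Patient'].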


    # TODO: abstract validation in its own module
    def validateModelManifest(self, manifestPath: str, rootNode: str, jsonSchema: str = None) -> List[str]:
        """Check if provided annotations manifest dataframe satisfies all model requirements.

        Args:
            manifestPath: a path to the manifest csv file containing annotations.
            rootNode: a schema node label (i.e. term).
            jsonSchema: optional JSON validation schema; if not provided, one is generated from the model for rootNode.

        Returns:
            A validation status message; if there is an error, the message
            contains the invalid manifest annotation record (i.e. row), along with the validation error associated with this record.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        # get validation schema for a given node in the data model, if the user has not provided input validation schema
        if not jsonSchema:
            jsonSchema = self.sg.get_json_schema_requirements(rootNode, rootNode + "_validation")

        errors = []

        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = pd.read_csv(manifestPath)    # read manifest csv file as is from manifest path
        manifest = trim_commas_df(manifest).fillna("")  # apply cleaning logic as part of pre-processing step
 
        # handler for mismatched components/data types
        # log an error and return component-mismatch errors if the value(s) in the
        # "Component" column differ from the selected template type
        if ('Component' in manifest.columns) and (
            (len(manifest['Component'].unique()) > 1) or (manifest['Component'].unique()[0] != rootNode)
            ):
            logger.error(f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
                         f"selected template type '{rootNode}'.")
            
            # row indexes for all rows where 'Component' does not match rootNode
            row_idxs = manifest.index[manifest['Component'] != rootNode].tolist()
            # column index value for the 'Component' column
            col_idx = manifest.columns.get_loc('Component')
            # Series with index and 'Component' values from manifest
            mismatched_ser = manifest.iloc[row_idxs, col_idx]
            for index, component in mismatched_ser.items():
                errors.append([
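                    # +2 offset: one for the header row, one for 1-based sheet row numbering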
                    index + 2,
                    'Component',
                    f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",

                    # tuple of the component in the manifest and the selected template type
                    # check: R/Reticulate cannot handle dicts? So returning tuple
                    (component, rootNode)
                ])
                
            return errors

        # check if each of the provided annotation columns has validation rule 'list'
        # if so, assume annotations for this column are a comma-separated list of multi-value annotations
        # convert multi-valued annotations to list
        for col in manifest.columns:

            # remove trailing/leading whitespace from values in the column
            manifest[col] = manifest[col].apply(lambda x: x.strip() if isinstance(x, str) else x)

            # convert manifest values to string
            # TODO: when validation handles annotation types as validation rules
            # would have to avoid converting everything to string
            manifest[col] = manifest[col].astype(str)

            # if the validation rule is set to list, convert items in the
            # annotations manifest to a list and strip each value from leading/trailing spaces
            if "list" in self.sg.get_node_validation_rules(col):
                manifest[col] = manifest[col].apply(lambda x: [s.strip() for s in str(x).split(",")])

        annotations = json.loads(manifest.to_json(orient='records'))
        for i, annotation in enumerate(annotations):
            v = Draft7Validator(jsonSchema)

            for error in sorted(v.iter_errors(annotation), key=exceptions.relevance):
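                # each error record has the shape [row, column, message, value];
                # the row is offset by 2 (header row + 1-based sheet numbering)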
                errorRow = i + 2
                errorCol = error.path[-1] if len(error.path) > 0 else "Wrong schema"
                errorMsg = error.message[0:500]
                errorVal = error.instance if len(error.path) > 0 else "Wrong schema"

                errors.append([errorRow, errorCol, errorMsg, errorVal])

        return errors


    def populateModelManifest(self, title: str, manifestPath: str, rootNode: str) -> str:
        """Populate a new annotations manifest template based on an existing manifest.

        Args:
            title: manifest sheet title.
            manifestPath: a path to the manifest csv file containing annotations.
            rootNode: a schema node label (i.e. term).

        Returns:
            A link to the filled in model manifest (e.g. google sheet).

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        mg = ManifestGenerator(path_to_json_ld=self.inputMModelLocation, 
                               title=title, 
                               root=rootNode)

        emptyManifestURL = mg.get_manifest()

        return mg.populate_manifest_spreadsheet(manifestPath, emptyManifestURL)


    def submit_metadata_manifest(self, manifest_path: str, dataset_id: str, validate_component: str = None) -> bool:
        """Wrap methods that are responsible for validation of manifests for a given component, and association of the
        same manifest file with a specified dataset.
        Args:
            manifest_path: Path to the manifest file, which contains the metadata.
            dataset_id: Synapse ID of the dataset on Synapse containing the metadata manifest file.
            validate_component: Component from the schema.org schema based on which the manifest template has been generated.
        Returns:
            True: If both validation and association were successful.
        Raises:
            ValueError: When validate_component is provided, but it cannot be found in the schema.
            ValidationError: If validation against data model was not successful.
        """
        syn_store = SynapseStorage()

        # check if user wants to perform validation or not
        if validate_component is not None:

            try:
                # check if the component ("class" in schema) passed as argument is valid (present in schema) or not
                self.sg.se.is_class_in_schema(validate_component)
            except KeyError:
                # a KeyError is raised when validate_component cannot be found in the schema;
                # suppress it and re-raise as a more descriptive ValueError
                raise ValueError("The component {} could not be found "
                                 "in the schema.".format(validate_component))

            # automatic JSON schema generation and validation with that JSON schema
            val_errors = self.validateModelManifest(manifestPath=manifest_path, rootNode=validate_component)

            # if there are no errors in validation process
            if not val_errors:

                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
                syn_store.associateMetadataWithFiles(metadataManifestPath=manifest_path, datasetId=dataset_id)

                logger.info(f"No validation errors occured during validation.")
                return True
            else:
                raise ValidationError("Manifest could not be validated under provided data model. "
                                      f"Validation failed with the following errors: {val_errors}")

        # no need to perform validation, just submit/associate the metadata manifest file
        syn_store.associateMetadataWithFiles(metadataManifestPath=manifest_path, datasetId=dataset_id)

        logger.debug("Optional validation was not performed on manifest before association.")
        
        return True
Example #6
class ManifestGenerator(object):
    def __init__(
        self,
        path_to_json_ld: str,  # JSON-LD file to be used for generating the manifest
        title: str = None,  # manifest sheet title
        root: str = None,
        additional_metadata: Dict = None,
        oauth: bool = True,
        use_annotations: bool = False,
    ) -> None:
        """TODO: read in a config file instead of hardcoding paths to credential files...
        """

        if oauth:
            # if user wants to use OAuth for Google authentication
            # use credentials.json and create token.pickle file
            services_creds = build_credentials()
        else:
            # if not oauth then use service account credentials
            services_creds = build_service_account_creds()

        # google service for Sheet API
        self.sheet_service = services_creds["sheet_service"]

        # google service for Drive API
        self.drive_service = services_creds["drive_service"]

        # google service credentials object
        self.creds = services_creds["creds"]

        # schema root
        self.root = root

        # manifest title
        self.title = title
        if self.title is None:
            self.title = f"{self.root} - Manifest"

        # Whether to use existing annotations during manifest generation
        self.use_annotations = use_annotations

        # Warn about limited feature support for `use_annotations`
        if self.use_annotations:
            logger.warning(
                "The `use_annotations` option is currently only supported "
                "when there is no manifest file for the dataset in question.")

        # SchemaGenerator() object
        self.sg = SchemaGenerator(path_to_json_ld)

        # additional metadata to add to manifest
        self.additional_metadata = additional_metadata

        # Determine whether current data type is file-based
        is_file_based = False
        if self.root:
            is_file_based = "Filename" in self.sg.get_node_dependencies(
                self.root)
        self.is_file_based = is_file_based

    def _attribute_to_letter(self, attribute, manifest_fields):
        """Map attribute to column letter in a google sheet
        """

        # find index of attribute in manifest field
        column_idx = manifest_fields.index(attribute)

        # return the google sheet letter representation of the column index
        return self._column_to_letter(column_idx)

    def _column_to_letter(self, column):
        """Find google sheet letter representation of a column index integer
         """
        character = chr(ord('A') + column % 26)
        remainder = column // 26
        if column >= 26:
            return self._column_to_letter(remainder - 1) + character
        else:
            return character
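
    # Worked examples of the base-26 conversion above:
    #   _column_to_letter(0)   -> "A"
    #   _column_to_letter(25)  -> "Z"
    #   _column_to_letter(26)  -> "AA"
    #   _column_to_letter(701) -> "ZZ"
    #   _column_to_letter(702) -> "AAA"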

    def _columns_to_sheet_ranges(self, column_idxs):
        """map a set of column indexes to a set of Google sheet API ranges: each range includes exactly one column
        """
        ranges = []

        for column_idx in column_idxs:
            col_range = {
                "startColumnIndex": column_idx,
                "endColumnIndex": column_idx + 1
            }

            ranges.append(col_range)

        return ranges
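
    # For example, _columns_to_sheet_ranges([0, 2]) returns:
    #   [{"startColumnIndex": 0, "endColumnIndex": 1},
    #    {"startColumnIndex": 2, "endColumnIndex": 3}]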

    def _column_to_cond_format_eq_rule(self,
                                       column_idx: int,
                                       condition_argument: str,
                                       required: bool = False) -> dict:
        """Given a column index and an equality argument (e.g. one of valid values for the given column fields), generate a conditional formatting rule based on a custom formula encoding the logic:

        'if a cell in column idx is equal to condition argument, then set specified formatting'
        """

        col_letter = self._column_to_letter(column_idx)

        if not required:
            bg_color = CONFIG["style"]["google_manifest"].get(
                "opt_bg_color", {
                    "red": 1.0,
                    "green": 1.0,
                    "blue": 0.9019,
                })
        else:
            bg_color = CONFIG["style"]["google_manifest"].get(
                "req_bg_color", {
                    "red": 0.9215,
                    "green": 0.9725,
                    "blue": 0.9803,
                })

        boolean_rule = {
            "condition": {
                "type":
                "CUSTOM_FORMULA",
                "values": [{
                    "userEnteredValue":
                    '=$' + col_letter + '1 = "' + condition_argument + '"'
                }]
            },
            "format": {
                'backgroundColor': bg_color
            }
        }

        return boolean_rule
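
    # For column_idx=2 (column "C") and condition_argument="Cancer", the custom
    # formula generated above is: =$C1 = "Cancer"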

    def _gdrive_copy_file(self, origin_file_id, copy_title):
        """Copy an existing file.

        Args:
            origin_file_id: ID of the origin file to copy.
            copy_title: Title of the copy.

        Returns:
            The copied file if successful, None otherwise.
        """
        copied_file = {'name': copy_title}

        # return new copy sheet ID
        return self.drive_service.files().copy(
            fileId=origin_file_id, body=copied_file).execute()["id"]

    def _create_empty_manifest_spreadsheet(self, title):
        if CONFIG["style"]["google_manifest"]["master_template_id"]:

            # if provided with a template manifest google sheet, use it
            spreadsheet_id = self._gdrive_copy_file(
                CONFIG["style"]["google_manifest"]["master_template_id"],
                title)

        else:
            # if no template, create an empty spreadsheet with the given title
            spreadsheet_body = {"properties": {"title": title}}
            spreadsheet = self.sheet_service.spreadsheets().create(
                body=spreadsheet_body, fields='spreadsheetId').execute()
            spreadsheet_id = spreadsheet.get('spreadsheetId')

        return spreadsheet_id

    def _get_cell_borders(self, cell_range):

        #set border style request
        color = {
            "red": 226.0 / 255.0,
            "green": 227.0 / 255.0,
            "blue": 227.0 / 255.0,
        }

        border_style_req = {
            "updateBorders": {
                "range": cell_range,
                "top": {
                    "style": "SOLID",
                    "width": 2,
                    "color": color
                },
                "bottom": {
                    "style": "SOLID",
                    "width": 2,
                    "color": color
                },
                "left": {
                    "style": "SOLID",
                    "width": 2,
                    "color": color
                },
                "right": {
                    "style": "SOLID",
                    "width": 2,
                    "color": color
                },
                "innerHorizontal": {
                    "style": "SOLID",
                    "width": 2,
                    "color": color
                },
                "innerVertical": {
                    "style": "SOLID",
                    "width": 2,
                    "color": color
                }
            }
        }

        return border_style_req

    def _set_permissions(self, fileId):
        def callback(request_id, response, exception):
            if exception:
                # Handle error
                logger.error(exception)
            else:
                logger.info(f"Permission Id: {response.get('id')}")

        batch = self.drive_service.new_batch_http_request(callback=callback)

        worldPermission = {'type': 'anyone', 'role': 'writer'}

        batch.add(self.drive_service.permissions().create(
            fileId=fileId,
            body=worldPermission,
            fields='id',
        ))
        batch.execute()

    def _get_column_data_validation_values(
            self,
            spreadsheet_id,
            valid_values,
            column_id,
            validation_type="ONE_OF_LIST",
            strict=True,
            custom_ui=True,
            input_message="Choose one from dropdown"):

        # get valid values w/o google sheet header
        values = [
            valid_value["userEnteredValue"] for valid_value in valid_values
        ]

        if validation_type == "ONE_OF_RANGE":

            # store valid values explicitly in workbook at the provided range to use as validation values
            target_col_letter = self._column_to_letter(column_id)
            body = {"majorDimension": "COLUMNS", "values": [values]}
            target_range = 'Sheet2!' + target_col_letter + '2:' + target_col_letter + str(
                len(values) + 1)
            valid_values = [{"userEnteredValue": "=" + target_range}]

            response = self.sheet_service.spreadsheets().values().update(
                spreadsheetId=spreadsheet_id,
                range=target_range,
                valueInputOption="RAW",
                body=body).execute()

        # setup validation data request body
        validation_body = {
            "requests": [{
                'setDataValidation': {
                    'range': {
                        'startRowIndex': 1,
                        'startColumnIndex': column_id,
                        'endColumnIndex': column_id + 1,
                    },
                    'rule': {
                        'condition': {
                            'type': validation_type,
                            'values': valid_values
                        },
                        'inputMessage': input_message,
                        'strict': strict,
                        'showCustomUi': custom_ui
                    }
                }
            }]
        }

        return validation_body

    def _get_valid_values_from_jsonschema_property(self,
                                                   prop: dict) -> List[str]:
        """Get valid values for a manifest attribute based on the corresponding
        values of node's properties in JSONSchema

        Args:
            prop: node properties - jsonschema dictionary

        Returns:
            List of valid values
        """

        if "enum" in prop:
            return prop["enum"]
        elif "items" in prop:
            return prop["items"]["enum"]
        else:
            return []
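
    # The two JSONSchema shapes handled above, by example:
    #   {"enum": ["a", "b"]}            -> ["a", "b"]
    #   {"items": {"enum": ["a", "b"]}} -> ["a", "b"]
    #   {"type": "string"}              -> []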

    def get_empty_manifest(self, json_schema_filepath=None):
        # TODO: Refactor get_manifest method
        # - abstract function for requirements gathering
        # - abstract google sheet API requests as functions
        # --- specifying row format
        # --- setting valid values in dropdowns for columns/cells
        # --- setting notes/comments to cells

        spreadsheet_id = self._create_empty_manifest_spreadsheet(self.title)

        if not json_schema_filepath:
            # if no JSON schema is provided, a schema explorer must be defined
            # for the schema.org schema; otherwise this will throw an error
            # TODO: catch error
            json_schema = self.sg.get_json_schema_requirements(
                self.root, self.title)
        else:
            with open(json_schema_filepath) as jsonfile:
                json_schema = json.load(jsonfile)

        required_metadata_fields = {}

        # gathering dependency requirements and corresponding allowed values constraints (i.e. valid values) for root node
        for req in json_schema["properties"].keys():
            required_metadata_fields[
                req] = self._get_valid_values_from_jsonschema_property(
                    json_schema["properties"][req])
            # the following line may not be needed
            json_schema["properties"][req]["enum"] = required_metadata_fields[
                req]

        # gathering dependency requirements and allowed value constraints for conditional dependencies if any
        if "allOf" in json_schema:
            for conditional_reqs in json_schema["allOf"]:
                if "required" in conditional_reqs["if"]:
                    for req in conditional_reqs["if"]["required"]:
                        if req in conditional_reqs["if"]["properties"]:
                            if req not in required_metadata_fields:
                                if req in json_schema["properties"]:
                                    required_metadata_fields[
                                        req] = self._get_valid_values_from_jsonschema_property(
                                            json_schema["properties"][req])
                                else:
                                    required_metadata_fields[
                                        req] = self._get_valid_values_from_jsonschema_property(
                                            conditional_reqs["if"]
                                            ["properties"][req])

                    for req in conditional_reqs["then"]["required"]:
                        if req not in required_metadata_fields:
                            if req in json_schema["properties"]:
                                required_metadata_fields[
                                    req] = self._get_valid_values_from_jsonschema_property(
                                        json_schema["properties"][req])

        # if additional metadata is provided append columns (if those do not exist already)
        if self.additional_metadata:
            for column in self.additional_metadata.keys():
                if column not in required_metadata_fields:
                    required_metadata_fields[column] = []

        # if 'Component' is in the column set, add the root node as an additional metadata
        # component entry (see the input JSON-LD schema for the definition of 'component')
        if 'Component' in required_metadata_fields.keys():
            # check if additional metadata has actually been instantiated in the constructor (it's optional)
            # if not, instantiate it
            if not self.additional_metadata:
                self.additional_metadata = {}

            self.additional_metadata['Component'] = [self.root]

        # adding columns to manifest sheet

        # order column headers (since they are generated from a JSON schema, which is a dict)
        ordered_metadata_fields = [list(required_metadata_fields.keys())]

        ordered_metadata_fields[0] = self.sort_manifest_fields(
            ordered_metadata_fields[0])

        body = {"values": ordered_metadata_fields}

        # determining columns range
        end_col = len(required_metadata_fields.keys())
        end_col_letter = self._column_to_letter(end_col)

        header_range = "Sheet1!A1:" + str(end_col_letter) + "1"

        # adding columns
        self.sheet_service.spreadsheets().values().update(
            spreadsheetId=spreadsheet_id,
            range=header_range,
            valueInputOption="RAW",
            body=body).execute()

        # adding columns to a 2nd sheet that can be used for storing data validation ranges
        # (this avoids limitations on the number of dropdown items in Excel and OpenOffice)
        header_range = "Sheet2!A1:" + str(end_col_letter) + "1"
        self.sheet_service.spreadsheets().values().update(
            spreadsheetId=spreadsheet_id,
            range=header_range,
            valueInputOption="RAW",
            body=body).execute()

        # format column header row
        header_format_body = {
            "requests": [{
                "repeatCell": {
                    "range": {
                        "startRowIndex": 0,
                        "endRowIndex": 1
                    },
                    "cell": {
                        "userEnteredFormat": {
                            "backgroundColor": {
                                "red": 224.0 / 255,
                                "green": 224.0 / 255,
                                "blue": 224.0 / 255
                            },
                            "horizontalAlignment": "CENTER",
                            "textFormat": {
                                "foregroundColor": {
                                    "red": 0.0 / 255,
                                    "green": 0.0 / 255,
                                    "blue": 0.0 / 255
                                },
                                "fontSize": 8,
                                "bold": True
                            }
                        }
                    },
                    "fields":
                    "userEnteredFormat(backgroundColor,textFormat,horizontalAlignment)"
                }
            }, {
                "updateSheetProperties": {
                    "properties": {
                        "gridProperties": {
                            "frozenRowCount": 1
                        }
                    },
                    "fields": "gridProperties.frozenRowCount"
                }
            }, {
                "autoResizeDimensions": {
                    "dimensions": {
                        "dimension": "COLUMNS",
                        "startIndex": 0
                    }
                }
            }]
        }

        response = self.sheet_service.spreadsheets().batchUpdate(
            spreadsheetId=spreadsheet_id, body=header_format_body).execute()

        # adding additional metadata values if needed
        # adding value-constraints from data model as dropdowns

        # fix for issue #410
        # batch google API request to create metadata template
        data = []

        for i, req in enumerate(ordered_metadata_fields[0]):
            values = required_metadata_fields[req]

            if self.additional_metadata and req in self.additional_metadata:
                values = self.additional_metadata[req]
                target_col_letter = self._column_to_letter(i)

                range_vals = target_col_letter + '2:' + target_col_letter + str(
                    len(values) + 1)

                data.append({
                    "range": range_vals,
                    "majorDimension": "COLUMNS",
                    "values": [values]
                })

        batch_update_values_request_body = {
            # How the input data should be interpreted.
            "valueInputOption": "RAW",

            # The new values to apply to the spreadsheet.
            "data": data
        }

        response = self.sheet_service.spreadsheets().values().batchUpdate(
            spreadsheetId=spreadsheet_id,
            body=batch_update_values_request_body).execute()

        # end of fix for issue #410

        #store all requests to execute at once
        requests_body = {}
        requests_body["requests"] = []
        for i, req in enumerate(ordered_metadata_fields[0]):
            values = required_metadata_fields[req]

            # adding description to headers
            # this is not executed if only JSON schema is defined
            # TODO: abstract better and document

            # also formatting required columns
            if self.sg.se:

                # get node definition
                note = self.sg.get_node_definition(req)

                notes_body = {
                    "requests": [{
                        "updateCells": {
                            "range": {
                                "startRowIndex": 0,
                                "endRowIndex": 1,
                                "startColumnIndex": i,
                                "endColumnIndex": i + 1
                            },
                            "rows": [{
                                "values": [{
                                    "note": note
                                }]
                            }],
                            "fields": "note"
                        }
                    }]
                }

                requests_body["requests"].append(notes_body["requests"])

            # get node validation rules if any
            validation_rules = self.sg.get_node_validation_rules(req)

            # if 'list' in validation rules add a note with instructions on
            # adding a list of multiple values
            # TODO: add validation and QC rules "compiler/generator" class elsewhere
            # for now have the list logic here
            if "list" in validation_rules:
                note = "From 'Selection options' menu above, go to 'Select multiple values', check all items that apply, and click 'Save selected values'"
                notes_body = {
                    "requests": [{
                        "repeatCell": {
                            "range": {
                                "startRowIndex": 1,
                                "startColumnIndex": i,
                                "endColumnIndex": i + 1
                            },
                            "cell": {
                                "note": note
                            },
                            "fields": "note"
                        }
                    }]
                }

                requests_body["requests"].append(notes_body["requests"])

            # update background colors so that columns that are required are highlighted
            # check if attribute is required and set a corresponding color
            if req in json_schema["required"]:
                bg_color = CONFIG["style"]["google_manifest"].get(
                    "req_bg_color", {
                        "red": 0.9215,
                        "green": 0.9725,
                        "blue": 0.9803,
                    })

                req_format_body = {
                    "requests": [{
                        "repeatCell": {
                            "range": {
                                "startColumnIndex": i,
                                "endColumnIndex": i + 1
                            },
                            "cell": {
                                "userEnteredFormat": {
                                    "backgroundColor": bg_color
                                }
                            },
                            "fields": "userEnteredFormat(backgroundColor)"
                        }
                    }]
                }

                requests_body["requests"].append(req_format_body["requests"])

            # adding value-constraints if any
            req_vals = [{
                "userEnteredValue": value
            } for value in values if value]

            if not req_vals:
                continue

            # generating sheet api request to populate a dropdown or a multi selection UI
            if len(req_vals) > 0 and "list" not in validation_rules:
                # if there are any values for the dropdown, use the ONE_OF_RANGE type of
                # validation, since Excel and OpenOffice do not support other kinds of data
                # validation for larger numbers of items (Excel has a total character
                # limit per dropdown, even if the individual items are not that many)
                validation_body = self._get_column_data_validation_values(
                    spreadsheet_id,
                    req_vals,
                    i,
                    validation_type="ONE_OF_RANGE")

            elif "list" in validation_rules:
                # if list is in validation rule attempt to create a multi-value
                # selection UI, which requires explicit valid values range in
                # the spreadsheet
                validation_body = self._get_column_data_validation_values(
                    spreadsheet_id,
                    req_vals,
                    i,
                    strict=False,
                    custom_ui=False,
                    input_message="",
                    validation_type="ONE_OF_RANGE")

            else:
                validation_body = self._get_column_data_validation_values(
                    spreadsheet_id, req_vals, i)

            requests_body["requests"].append(validation_body["requests"])

            # generate a conditional format rule for each required value (i.e. valid value)
            # for this field: if this field is set to a valid value that requires additional
            # fields to be filled in, those dependency fields are formatted in a custom style (e.g. red background)
            for req_val in req_vals:
                # get this required/valid value's node label in schema, based on display name (i.e. shown to the user in a dropdown to fill in)
                req_val = req_val["userEnteredValue"]

                req_val_node_label = self.sg.get_node_label(req_val)
                if not req_val_node_label:
                    # if this node is not in the graph
                    # continue - there are no dependencies for it
                    continue

                # check if this required/valid value has additional dependency attributes
                val_dependencies = self.sg.get_node_dependencies(
                    req_val_node_label, schema_ordered=False)

                # prepare request calls
                dependency_formatting_body = {"requests": []}

                if val_dependencies:
                    # if there are additional attribute dependencies find the corresponding
                    # fields that need to be filled in and construct conditional formatting rules
                    # indicating the dependencies need to be filled in

                    # set target ranges for this rule
                    # i.e. dependency attribute columns that will be formatted

                    # find dependency column indexes
                    # note that dependency values must be present in the ordered fields index
                    # TODO: catch the ValueError that shouldn't happen
                    column_idxs = [
                        ordered_metadata_fields[0].index(val_dep)
                        for val_dep in val_dependencies
                    ]

                    # construct ranges based on dependency column indexes
                    rule_ranges = self._columns_to_sheet_ranges(column_idxs)
                    # go over valid value dependencies
                    for j, val_dep in enumerate(val_dependencies):
                        # check whether the dependency attribute itself is required
                        is_required = bool(self.sg.is_node_required(val_dep))

                        # construct formatting rule
                        formatting_rule = self._column_to_cond_format_eq_rule(
                            i, req_val, required=is_required)

                        # construct conditional format rule
                        conditional_format_rule = {
                            "addConditionalFormatRule": {
                                "rule": {
                                    "ranges": rule_ranges[j],
                                    "booleanRule": formatting_rule,
                                },
                                "index": 0
                            }
                        }
                        dependency_formatting_body["requests"].append(
                            conditional_format_rule)

                # check if dependency formatting rules have been added and update sheet if so
                if dependency_formatting_body["requests"]:
                    requests_body["requests"].append(
                        dependency_formatting_body["requests"])

        # setting cell borders
        cell_range = {
            "sheetId": 0,
            "startRowIndex": 0,
        }
        requests_body["requests"].append(self._get_cell_borders(cell_range))

        execute_google_api_requests(self.sheet_service,
                                    requests_body,
                                    service_type="batch_update",
                                    spreadsheet_id=spreadsheet_id)

        # setting up spreadsheet permissions (setup so that anyone with the link can edit)
        self._set_permissions(spreadsheet_id)

        # generating spreadsheet URL
        manifest_url = "https://docs.google.com/spreadsheets/d/" + spreadsheet_id

        # print("========================================================================================================")
        # print("Manifest successfully generated from schema!")
        # print("URL: " + manifest_url)
        # print("========================================================================================================")

        return manifest_url

    def set_dataframe_by_url(self, manifest_url: str,
                             manifest_df: pd.DataFrame) -> ps.Spreadsheet:
        """Update Google Sheets using given pandas DataFrame.

        Args:
            manifest_url (str): Google Sheets URL.
            manifest_df (pd.DataFrame): Data frame to "upload".

        Returns:
            ps.Spreadsheet: A Google Sheet object.
        """
        # authorize pygsheets to read from the given URL
        gc = ps.authorize(custom_credentials=self.creds)

        # open google sheets and extract first sheet
        sh = gc.open_by_url(manifest_url)
        wb = sh[0]

        # The following line sets `valueInputOption = "RAW"` in pygsheets
        sh.default_parse = False

        # update spreadsheet with given manifest starting at top-left cell
        wb.set_dataframe(manifest_df, (1, 1))

        # set permissions so that anyone with the link can edit
        sh.share("", role="writer", type="anyone")

        return sh

    def get_dataframe_by_url(self, manifest_url: str) -> pd.DataFrame:
        """Retrieve pandas DataFrame from table in Google Sheets.

        Args:
            manifest_url (str): Google Sheets URL.

        Return:
            pd.DataFrame: Data frame corresponding to table in given URL.
        """

        # authorize pygsheets to read from the given URL
        gc = ps.authorize(custom_credentials=self.creds)

        # open google sheets and extract first sheet
        sh = gc.open_by_url(manifest_url)
        wb = sh[0]

        # get column headers and read it into a dataframe
        manifest_df = wb.get_as_df(hasHeader=True)

        # An empty column is sometimes included
        if "" in manifest_df:
            manifest_df.drop(columns=[""], inplace=True)

        return manifest_df

    def map_annotation_names_to_display_names(
            self, annotations: pd.DataFrame) -> pd.DataFrame:
        """Update columns names to use display names for consistency.

        Args:
            annotations (pd.DataFrame): Annotations table.

        Returns:
            pd.DataFrame: Annotations table with updated column headers.
        """
        # Get list of attribute nodes from data model
        model_nodes = self.sg.se.get_nx_schema().nodes

        # Subset annotations to those appearing as a label in the model
        labels = filter(lambda x: x in model_nodes, annotations.columns)

        # Generate a dictionary mapping labels to display names
        label_map = {l: model_nodes[l]["displayName"] for l in labels}

        # Use the above dictionary to rename columns in question
        return annotations.rename(columns=label_map)

    def get_manifest_with_annotations(
            self,
            annotations: pd.DataFrame) -> Tuple[ps.Spreadsheet, pd.DataFrame]:
        """Generate manifest, optionally with annotations (if requested).

        Args:
            annotations (pd.DataFrame): Annotations table (can be empty).

        Returns:
            Tuple[ps.Spreadsheet, pd.DataFrame]: Both the Google Sheet
            URL and the corresponding data frame are returned.
        """

        # Map annotation labels to display names to match manifest columns
        annotations = self.map_annotation_names_to_display_names(annotations)

        # Convert annotations table into dictionary, but maintain order
        annotations_dict_raw = annotations.to_dict(into=OrderedDict)
        annotations_dict = OrderedDict(
            (k, list(v.values())) for k, v in annotations_dict_raw.items())

        # Needs to happen before get_empty_manifest() gets called
        self.additional_metadata = annotations_dict

        # Generate empty manifest using `additional_metadata`
        manifest_url = self.get_empty_manifest()
        manifest_df = self.get_dataframe_by_url(manifest_url)

        # Annotations clashing with manifest attributes are skipped
        # during empty manifest generation. For more info, search
        # for `additional_metadata` in `self.get_empty_manifest`.
        # Hence, the shared columns need to be updated separately.
        if self.is_file_based and self.use_annotations:
            # This approach assumes that `update_df` returns
            # a data frame whose columns are in the same order
            manifest_df = update_df(manifest_df, annotations)
            manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
            manifest_url = manifest_sh.url

        return manifest_url, manifest_df

    def get_manifest(self,
                     dataset_id: str = None,
                     sheet_url: bool = None,
                     json_schema: str = None):
        """Gets manifest for a given dataset on Synapse.

        Args:
            dataset_id: Synapse ID of the "dataset" entity on Synapse (for a given center/project).
            sheet_url: Determines if googlesheet URL or pandas dataframe should be returned.

        Returns:
            Googlesheet URL (if sheet_url is True), or pandas dataframe (if sheet_url is False).
        """

        # Handle case when no dataset ID is provided
        if not dataset_id:
            return self.get_empty_manifest(json_schema_filepath=json_schema)

        # Otherwise, create manifest using the given dataset
        syn_store = SynapseStorage()

        # Get manifest file associated with given dataset (if applicable)
        syn_id_and_path = syn_store.getDatasetManifest(datasetId=dataset_id)

        # Populate empty template with existing manifest
        if syn_id_and_path:

            # TODO: Update or remove the warning in self.__init__() if
            # you change the behavior here based on self.use_annotations

            # get synapse ID manifest associated with dataset
            manifest_data = syn_store.getDatasetManifest(datasetId=dataset_id,
                                                         downloadFile=True)

            # If the sheet URL isn't requested, simply return a pandas DataFrame
            if not sheet_url:
                return pd.read_csv(manifest_data.path)

            # get URL of an empty manifest file created based on schema component
            empty_manifest_url = self.get_empty_manifest()

            # populate empty manifest with content from downloaded/existing manifest
            pop_manifest_url = self.populate_manifest_spreadsheet(
                manifest_data.path, empty_manifest_url)

            return pop_manifest_url

        # Generate empty template and optionally fill in with annotations
        else:

            # Using getDatasetAnnotations() to retrieve file names and subset
            # entities to files and folders (ignoring tables/views)
            annotations = pd.DataFrame()
            if self.is_file_based:
                annotations = syn_store.getDatasetAnnotations(dataset_id)

            # Subset columns if not interested in user-defined annotations
            if self.is_file_based and not self.use_annotations:
                annotations = annotations[["Filename", "eTag", "entityId"]]

            # Update `additional_metadata` and generate manifest
            manifest_url, manifest_df = self.get_manifest_with_annotations(
                annotations)

            if sheet_url:
                return manifest_url
            else:
                return manifest_df

    def populate_manifest_spreadsheet(self, existing_manifest_path,
                                      empty_manifest_url):
        """Creates a google sheet manifest based on existing manifest.

        Args:
            existing_manifest_path: the location of the manifest containing metadata presently stored
            empty_manifest_url: the path to a manifest template to be prepopulated with existing's manifest metadata
        """

        # read existing manifest
        manifest = pd.read_csv(existing_manifest_path).fillna("")

        # sort manifest columns
        manifest_fields = manifest.columns.tolist()
        manifest_fields = self.sort_manifest_fields(manifest_fields)
        manifest = manifest[manifest_fields]

        # TODO: Handle scenario when existing manifest does not match new
        #       manifest template due to changes in the data model
        manifest_sh = self.set_dataframe_by_url(empty_manifest_url, manifest)

        return manifest_sh.url

    def sort_manifest_fields(self, manifest_fields, order="schema"):
        # order manifest fields alphabetically (base order)
        manifest_fields = sorted(manifest_fields)

        if order == "alphabetical":
            # if the order is alphabetical ensure that filename is first, if present
            if "Filename" in manifest_fields:
                manifest_fields.remove("Filename")
                manifest_fields.insert(0, "Filename")

        # order manifest fields based on schema (schema.org)
        if order == "schema":
            if self.sg and self.root:
                # get display names of dependencies
                dependencies_display_names = self.sg.get_node_dependencies(
                    self.root)

                # reorder manifest fields so that root dependencies are first and follow schema order
                manifest_fields = sorted(
                    manifest_fields,
                    key=lambda x: dependencies_display_names.index(x)
                    if x in dependencies_display_names
                    else len(manifest_fields) - 1,
                )
            else:
                raise ValueError(
                    "Provide a valid data model path and a valid component from the data model."
                )

        # always have entityId as last column, if present
        if "entityId" in manifest_fields:
            manifest_fields.remove("entityId")
            manifest_fields.append("entityId")

        return manifest_fields
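
    # A minimal sketch of the resulting ordering (hypothetical field names,
    # continuing `mg` from the sketch above):
    #
    #   fields = ["entityId", "Age", "Filename", "Diagnosis"]
    #   mg.sort_manifest_fields(fields)
    #   # -> schema-ordered root dependencies first and "entityId" moved last,
    #   #    e.g. ["Filename", "Age", "Diagnosis", "entityId"]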
Example #7
0
class MetadataModel(object):
    """Metadata model wrapper around schema.org specification graph.

    Provides basic utilities to:

    1) manipulate the metadata model
    2) generate metadata model views:
        - generate manifest view of the metadata model
        - generate validation schema view of the metadata model
    """
    def __init__(
        self,
        inputMModelLocation: str,
        inputMModelLocationType: str,
    ) -> None:
        """Instantiates a MetadataModel object.

        Args:
            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
        """
        # extract extension of 'inputMModelLocation'
        # ensure that it is necessarily pointing to a '.jsonld' file
        if inputMModelLocation.rpartition(".")[-1] == "jsonld":
            logger.debug(
                f"Initializing SchemaGenerator object from {inputMModelLocation} schema."
            )
            self.inputMModelLocation = inputMModelLocation

            self.sg = SchemaGenerator(inputMModelLocation)
        else:
            raise TypeError(
                f"Please make sure {inputMModelLocation} is a .jsonld file.")

        # check if the type of MModel file is "local"
        # currently, the application only supports reading from local JSON-LD files
        if inputMModelLocationType == "local":
            self.inputMModelLocationType = inputMModelLocationType
        else:
            raise ValueError(
                f"The type '{inputMModelLocationType}' is currently not supported."
            )

    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

        Args:
            rootNode: a schema node label (i.e. term).
            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).

        Returns:
            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        pass  # TODO: not yet implemented; see docstring for intended behavior

    def getOrderedModelNodes(self, rootNode: str,
                             relationshipType: str) -> List[str]:
        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

        Args:
            rootNode: a schema object/node label (i.e. term)
            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)

        Returns:
            An ordered list of objects that are all descendants of rootNode.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        ordered_nodes = self.sg.get_descendants_by_edge_type(rootNode,
                                                             relationshipType,
                                                             connected=True,
                                                             ordered=True)

        ordered_nodes.reverse()

        return ordered_nodes
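
    # A minimal usage sketch (hypothetical node/edge labels):
    #
    #   mm = MetadataModel("example.model.jsonld", "local")
    #   mm.getOrderedModelNodes("Patient", "requiresDependency")
    #   # -> descendants of "Patient", in reverse topological-sort order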

    def getModelManifest(
        self,
        title: str,
        rootNode: str,
        datasetId: str = None,
        jsonSchema: str = None,
        filenames: list = None,
        useAnnotations: bool = False,
        sheetUrl: bool = True,
    ) -> str:
        """Gets data from the annotations manifest file.

        TBD: Does this method belong here or in manifest generator?

        Args:
            title: manifest sheet title.
            rootNode: a schema node label (i.e. term).
            datasetId: Synapse ID of a dataset whose existing manifest or annotations can populate the template.
            jsonSchema: optional JSON validation schema to use during manifest generation.
            filenames: optional list of filenames to prepopulate the 'Filename' column.
            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
            sheetUrl: whether to return a Google Sheet URL (True, default) or a pandas DataFrame (False).

        Returns:
            A manifest URI (assume Google doc for now).

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        additionalMetadata = {}
        if filenames:
            additionalMetadata["Filename"] = filenames

        mg = ManifestGenerator(
            path_to_json_ld=self.inputMModelLocation,
            title=title,
            root=rootNode,
            additional_metadata=additionalMetadata,
            use_annotations=useAnnotations,
        )

        if datasetId:
            return mg.get_manifest(dataset_id=datasetId,
                                   json_schema=jsonSchema,
                                   sheet_url=sheetUrl)

        return mg.get_manifest(sheet_url=sheetUrl)

    def get_component_requirements(self,
                                   source_component: str,
                                   as_graph: bool = False) -> List:
        """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it.
        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
        Can be utilized to track metadata completion progress across multiple categories of attributes.

        Args:
            source_component: an attribute label indicating the source component.
            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)

        Returns:
            A list of required components associated with the source component.
        """

        # get required components for the input/source component
        req_components = self.sg.get_component_requirements(source_component)

        # retrieve components as a graph
        if as_graph:
            req_components_graph = self.sg.get_component_requirements_graph(
                source_component)

            # serialize the component dependencies DAG to an edge list of node tuples
            req_components = list(req_components_graph.edges())

            return req_components

        return req_components
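
    # A minimal usage sketch (hypothetical component label, continuing `mm`
    # from the sketch above):
    #
    #   mm.get_component_requirements("Biospecimen")
    #   # -> a flat list of required components, e.g. ["Patient", ...]
    #   mm.get_component_requirements("Biospecimen", as_graph=True)
    #   # -> an edge list of node tuples, e.g. [("Biospecimen", "Patient"), ...]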

    # TODO: abstract validation in its own module
    def validateModelManifest(
        self,
        manifestPath: str,
        rootNode: str,
        restrict_rules: bool = False,
        jsonSchema: str = None,
        project_scope: List = None,
    ) -> List[str]:
        """Check if provided annotations manifest dataframe satisfies all model requirements.

        Args:
            rootNode: a schema node label (i.e. term).
            manifestPath: a path to the manifest csv file containing annotations.
            restrict_rules: bypass Great Expectations and restrict rule options to those implemented in house

        Returns:
            A validation status message; if there is an error, the message
            contains the invalid manifest annotation record (i.e. row), along with the validation error associated with that record.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        # get validation schema for a given node in the data model, if the user has not provided input validation schema

        if not jsonSchema:
            jsonSchema = self.sg.get_json_schema_requirements(
                rootNode, rootNode + "_validation")

        errors = []
        warnings = []

        load_args = {"dtype": "string"}
        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = load_df(
            manifestPath,
            preserve_raw_input=False,
            **load_args,
        )  # read manifest csv file as is from manifest path

        # handler for mismatched components/data types
        # log an error and return early if the value(s) in the "Component" column differ from the selected template type
        if ("Component" in manifest.columns) and (
            (len(manifest["Component"].unique()) > 1) or
            (manifest["Component"].unique()[0] != rootNode)):
            logging.error(
                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
                f"selected template type '{rootNode}'.")

            # row indexes for all rows where 'Component' is not rootNode
            row_idxs = manifest.index[
                manifest["Component"] != rootNode].tolist()
            # column index value for the 'Component' column
            col_idx = manifest.columns.get_loc("Component")
            # Series with index and 'Component' values from manifest
            mismatched_ser = manifest.iloc[row_idxs, col_idx]
            for index, component in mismatched_ser.items():
                errors.append([
                    index + 2,
                    "Component",
                    f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
                    # tuple of the component in the manifest and selected template type
                    # check: R/Reticulate cannot handle dicts? So returning tuple
                    (component, rootNode),
                ])

            return errors, warnings

        errors, warnings, manifest = validate_all(self, errors, warnings,
                                                  manifest, manifestPath,
                                                  self.sg, jsonSchema,
                                                  restrict_rules,
                                                  project_scope)
        return errors, warnings
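
    # A minimal usage sketch (hypothetical path/label, continuing `mm` from
    # the sketches above):
    #
    #   errors, warnings = mm.validateModelManifest(
    #       manifestPath="patient_manifest.csv", rootNode="Patient")
    #   # each error record is of the form [row, column, message, value]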

    def populateModelManifest(self, title, manifestPath: str,
                              rootNode: str) -> str:
        """Populate an existing annotations manifest based on a dataframe.
            TODO: Remove this method; always use getModelManifest instead

        Args:
            rootNode: a schema node label (i.e. term).
            manifestPath: a path to the manifest csv file containing annotations.

        Returns:
            A link to the filled in model manifest (e.g. google sheet).

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        mg = ManifestGenerator(path_to_json_ld=self.inputMModelLocation,
                               title=title,
                               root=rootNode)

        emptyManifestURL = mg.get_manifest()

        return mg.populate_manifest_spreadsheet(manifestPath, emptyManifestURL)

    def submit_metadata_manifest(
        self,
        manifest_path: str,
        path_to_json_ld: str,
        dataset_id: str,
        manifest_record_type: str,
        restrict_rules: bool,
        validate_component: str = None,
        use_schema_label: bool = True,
        hide_blanks: bool = False,
        input_token: str = None,
        project_scope: List = None,
    ) -> str:
        """Wrap methods that are responsible for validation of manifests for a given component, and association of the
        same manifest file with a specified dataset.
        Args:
            manifest_path: Path to the manifest file, which contains the metadata.
            dataset_id: Synapse ID of the dataset on Synapse containing the metadata manifest file.
            validate_component: Component from the schema.org schema based on which the manifest template has been generated.
        Returns:
            Manifest ID: If both validation and association were successful.
        Raises:
            ValueError: When validate_component is provided, but it cannot be found in the schema.
            ValidationError: If validation against data model was not successful.
        """

        # TODO: avoid explicitly exposing Synapse store functionality
        # just instantiate a Store class and let it decide at runtime/config
        # the store type
        syn_store = SynapseStorage(input_token=input_token)
        manifest_id = None
        censored_manifest_id = None
        restrict_manifest = False
        censored_manifest_path = manifest_path.replace('.csv', '_censored.csv')
        # check if user wants to perform validation or not
        if validate_component is not None:

            try:
                # check if the component ("class" in schema) passed as argument is valid (present in schema) or not
                self.sg.se.is_class_in_schema(validate_component)
            except KeyError:
                # is_class_in_schema() raises a KeyError when validate_component
                # is absent from the schema; replace it with a more descriptive
                # ValueError
                raise ValueError("The component {} could not be found "
                                 "in the schema.".format(validate_component))

            # automatic JSON schema generation and validation with that JSON schema
            val_errors, val_warnings = self.validateModelManifest(
                manifestPath=manifest_path,
                rootNode=validate_component,
                restrict_rules=restrict_rules,
                project_scope=project_scope,
            )

            # if there are no errors in validation process
            if val_errors == []:
                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
                if exists(censored_manifest_path):
                    censored_manifest_id = syn_store.associateMetadataWithFiles(
                        schemaGenerator=self.sg,
                        metadataManifestPath=censored_manifest_path,
                        datasetId=dataset_id,
                        manifest_record_type=manifest_record_type,
                        useSchemaLabel=use_schema_label,
                        hideBlanks=hide_blanks,
                    )
                    restrict_manifest = True

                manifest_id = syn_store.associateMetadataWithFiles(
                    schemaGenerator=self.sg,
                    metadataManifestPath=manifest_path,
                    datasetId=dataset_id,
                    manifest_record_type=manifest_record_type,
                    useSchemaLabel=use_schema_label,
                    hideBlanks=hide_blanks,
                    restrict_manifest=restrict_manifest,
                )

                logger.info(f"No validation errors occured during validation.")
                return manifest_id

            else:
                raise ValidationError(
                    "Manifest could not be validated under provided data model. "
                    f"Validation failed with the following errors: {val_errors}"
                )

        # no need to perform validation, just submit/associate the metadata manifest file
        if exists(censored_manifest_path):
            censored_manifest_id = syn_store.associateMetadataWithFiles(
                schemaGenerator=self.sg,
                metadataManifestPath=censored_manifest_path,
                datasetId=dataset_id,
                manifest_record_type=manifest_record_type,
                useSchemaLabel=use_schema_label,
                hideBlanks=hide_blanks,
            )
            restrict_manifest = True

        manifest_id = syn_store.associateMetadataWithFiles(
            schemaGenerator=self.sg,
            metadataManifestPath=manifest_path,
            datasetId=dataset_id,
            manifest_record_type=manifest_record_type,
            useSchemaLabel=use_schema_label,
            hideBlanks=hide_blanks,
            restrict_manifest=restrict_manifest,
        )

        logger.debug(
            "Optional validation was not performed on manifest before association."
        )

        return manifest_id
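
    # A minimal usage sketch (hypothetical IDs/paths/values; assumes a valid
    # Synapse token):
    #
    #   manifest_id = mm.submit_metadata_manifest(
    #       manifest_path="patient_manifest.csv",
    #       path_to_json_ld="example.model.jsonld",
    #       dataset_id="syn123",
    #       manifest_record_type="table",
    #       restrict_rules=False,
    #       validate_component="Patient",
    #       input_token="<synapse-token>",
    #   )
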
    def validate_manifest_rules(
        self,
        manifest: pd.core.frame.DataFrame,
        sg: SchemaGenerator,
        restrict_rules: bool,
        project_scope: List,
    ) -> (pd.core.frame.DataFrame, List[List[str]]):
        """
        Purpose:
            Take validation rules set for a particular attribute
            and validate manifest entries based on these rules.
        Input:
            manifest: pd.core.frame.DataFrame
                imported from models/metadata.py
                contains metadata input from user for each attribute.
            sg: SchemaGenerator
                initialized within models/metadata.py
        Returns:
            manifest: pd.core.frame.DataFrame
                If a 'list' validator is run, the manifest needs to be
                updated to change the attribute column values to a list.
                In this case the manifest will be updated then exported.
            errors: List[List[str]]
                If any errors are generated they will be added to an errors
                list log recording the following information:
                [error_row, error_col, error_message, error_val]
        TODO:
            - Investigate why a :: delimiter is breaking up the
                validation rules without any explicit handling...
            - Move the rules formatting validation to the JSONLD
                generation script.
        """

        # for each type of rule that can be specified (key), point
        # to the type of validation that will be run (value).
        validation_types = {
            "int": "type_validation",
            "float": "type_validation",
            "num": "type_validation",
            "str": "type_validation",
            "regex": "regex_validation",
            "url": "url_validation",
            "list": "list_validation",
            "matchAtLeastOne": "cross_validation",
            "matchExactlyOne": "cross_validation",
            "recommended": "content_validation",
            "protectAges": "content_validation",
            "unique": "content_validation",
            "inRange": "content_validation",
        }

        type_dict = {
            "float64": float,
            "int64": int,
            "str": str,
        }

        unimplemented_expectations = [
            "url",
            "list",
            "regex.*",
            "matchAtLeastOne.*",
            "matchExactlyOne.*",
        ]

        in_house_rules = [
            "int",
            "float",
            "num",
            "str",
            "regex.*",
            "url",
            "list",
            "matchAtLeastOne.*",
            "matchExactlyOne.*",
        ]

        # initialize error and warning handling lists.
        errors = []
        warnings = []

        unimplemented_expectations = '|'.join(unimplemented_expectations)
        in_house_rules = '|'.join(in_house_rules)

        if not restrict_rules:
            # operations necessary to set up and run GE suite validation
            ge_helpers = GreatExpectationsHelpers(
                sg=sg,
                unimplemented_expectations=unimplemented_expectations,
                manifest=manifest,
                manifestPath=self.manifestPath,
            )

            ge_helpers.build_context()
            ge_helpers.build_expectation_suite()
            ge_helpers.build_checkpoint()

            # run GE validation
            results = ge_helpers.context.run_checkpoint(
                checkpoint_name="manifest_checkpoint",
                batch_request={
                    "runtime_parameters": {
                        "batch_data": manifest
                    },
                    "batch_identifiers": {
                        "default_identifier_name": "manifestID"
                    },
                },
                result_format={'result_format': 'COMPLETE'},
            )

            validation_results = results.list_validation_results()

            # parse validation results dict and generate errors
            errors, warnings = ge_helpers.generate_errors(
                errors=errors,
                warnings=warnings,
                validation_results=validation_results,
                validation_types=validation_types,
            )
        else:
            logging.info("Great Expetations suite will not be utilized.")

        for col in manifest.columns:
            # remove trailing/leading whitespace from the column's values
            manifest[col] = manifest[col].apply(
                lambda x: x.strip() if isinstance(x, str) else x)
            validation_rules = sg.get_node_validation_rules(col)

            # Check that attribute rules conform to limits:
            # no more than two rules for an attribute.
            # As more combinations get added, may want to bring out into its own function / or use validate_rules_utils?
            if len(validation_rules) > 2:
                errors.append(
                    self.get_multiple_types_error(validation_rules,
                                                  col,
                                                  error_type="too_many_rules"))

            # Given a validation rule, run validation. Skip validations already performed by GE
            for rule in validation_rules:
                validation_type = rule.split(" ")[0]
                if re.match(unimplemented_expectations,
                            rule) or (re.match(in_house_rules, rule)
                                      and restrict_rules):
                    if not re.match(in_house_rules, rule):
                        logging.warning(
                            f"Validation rule {rule.split(' ')[0]} has not been implemented in house and cannnot be validated without Great Expectations."
                        )
                        continue

                    # validate each individual validation rule
                    validation_method = getattr(
                        ValidateAttribute, validation_types[validation_type])

                    if validation_type == "list":
                        vr_errors, vr_warnings, manifest_col = validation_method(
                            self, rule, manifest[col])
                        manifest[col] = manifest_col
                    elif validation_type.lower().startswith("match"):
                        vr_errors, vr_warnings = validation_method(
                            self,
                            rule,
                            manifest[col],
                            project_scope,
                        )
                    else:
                        vr_errors, vr_warnings = validation_method(
                            self, rule, manifest[col])
                    # Check for validation rule errors and add them to other errors.
                    if vr_errors:
                        errors.extend(vr_errors)
                    if vr_warnings:
                        warnings.extend(vr_warnings)

        return manifest, errors, warnings


def generate_generator(schema):
    generator = SchemaGenerator(schema_explorer=schema)
    return generator
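
# A minimal sketch of the rule dispatch used in validate_manifest_rules() above:
# the first token of a rule string selects a handler name in `validation_types`,
# which is then looked up on ValidateAttribute via getattr, e.g.:
#
#   rule = "regex search [a-f]"                       # hypothetical rule string
#   validation_type = rule.split(" ")[0]              # -> "regex"
#   handler_name = validation_types[validation_type]  # -> "regex_validation"
#   validation_method = getattr(ValidateAttribute, handler_name)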
Example #10
0
def get_manifest(
    ctx,
    title,
    data_type,
    jsonld,
    dataset_id,
    sheet_url,
    output_csv,
    use_annotations,
    oauth,
    json_schema,
    output_xlsx,
    alphabetize_valid_values,
):
    """
    Running CLI with manifest generation options.
    """
    # optional parameters that need to be passed to ManifestGenerator()
    # can be read from config.yml as well
    data_type = fill_in_from_config("data_type", data_type,
                                    ("manifest", "data_type"))
    jsonld = fill_in_from_config("jsonld", jsonld,
                                 ("model", "input", "location"))
    title = fill_in_from_config("title",
                                title, ("manifest", "title"),
                                allow_none=True)
    json_schema = fill_in_from_config(
        "json_schema",
        json_schema,
        ("model", "input", "validation_schema"),
        allow_none=True,
    )
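
    # fill_in_from_config() (defined elsewhere) is assumed to prefer the CLI
    # argument and fall back to the value at the given config.yml key path,
    # e.g. ("model", "input", "location") -> config["model"]["input"]["location"].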

    def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
        # create object of type ManifestGenerator
        manifest_generator = ManifestGenerator(
            path_to_json_ld=jsonld,
            title=t,
            root=data_type,
            oauth=oauth,
            use_annotations=use_annotations,
            alphabetize_valid_values=alphabetize_valid_values,
        )

        # call get_manifest() on manifest_generator
        result = manifest_generator.get_manifest(
            dataset_id=dataset_id,
            sheet_url=sheet_url,
            json_schema=json_schema,
        )

        if sheet_url:
            logger.info(
                "Find the manifest template using this Google Sheet URL:")
            click.echo(result)
        if output_csv is None and output_xlsx is None:
            prefix, _ = os.path.splitext(jsonld)
            prefix_root, prefix_ext = os.path.splitext(prefix)
            if prefix_ext == ".model":
                prefix = prefix_root
            output_csv = f"{prefix}.{data_type}.manifest.csv"
        elif output_xlsx:
            export_manifest_excel(output_excel=output_xlsx, manifest=result)
            logger.info(
                f"Find the manifest template using this Excel file path: {output_xlsx}"
            )
            return result
        export_manifest_csv(file_name=output_csv, manifest=result)
        logger.info(
            f"Find the manifest template using this CSV file path: {output_csv}"
        )
        return result

    if isinstance(data_type, str):
        data_type = [data_type]

    if data_type[0] == 'all manifests':
        sg = SchemaGenerator(path_to_json_ld=jsonld)
        component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
        components = component_digraph.nodes()
        for component in components:
            t = f'{title}.{component}.manifest'
            result = create_single_manifest(data_type=component)
    else:
        for dt in data_type:
            if len(data_type) > 1:
                t = f'{title}.{dt}.manifest'
            else:
                t = title
            result = create_single_manifest(data_type=dt,
                                            output_csv=output_csv,
                                            output_xlsx=output_xlsx)

    return result
Example #11
0
def get_manifest_route(schema_url,
                       title,
                       oauth,
                       use_annotations,
                       dataset_ids=None,
                       asset_view=None):
    # call config_handler()
    config_handler(asset_view=asset_view)

    # get path to temporary JSON-LD file
    jsonld = get_temp_jsonld(schema_url)

    # Gather all data_types to make manifests for.
    all_args = connexion.request.args
    args_dict = dict(all_args.lists())
    data_type = args_dict['data_type']

    # Gather all dataset_ids, if provided (fall back to the function argument)
    dataset_ids = args_dict.get("dataset_id", dataset_ids)

    if dataset_ids:
        # Check that the number of submitted data_types matches
        # the number of dataset_ids (if applicable)
        if len(data_type) != len(dataset_ids):
            raise ValueError(
                "There is a mismatch in the number of data_types and dataset_ids "
                "submitted. Please check your submission and try again.")

        # Raise an error if dataset_ids are used in conjunction with data_type = 'all manifests'
        if data_type[0] == 'all manifests':
            raise ValueError(
                "When submitting 'all manifests' as the data_type, dataset_id values "
                "cannot also be submitted. Please check your submission and try again.")

    def create_single_manifest(data_type, dataset_id=None):
        # create object of type ManifestGenerator
        manifest_generator = ManifestGenerator(
            path_to_json_ld=jsonld,
            title=t,
            root=data_type,
            oauth=oauth,
            use_annotations=use_annotations,
            alphabetize_valid_values='ascending',
        )

        result = manifest_generator.get_manifest(
            dataset_id=dataset_id,
            sheet_url=True,
        )

        return result

    # Gather all returned result urls
    all_results = []
    if data_type[0] == 'all manifests':
        sg = SchemaGenerator(path_to_json_ld=jsonld)
        component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
        components = component_digraph.nodes()
        for component in components:
            t = f'{title}.{component}.manifest'
            result = create_single_manifest(data_type=component)
            all_results.append(result)
    else:
        for i, dt in enumerate(data_type):
            if len(data_type) > 1:
                t = f'{title}.{dt}.manifest'
            else:
                t = title

            if dataset_ids:
                # if a dataset_id is provided add this to the function call.
                result = create_single_manifest(data_type=dt,
                                                dataset_id=dataset_ids[i])
            else:
                result = create_single_manifest(data_type=dt)
            all_results.append(result)

    return all_results


# pytest-style fixture: yields a SchemaGenerator built from the example data model
def sg(helpers):

    inputModelLocation = helpers.get_data_path('example.model.jsonld')
    sg = SchemaGenerator(inputModelLocation)

    yield sg
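

# A minimal sketch of a test consuming the fixture above (hypothetical node
# label; assumes `sg` is registered as a pytest fixture):
def test_node_dependencies(sg):
    deps = sg.get_node_dependencies("Patient")
    assert isinstance(deps, list)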