    def _is_directory_valid(self, dir_identifier):
        """Check recursively up to the root that the directory tree is valid.

        :param dir_identifier: directory identifier of the directory to be
                               validated
        :returns: ``None``
        """

        if dir_identifier not in self.valid_directories:
            # Validate metadata against JSON schema
            dir_metadata = self.metax_client.get_directory(dir_identifier)
            try:
                jsonschema.validate(
                    dir_metadata,
                    siptools_research.schemas.DIRECTORY_METADATA_SCHEMA)
            except jsonschema.ValidationError as exc:
                directory_id = dir_metadata.get('directory_path',
                                                dir_identifier)
                raise InvalidDatasetMetadataError(
                    "Validation error in metadata of {directory_id}: {error}".
                    format(directory_id=directory_id, error=str(exc)))
            self.valid_directories.add(dir_identifier)
            if 'parent_directory' in dir_metadata:
                self._is_directory_valid(
                    dir_metadata['parent_directory']['identifier'])
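
A minimal, self-contained sketch of the same walk-to-root validation. The `get_directory` callable and `_DIR_SCHEMA` below are stand-ins for the Metax client and `DIRECTORY_METADATA_SCHEMA`, not part of the code above; an iterative loop is used instead of recursion to avoid recursion limits on very deep trees.

import jsonschema

_DIR_SCHEMA = {
    "type": "object",
    "required": ["identifier", "directory_path"],
}

def validate_directory_chain(identifier, get_directory, valid=None):
    """Validate a directory and all of its ancestors, caching results."""
    valid = set() if valid is None else valid
    while identifier is not None and identifier not in valid:
        metadata = get_directory(identifier)
        jsonschema.validate(metadata, _DIR_SCHEMA)
        valid.add(identifier)
        parent = metadata.get("parent_directory")
        identifier = parent["identifier"] if parent else None
    return valid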
Example #2
def _validate_datacite(dataset_id, metax_client, dummy_doi="false"):
    """Validate datacite.

    :param dataset_id: dataset identifier
    :param metax_client: metax_access.Metax instance
    :returns: ``None``
    """
    try:
        datacite = metax_client.get_datacite(dataset_id, dummy_doi=dummy_doi)
    except DataciteGenerationError as exception:
        raise InvalidDatasetMetadataError(str(exception))

    schema = lxml.etree.XMLSchema(lxml.etree.parse(DATACITE_SCHEMA))
    if not schema.validate(datacite):
        # pylint: disable=not-an-iterable
        errors = [error.message for error in schema.error_log]
        raise InvalidDatasetMetadataError(
            DATACITE_VALIDATION_ERROR % _format_error_list(errors))
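
The lxml part of the check in isolation: a sketch that validates an inline XML document against an inline XSD instead of the DATACITE_SCHEMA file (both documents here are made up for illustration).

import lxml.etree

XSD = b"""<?xml version="1.0"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:element name="identifier" type="xs:string"/>
</xs:schema>"""

schema = lxml.etree.XMLSchema(lxml.etree.fromstring(XSD))
document = lxml.etree.fromstring(b"<identifier>10.1234/example</identifier>")

if not schema.validate(document):
    # error_log holds one entry per validation failure
    messages = [error.message for error in schema.error_log]
    raise ValueError("; ".join(messages))
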
Example #3
    def find_file_categories(self):
        """Create a logical structure map of the dataset files.

        Returns a dictionary with file categories as keys and file paths
        as values.

        :returns: logical structure map dictionary
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        dataset_files = metax_client.get_dataset_files(self.dataset_id)
        dataset_metadata = metax_client.get_dataset(self.dataset_id)
        languages = get_dataset_languages(dataset_metadata)
        dirpath2usecategory = get_dirpath_dict(metax_client, dataset_metadata)
        logical_struct = dict()

        for dataset_file in dataset_files:

            file_id = dataset_file['identifier']

            # Get the use category of file. The path to the file in
            # logical structmap is stored in 'use_category' in metax.
            filecategory = find_file_use_category(file_id, dataset_metadata)

            # If a file listed in datasets/<id>/files is not listed in
            # the 'files' section of the dataset metadata, look up the
            # parent directory of the file in the 'directories' section.
            # The "use_category" of the file is then the "use_category"
            # of its parent directory.
            if filecategory is None:
                name_len = len(dataset_file["file_name"])

                filecategory = find_dir_use_category(
                    dataset_file["file_path"][:-name_len], dirpath2usecategory,
                    languages)

            # If file category was not found even for the parent
            # directory, raise error
            if filecategory is None:
                raise InvalidDatasetMetadataError(
                    "File category for file {} was not found".format(file_id))

            # Append path to logical_struct[filecategory] list. Create
            # list if it does not exist already
            if filecategory not in logical_struct:
                logical_struct[filecategory] = []
            logical_struct[filecategory].append(dataset_file['file_path'])

        return logical_struct
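
The grouping step at the end of the method can be expressed with a defaultdict; a small standalone sketch with made-up file records and no Metax access.

from collections import defaultdict

files = [
    {"file_path": "/data/measurements.csv", "use_category": "source"},
    {"file_path": "/docs/readme.txt", "use_category": "documentation"},
    {"file_path": "/data/results.csv", "use_category": "source"},
]

logical_struct = defaultdict(list)
for record in files:
    logical_struct[record["use_category"]].append(record["file_path"])

# logical_struct == {"source": ["/data/measurements.csv",
#                               "/data/results.csv"],
#                    "documentation": ["/docs/readme.txt"]}
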
Example #4
def _validate_localization(localization_dict, field):
    """Validate languages.

    Check that the localization dict is not empty and all the keys are valid
    ISO 639-1 language codes.
    """
    if not localization_dict:
        raise InvalidDatasetMetadataError(
            "No localization provided in field: 'research_dataset/%s'" % field)

    for language in localization_dict:
        # Per MetaX schema, 'und' and 'zxx' are fallbacks for content that
        # can't be localized
        if language in ("und", "zxx"):
            continue

        try:
            languages.get(part1=language)
        except KeyError:
            message = (
                "Invalid language code: '%s' in field: 'research_dataset/%s'"
            ) % (language, field)

            raise InvalidDatasetMetadataError(message)
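
A standalone sketch of the same ISO 639-1 check. It assumes the iso-639 package that the snippet above uses (`from iso639 import languages`); `valid_language_codes` is a hypothetical helper, not part of the code above.

from iso639 import languages

def valid_language_codes(codes):
    """Return the subset of codes that pass the ISO 639-1 lookup."""
    valid = set()
    for code in codes:
        # 'und' and 'zxx' are accepted fallbacks, as in the snippet above
        if code in ("und", "zxx"):
            valid.add(code)
            continue
        try:
            languages.get(part1=code)
        except KeyError:
            continue
        valid.add(code)
    return valid

# valid_language_codes(["en", "fi", "xx"]) == {"en", "fi"}
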
Example #5
def _validate_dataset_metadata(dataset_metadata, dummy_doi="false"):
    """Validate dataset metadata.

    Validates dataset metadata from /rest/v1/datasets/<dataset_id>

    :param dataset_metadata: dataset metadata dictionary
    :param dummy_doi: "true" if a dummy DOI is used, in which case
                      "preservation_identifier" is not required
    :returns: ``None``
    """
    schema = copy.deepcopy(siptools_research.schemas.DATASET_METADATA_SCHEMA)
    # If a dummy DOI is used, drop preservation_identifier from the schema
    if dummy_doi == "true":
        schema["required"] = ["research_dataset", "contract"]
        del schema["properties"]["preservation_identifier"]

    try:
        jsonschema.validate(dataset_metadata, schema)
    except jsonschema.ValidationError as exc:
        raise InvalidDatasetMetadataError(str(exc))
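
A sketch of the schema-relaxing pattern used above: deep-copy the base schema, drop the requirement that no longer applies, then validate. The schema here is a toy stand-in, not the real DATASET_METADATA_SCHEMA.

import copy

import jsonschema

BASE_SCHEMA = {
    "type": "object",
    "required": ["research_dataset", "contract", "preservation_identifier"],
    "properties": {
        "research_dataset": {"type": "object"},
        "contract": {"type": "object"},
        "preservation_identifier": {"type": "string"},
    },
}

def validate_dataset(metadata, dummy_doi=False):
    schema = copy.deepcopy(BASE_SCHEMA)
    if dummy_doi:
        # With a dummy DOI there is no preservation identifier to require
        schema["required"] = ["research_dataset", "contract"]
        del schema["properties"]["preservation_identifier"]
    jsonschema.validate(metadata, schema)

validate_dataset({"research_dataset": {}, "contract": {}}, dummy_doi=True)
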
Example #6
def _validate_file_metadata(dataset, metax_client, conf):
    """Validate file metadata found from /rest/v1/datasets/<dataset_id>/files.

    :param dataset: dataset
    :param metax_client: metax_access.Metax instance
    :param conf: siptools_research Configuration object
    :returns: ``None``
    """
    # DatasetConsistency is used to verify file consistency within the
    # dataset, i.e. that every file returned by the Metax API endpoint
    # /datasets/<dataset_id>/files can be found in the dataset's 'files'
    # or 'directories' properties.
    consistency = DatasetConsistency(metax_client, dataset)
    directory_validation = DirectoryValidation(metax_client)

    dataset_files = metax_client.get_dataset_files(dataset['identifier'])
    if not dataset_files:
        raise InvalidDatasetMetadataError(
            "Dataset must contain at least one file")
    for file_metadata in dataset_files:

        file_identifier = file_metadata["identifier"]
        file_path = file_metadata["file_path"]

        # Validate metadata against JSON schema
        try:
            jsonschema.validate(file_metadata,
                                siptools_research.schemas.FILE_METADATA_SCHEMA)
        except jsonschema.ValidationError as exc:
            raise InvalidFileMetadataError(
                "Validation error in metadata of {file_path}: {error}".format(
                    file_path=file_path, error=str(exc)))

        directory_validation.is_valid_for_file(file_metadata)

        # Check that mimetype is supported
        _check_mimetype(file_metadata, conf)

        # Check that file path does not point outside SIP
        normalised_path = os.path.normpath(file_path.strip('/'))
        if normalised_path.startswith('..'):
            raise InvalidFileMetadataError(
                'The file path of file %s is invalid: %s' %
                (file_identifier, file_path))
        consistency.is_consistent_for_file(file_metadata)
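
The path-traversal guard above in isolation, as a hypothetical helper: normalise the path and reject anything that climbs out of the SIP root.

import os

def is_safe_sip_path(file_path):
    """Return False if the path escapes the SIP root after normalisation."""
    normalised = os.path.normpath(file_path.strip('/'))
    return not normalised.startswith('..')

assert is_safe_sip_path('/data/file.txt')
assert not is_safe_sip_path('/../../etc/passwd')
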
Example #7
    def is_consistent_for_file(self, file_md):
        """Verifies that file is contained by dataset.

        Raises ``InvalidDatasetMetadataError`` if file is not dataset files or
        directories.

        :param file_md: Metax file metadata dictionary
        :returns: ``None``
        """
        file_identifier = file_md['identifier']
        if 'files' in self.dataset['research_dataset']:
            for file_ in self.dataset['research_dataset']['files']:
                if file_['identifier'] == file_identifier:
                    return
        if file_md['parent_directory']['identifier'] not in self.directories:
            temp_dirs = set()
            if not self._is_directory_contained_by_dataset_directories(
                    file_md['parent_directory']['identifier'], temp_dirs):
                raise InvalidDatasetMetadataError(
                    'File not found from dataset files nor directories: %s' %
                    (file_md["file_path"]))
            # we might have found new dataset directories
            self.directories.update(temp_dirs)
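
The containment rule reduced to a sketch: a file is consistent if it is listed directly, or if any ancestor directory belongs to the dataset. The `parents` mapping and `file_in_dataset` helper are hypothetical stand-ins for the Metax directory hierarchy.

def file_in_dataset(file_id, parent_dir, dataset_files, dataset_dirs,
                    parents):
    if file_id in dataset_files:
        return True
    directory = parent_dir
    while directory is not None:
        if directory in dataset_dirs:
            return True
        directory = parents.get(directory)
    return False

parents = {"dir-child": "dir-root"}
assert file_in_dataset("f1", "dir-child", set(), {"dir-root"}, parents)
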
    def run(self):
        """Raise InvalidDatasetMetadataError.

        :returns: ``None``
        """
        raise InvalidDatasetMetadataError('Missing some important metadata')
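
A runnable sketch of how callers might react to these errors, collecting messages instead of failing on the first one. The exception class here is a local stand-in for the project's real InvalidDatasetMetadataError.

class InvalidDatasetMetadataError(Exception):
    pass

def check_dataset(validators):
    """Run each validator and collect error messages."""
    errors = []
    for validate in validators:
        try:
            validate()
        except InvalidDatasetMetadataError as exc:
            errors.append(str(exc))
    return errors

def failing_validator():
    raise InvalidDatasetMetadataError('Missing some important metadata')

# check_dataset([failing_validator]) == ['Missing some important metadata']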