Example no. 1
    def __init__(self, *args, **kwargs):
        """Initialize Task."""
        super(CreateTechnicalMetadata, self).__init__(*args, **kwargs)
        self.config_object = Configuration(self.config)
        self.metax_client = Metax(
            self.config_object.get('metax_url'),
            self.config_object.get('metax_user'),
            self.config_object.get('metax_password'),
            verify=self.config_object.getboolean('metax_ssl_verification'))
    def run(self):
        """Get dataset 1 from Metax.

        :returns:  ``None``
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        metax_client.get_dataset('1')
    def find_file_categories(self):
        """Create logical structure map of dataset files.

        Returns dictionary with filecategories as keys and filepaths as
        values.

        :returns: logical structure map dictionary
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        dataset_files = metax_client.get_dataset_files(self.dataset_id)
        dataset_metadata = metax_client.get_dataset(self.dataset_id)
        languages = get_dataset_languages(dataset_metadata)
        dirpath2usecategory = get_dirpath_dict(metax_client, dataset_metadata)
        logical_struct = dict()

        for dataset_file in dataset_files:

            file_id = dataset_file['identifier']

            # Get the use category of the file. The path to the file in
            # the logical structmap is stored in 'use_category' in Metax.
            filecategory = find_file_use_category(file_id, dataset_metadata)

            # If a file listed in datasets/<id>/files is not listed in
            # the 'files' section of the dataset metadata, look for the
            # parent_directory of the file in the 'directories' section.
            # The "use_category" of the file is then the "use_category"
            # of the parent directory.
            if filecategory is None:
                name_len = len(dataset_file["file_name"])

                filecategory = find_dir_use_category(
                    dataset_file["file_path"][:-name_len], dirpath2usecategory,
                    languages)

            # If file category was not found even for the parent
            # directory, raise error
            if filecategory is None:
                raise InvalidDatasetMetadataError(
                    "File category for file {} was not found".format(file_id))

            # Append path to logical_struct[filecategory] list. Create
            # list if it does not exist already
            if filecategory not in logical_struct:
                logical_struct[filecategory] = []
            logical_struct[filecategory].append(dataset_file['file_path'])

        return logical_struct
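For illustration, a minimal sketch of the mapping find_file_categories returns; the use-category labels and file paths below are hypothetical:

# Hypothetical return value: use categories map to lists of file paths.
logical_struct = {
    "Source material": ["/data/measurements.csv", "/data/notes.txt"],
    "Documentation": ["/docs/readme.pdf"],
}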
Example no. 4
    def __init__(self, url=None, user=None, password=None, verify=None):
        """Init MetaxClient instances."""
        # If any of the params is not provided read them from app.config
        if url is None or user is None or password is None:
            url = CONFIG.get("METAX_URL")
            user = CONFIG.get("METAX_USER")
            password = CONFIG.get("METAX_PASSWORD")

        if verify is None:
            verify = CONFIG.get("METAX_SSL_VERIFICATION", True)

        self.client = Metax(url, user, password, verify=verify)
        # dataset_id => preservation_state dict
        self.dataset_cache = {}
Example no. 5
def test_validate_file_metadata_invalid_metadata(requests_mock):
    """Test ``_validate_file_metadata``.

    Function should raise exceptions with descriptive error messages.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    file_metadata = copy.deepcopy(BASE_FILE)
    file_metadata['file_characteristics'] = {
        "file_created": "2014-01-17T08:19:31Z"
    }
    tests.utils.add_metax_dataset(requests_mock, files=[file_metadata])

    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(configuration.get('metax_url'),
                   configuration.get('metax_user'),
                   configuration.get('metax_password'),
                   verify=configuration.getboolean('metax_ssl_verification'))

    expected_error = (
        "Validation error in metadata of path/to/file: 'file_format' is"
        " a required property\n\nFailed validating 'required' in schema")
    with pytest.raises(InvalidFileMetadataError, match=expected_error):
        # pylint: disable=protected-access
        siptools_research.metadata_validator._validate_file_metadata(
            {'identifier': 'dataset_identifier'}, client, configuration)
def test_verify_file_contained_by_dataset_files():
    """Test is_consistent_for_file method.

    Check that ``DatasetConsistency::is_consistent_for_file()`` succeeds
    when the dataset's files contain the file.

    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(
        configuration.get('metax_url'),
        configuration.get('metax_user'),
        configuration.get('metax_password'),
        verify=configuration.getboolean('metax_ssl_verification')
    )
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': {'files': [
            {'identifier': 'file_identifier'}
        ], 'directories': []}
    }

    file_metadata = {
        'identifier': 'file_identifier',
        'file_path': "/path/to/file",
        'parent_directory': {'identifier': 'parent_directory_identifier'}
    }
    try:
        dirs = DatasetConsistency(client, dataset)
        dirs.is_consistent_for_file(file_metadata)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            '_verify_file_contained_by_dataset raised exception: ' + str(exc)
        )
Example no. 7
def test_validate_datacite(requests_mock):
    """Test _validate_datacite.

    Function should raise an exception with a readable error message
    when the datacite XML contains multiple errors.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    metax_client = Metax(
        configuration.get('metax_url'),
        configuration.get('metax_user'),
        configuration.get('metax_password'),
        verify=configuration.getboolean('metax_ssl_verification'))

    requests_mock.get(
        "https://metaksi/rest/v1/datasets/dataset_identifier?"
        "dataset_format=datacite",
        content=get_very_invalid_datacite())

    # Try to validate datacite
    expected_error = "Datacite metadata is invalid:"
    # pylint: disable=protected-access
    with pytest.raises(InvalidDatasetMetadataError, match=expected_error):
        metadata_validator._validate_datacite('dataset_identifier',
                                              metax_client)
Example no. 8
def test_get_dirpath_dict_no_directories():
    """Test get_dirpath_dict function with dataset without directories.

    The function should return an empty dict when no directories are
    defined in the research_dataset.
    """
    metax_client = Metax("https://metaksi", "test", "test")
    assert not get_dirpath_dict(metax_client, {"research_dataset": {}})
Example no. 9
def test_validate_file_metadata(requests_mock):
    """Test _validate_file_metadata.

    Check that dataset directory caching works correctly in
    DatasetConsistency when the files have a common root directory in
    the dataset.directories property.

    :param requests_mock: Mocker object
    :returns: ``None``
    """
    dataset = copy.deepcopy(BASE_DATASET)
    dataset['research_dataset']['directories'] = [{'identifier': 'root_dir'}]
    file_1 = copy.deepcopy(TXT_FILE)
    file_1['identifier'] = 'file_identifier1'
    file_2 = copy.deepcopy(TXT_FILE)
    file_2['identifier'] = 'file_identifier2'
    requests_mock.get(tests.conftest.METAX_URL +
                      '/directories/pid:urn:dir:wf1',
                      json={
                          'identifier': 'first_par_dir',
                          'directory_path': '',
                          'parent_directory': {
                              'identifier': 'second_par_dir'
                          }
                      },
                      status_code=200)
    requests_mock.get(tests.conftest.METAX_URL + '/directories/second_par_dir',
                      json={
                          'identifier': 'second_par_dir',
                          'directory_path': '',
                          'parent_directory': {
                              'identifier': 'root_dir'
                          }
                      },
                      status_code=200)
    requests_mock.get(tests.conftest.METAX_URL + '/directories/root_dir',
                      json={
                          'identifier': 'root_dir',
                          'directory_path': '/'
                      },
                      status_code=200)
    files_adapter = requests_mock.get(tests.conftest.METAX_URL +
                                      '/datasets/dataset_identifier/files',
                                      json=[file_1, file_2],
                                      status_code=200)

    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(configuration.get('metax_url'),
                   configuration.get('metax_user'),
                   configuration.get('metax_password'),
                   verify=configuration.getboolean('metax_ssl_verification'))

    # pylint: disable=protected-access
    siptools_research.metadata_validator._validate_file_metadata(
        dataset, client, configuration)

    assert files_adapter.call_count == 1
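The assertion above works because requests_mock returns a matcher object whose call_count records how many times the mocked endpoint was hit. A minimal standalone illustration:

import requests
import requests_mock

with requests_mock.Mocker() as mocker:
    adapter = mocker.get("http://test.invalid/files", json=[])
    requests.get("http://test.invalid/files")
    assert adapter.call_count == 1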
    def get_provenance_ids(self):
        """List identifiers of provenance events.

        Gets list of dataset provenance events from Metax, and reads
        provenance IDs of the events from event.xml files found in the
        workspace directory.

        :returns: list of provenance IDs
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        metadata = metax_client.get_dataset(self.dataset_id)
        languages = get_dataset_languages(metadata)

        # Get the reference file path from the Luigi task input.
        # It already contains the workspace path.
        event_ids = get_md_references(
            read_md_references(
                self.workspace,
                os.path.basename(
                    self.input()['create_provenance_information'].path)))

        event_type_ids = {}
        for event_id in event_ids:
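            # The file name is URL-encoded: %3A is ':' ("PREMIS:EVENT")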
            event_file = event_id[1:] + "-PREMIS%3AEVENT-amd.xml"
            event_file_path = os.path.join(self.sip_creation_path, event_file)
            if not os.path.exists(event_file_path):
                continue
            root = ET.parse(encode_path(event_file_path)).getroot()
            event_type = root.xpath("//premis:eventType",
                                    namespaces=NAMESPACES)[0].text
            event_type_ids[event_type] = event_id

        provenance_ids = []
        for provenance in metadata["research_dataset"]["provenance"]:
            event_type = get_localized_value(
                provenance["preservation_event"]["pref_label"],
                languages=languages)
            provenance_ids += [event_type_ids[event_type]]

        return provenance_ids
    def run(self):
        """Read list of required files from Metax and download them.

        Files are written to path based on ``file_path`` in Metax.

        :returns: ``None``
        """
        upload_database = upload_rest_api.database.Database()

        # Find file identifiers from Metax dataset metadata.
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        dataset_files = metax_client.get_dataset_files(self.dataset_id)

        # Download files to temporary directory which will be moved to
        # output target path when all files have been downloaded
        with self.output().temporary_path() as temporary_directory:
            os.mkdir(temporary_directory)

            for dataset_file in dataset_files:
                identifier = dataset_file["identifier"]

                # Full path to file
                target_path = os.path.normpath(
                    os.path.join(temporary_directory,
                                 dataset_file["file_path"].strip('/')))
                if not target_path.startswith(temporary_directory):
                    raise InvalidFileMetadataError(
                        'The file path of file %s is invalid: %s' %
                        (identifier, dataset_file["file_path"]))

                # Create the download directory for file if it does not
                # exist already
                if not os.path.isdir(os.path.dirname(target_path)):
                    # TODO: Use exist_ok -parameter when moving to
                    # python3
                    os.makedirs(os.path.dirname(target_path))

                download_file(dataset_file, target_path, self.config,
                              upload_database)
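The startswith check above guards against path traversal, since file_path comes from external metadata. A self-contained sketch of the idea with illustrative paths:

import os

temporary_directory = "/tmp/workspace"
# A malicious file_path could try to escape the download directory:
file_path = "../../etc/passwd"
target_path = os.path.normpath(
    os.path.join(temporary_directory, file_path.strip('/')))
print(target_path)                                  # /etc/passwd
print(target_path.startswith(temporary_directory))  # False -> rejected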
    def run(self):
        """Report preservation status to Metax.

        Checks the path of ingest report file in digital preservation
        service. If the ingest report is in ~/accepted/.../ directory,
        the dataset has passed validation. If the report is found in the
        ~/rejected/.../ directory, or anywhere else, an exception is
        raised. The event handlers will deal with the exceptions.

        :returns: ``None``
        """
        # List of all matching paths ValidateSIP found
        ingest_report_paths = self.input()[0].existing_paths()

        # Only one ingest report should be found
        assert len(ingest_report_paths) == 1

        # 'accepted' or 'rejected'?
        directory = ingest_report_paths[0].split('/')[0]
        if directory == 'accepted':
            # Init metax
            config_object = Configuration(self.config)
            metax_client = Metax(
                config_object.get('metax_url'),
                config_object.get('metax_user'),
                config_object.get('metax_password'),
                verify=config_object.getboolean('metax_ssl_verification'))
            # Set Metax preservation state of this dataset to 6 ("in
            # longterm preservation")
            metax_client.set_preservation_state(
                self.dataset_id,
                state=DS_STATE_IN_DIGITAL_PRESERVATION,
                system_description='Accepted to preservation')
            with self.output().open('w') as output:
                output.write('Dataset id=' + self.dataset_id)
        elif directory == 'rejected':
            # Raise exception that informs event handler that dataset
            # did not pass validation
            raise InvalidSIPError("SIP was rejected")
        else:
            raise ValueError('Report was found in incorrect '
                             'path: %s' % ingest_report_paths[0])
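The first component of the report path encodes the validation outcome, which is what the split above extracts; a tiny illustration with a hypothetical report path:

ingest_report_path = "accepted/2023-01-01/report.html"  # hypothetical
assert ingest_report_path.split('/')[0] == 'accepted'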
Example no. 13
    def run(self):
        """Copy datacite.xml metadatafile from Metax.

        Creates a METS document that contains dmdSec element with
        datacite metadata.

        :returns: ``None``
        """
        # Get datacite.xml from Metax
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        dataset = metax_client.get_dataset(self.dataset_id)
        datacite = metax_client.get_datacite(dataset['identifier'])

        # Write datacite.xml to file
        datacite_path = os.path.join(self.workspace, 'datacite.xml')
        datacite.write(datacite_path)

        tmp = os.path.join(config_object.get('packaging_root'), 'tmp/')
        with TemporaryDirectory(prefix=tmp) as temporary_workspace:
            # Create output files with siptools
            import_description.import_description(
                dmdsec_location=datacite_path,
                workspace=temporary_workspace,
                without_uuid=True)

            # Move created files to SIP creation directory. PREMIS event
            # reference file is moved to output target path after
            # everything else is done.
            with self.output().temporary_path() as target_path:
                shutil.move(
                    os.path.join(temporary_workspace,
                                 'premis-event-md-references.jsonl'),
                    target_path)
                for file_ in os.listdir(temporary_workspace):
                    shutil.move(os.path.join(temporary_workspace, file_),
                                self.sip_creation_path)
def test_directory_validation_caching_works(requests_mock):
    """Test directory validation caching.

    Two files are contained by the same directory. Metax is called only
    once for each directory in the tree, and hence each directory is
    validated only once.

    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(configuration.get('metax_url'),
                   configuration.get('metax_user'),
                   configuration.get('metax_password'),
                   verify=configuration.getboolean('metax_ssl_verification'))
    first_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/first_par',
        json={
            'identifier': 'first_par',
            'directory_path': '/second_par/first_par',
            'parent_directory': {
                'identifier': 'second_par'
            }
        },
        status_code=200)
    second_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/second_par',
        json={
            'identifier': 'second_par',
            'directory_path': '/second_par',
            'parent_directory': {
                'identifier': 'root'
            }
        },
        status_code=200)
    root_dir_adapter = requests_mock.get(tests.conftest.METAX_URL +
                                         '/directories/root',
                                         json={
                                             'identifier': 'root',
                                             'directory_path': '/'
                                         },
                                         status_code=200)
    file2_metadata = copy.deepcopy(FILE_METADATA)
    file2_metadata['file_path'] = "/path/to/file2"
    try:
        validator = DirectoryValidation(client)
        validator.is_valid_for_file(FILE_METADATA)
        validator.is_valid_for_file(file2_metadata)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            'test_directory_validation_caching_works fails: ' + str(exc))
    # verify that metax is called only once for directories
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
    assert root_dir_adapter.call_count == 1
    def get_identifiers(self):
        """Get file identifiers.

        Return a list of all the file identifiers and the path to the
        downloaded files.

        :returns: Tuple (list of identifiers, cache_path)
        """
        config_object = Configuration(self.config)
        packaging_root = config_object.get("packaging_root")
        cache_path = os.path.join(packaging_root, "file_cache")

        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        try:
            dataset_files = metax_client.get_dataset_files(self.dataset_id)
            return [_file["identifier"] for _file in dataset_files], cache_path
        except DatasetNotAvailableError:
            return [], cache_path
Example no. 16
def validate_metadata(dataset_id,
                      config="/etc/siptools_research.conf",
                      dummy_doi="false"):
    """Validate dataset.

    Reads dataset metadata, file metadata, and additional techMD XML from
    Metax and validates them against schemas. Raises InvalidDatasetError
    if the dataset is invalid.

    :param dataset_id: dataset identifier
    :param config: configuration file path
    :param dummy_doi: 'true' if a dummy preservation identifier is to be used
    :returns: ``True``, if dataset metadata is valid.
    """
    conf = Configuration(config)
    metax_client = Metax(conf.get('metax_url'),
                         conf.get('metax_user'),
                         conf.get('metax_password'),
                         verify=conf.getboolean('metax_ssl_verification'))
    # Get dataset metadata from Metax
    dataset_metadata = metax_client.get_dataset(dataset_id)

    # Validate dataset metadata
    _validate_dataset_metadata(dataset_metadata, dummy_doi=dummy_doi)

    # Validate dataset localization
    _validate_dataset_localization(dataset_metadata)

    # Validate contract metadata
    _validate_contract_metadata(dataset_metadata['contract']['identifier'],
                                metax_client)

    # Validate file metadata for each file in dataset files
    _validate_file_metadata(dataset_metadata, metax_client, conf)

    # Validate datacite provided by Metax
    _validate_datacite(dataset_id, metax_client, dummy_doi=dummy_doi)

    return True
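A minimal usage sketch of this entry point; the dataset identifier is hypothetical, and the import path assumes validate_metadata lives in siptools_research.metadata_validator alongside the private helpers it calls:

from siptools_research.metadata_validator import validate_metadata

# Hypothetical identifier; raises InvalidDatasetError when invalid.
if validate_metadata("dataset_identifier",
                     config="/etc/siptools_research.conf"):
    print("Dataset metadata is valid")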
Example no. 17
def test_get_dirpath_dict(requests_mock):
    """Test that get_dirpath_dict returns the correct dictionary.

    The dictionary maps dirpath to use_category.

    :param requests_mock: Mocker object
    """
    requests_mock.get("https://metaksi/rest/v1/directories/1",
                      json={
                          "identifier": "1",
                          "directory_path": "/"
                      })
    requests_mock.get("https://metaksi/rest/v1/directories/2",
                      json={
                          "identifier": "2",
                          "directory_path": "/test"
                      })

    metax_client = Metax("https://metaksi", "test", "test")
    dataset_metadata = {
        "research_dataset": {
            "directories": [{
                "identifier": "1",
                "use_category": {
                    "pref_label": {
                        "en": "rootdir"
                    }
                }
            }, {
                "identifier": "2",
                "use_category": {
                    "pref_label": {
                        "en": "testdir"
                    }
                }
            }]
        }
    }

    assert get_dirpath_dict(metax_client, dataset_metadata) == {
        "/": {
            "pref_label": {
                "en": "rootdir"
            }
        },
        "/test": {
            "pref_label": {
                "en": "testdir"
            }
        }
    }
def _create_premis_events(dataset_id, workspace, config):
    """Create premis events from provenance metadata.

    Reads dataset provenance metadata from Metax. For each provenance
    object a METS document that contains a PREMIS event element is
    created.

    :param dataset_id: dataset identifier
    :param workspace: SIP creation directory
    :param config: path to configuration file
    :returns: ``None``
    """
    config_object = Configuration(config)
    metadata = Metax(config_object.get('metax_url'),
                     config_object.get('metax_user'),
                     config_object.get('metax_password'),
                     verify=config_object.getboolean(
                         'metax_ssl_verification')).get_dataset(dataset_id)

    dataset_languages = get_dataset_languages(metadata)

    provenances = metadata["research_dataset"]["provenance"]

    for provenance in provenances:

        event_type = get_localized_value(
            provenance["preservation_event"]["pref_label"],
            languages=dataset_languages)

        try:
            event_datetime = provenance["temporal"]["start_date"]
        except KeyError:
            event_datetime = 'OPEN'

        event_detail = get_localized_value(provenance["description"],
                                           languages=dataset_languages)

        event_outcome = get_localized_value(
            provenance["event_outcome"]["pref_label"],
            languages=dataset_languages)

        event_outcome_detail = get_localized_value(
            provenance["outcome_description"], languages=dataset_languages)

        premis_event.premis_event(workspace=workspace,
                                  event_type=event_type,
                                  event_datetime=event_datetime,
                                  event_detail=event_detail,
                                  event_outcome=event_outcome,
                                  event_outcome_detail=event_outcome_detail)
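For context, a hedged sketch of the shape one provenance entry presumably has, inferred from the key accesses above; all values are illustrative:

# Inferred shape of one research_dataset["provenance"] entry.
provenance = {
    "preservation_event": {"pref_label": {"en": "creation"}},
    "temporal": {"start_date": "2014-01-01T08:19:58Z"},  # optional
    "description": {"en": "Dataset was created"},
    "event_outcome": {"pref_label": {"en": "success"}},
    "outcome_description": {"en": "Creation succeeded"},
}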
    def run(self):
        """Compile all metadata files into METS document.

        :returns: ``None``
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        metadata = metax_client.get_dataset(self.dataset_id)

        # Get preservation_identifier from Metax
        preservation_id = metadata["preservation_identifier"]

        # Get contract data from Metax
        contract_id = metadata["contract"]["identifier"]
        contract_metadata = metax_client.get_contract(contract_id)
        contract_identifier = contract_metadata["contract_json"]["identifier"]
        contract_org_name \
            = contract_metadata["contract_json"]["organization"]["name"]

        # Compile METS
        mets = compile_mets.create_mets(workspace=self.sip_creation_path,
                                        mets_profile='tpas',
                                        contractid=contract_identifier,
                                        objid=preservation_id,
                                        organization_name=contract_org_name,
                                        packagingservice='Packaging Service')

        with self.output().open('wb') as outputfile:
            mets.write(outputfile,
                       pretty_print=True,
                       xml_declaration=True,
                       encoding='UTF-8')
def test_successful_directory_validation_fails(requests_mock):
    """Test validation of invalid directory tree.

    The root directory is missing the `directory_path` attribute.

    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(configuration.get('metax_url'),
                   configuration.get('metax_user'),
                   configuration.get('metax_password'),
                   verify=configuration.getboolean('metax_ssl_verification'))
    first_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/first_par',
        json={
            'identifier': 'first_par',
            'directory_path': '/second_par/first_par',
            'parent_directory': {
                'identifier': 'second_par'
            }
        },
        status_code=200)
    second_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/second_par',
        json={
            'identifier': 'second_par',
            'directory_path': '/second_par',
            'parent_directory': {
                'identifier': 'root'
            }
        },
        status_code=200)
    root_dir_adapter = requests_mock.get(tests.conftest.METAX_URL +
                                         '/directories/root',
                                         json={'identifier': 'root'},
                                         status_code=200)
    with pytest.raises(InvalidDatasetMetadataError) as exc_info:
        validator = DirectoryValidation(client)
        validator.is_valid_for_file(FILE_METADATA)

    assert str(exc_info.value).startswith(
        "Validation error in metadata of root: "
        "'directory_path' is a required property")

    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
    assert root_dir_adapter.call_count == 1
def test_successful_directory_validation(requests_mock):
    """Directory validation of valid directory tree.

    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(configuration.get('metax_url'),
                   configuration.get('metax_user'),
                   configuration.get('metax_password'),
                   verify=configuration.getboolean('metax_ssl_verification'))

    first_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/first_par',
        json={
            'identifier': 'first_par',
            'directory_path': '/second_par/first_par',
            'parent_directory': {
                'identifier': 'second_par'
            }
        },
        status_code=200)
    second_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/second_par',
        json={
            'identifier': 'second_par',
            'directory_path': '/second_par',
            'parent_directory': {
                'identifier': 'root'
            }
        },
        status_code=200)
    root_dir_adapter = requests_mock.get(tests.conftest.METAX_URL +
                                         '/directories/root',
                                         json={
                                             'identifier': 'root',
                                             'directory_path': '/'
                                         },
                                         status_code=200)
    try:
        validator = DirectoryValidation(client)
        validator.is_valid_for_file(FILE_METADATA)
    except InvalidDatasetMetadataError as exc:
        pytest.fail('test_successful_directory_validation fails: ' + str(exc))
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
    assert root_dir_adapter.call_count == 1
def test_verify_file_contained_by_dataset_missing_from_dataset(requests_mock):
    """Test is_consistent_for_file method.

    Check that ``DatasetConsistency::is_consistent_for_file()`` raises an
    exception with a descriptive error message when neither the dataset
    files nor the directories contain the file.

    :returns: ``None``
    """
    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(
        configuration.get('metax_url'),
        configuration.get('metax_user'),
        configuration.get('metax_password'),
        verify=configuration.getboolean('metax_ssl_verification')
    )
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': {
            'files': [],
            'directories': []
        }
    }

    file_metadata = {
        'identifier': 'file_identifier',
        'file_path': "/path/to/file",
        'parent_directory': {
            'identifier': 'parent_directory_identifier'
        }
    }
    requests_mock.get(
        tests.conftest.METAX_URL + '/directories/parent_directory_identifier',
        json={'identifier': 'parent_directory_identifier'},
        status_code=200
    )
    with pytest.raises(InvalidDatasetMetadataError) as exc_info:
        dirs = DatasetConsistency(client, dataset)
        dirs.is_consistent_for_file(file_metadata)

    assert str(exc_info.value) == ("File not found from dataset files nor "
                                   "directories: /path/to/file")
Example no. 23
class CreateTechnicalMetadata(WorkflowTask):
    """Create METS documents that contain technical metadata.

    The PREMIS object metadata is created for all dataset files and
    written to
    `<sip_creation_path>/<url_encoded_filepath>-PREMIS%3AOBJECT-amd.xml`.
    File properties are written to
    `<sip_creation_path>/<url_encoded_filepath>-scraper.json`.
    PREMIS event metadata and PREMIS agent metadata are written to
    `<sip_creation_path>/<premis_event_id>-PREMIS%3AEVENT-amd.xml` and
    `<sip_creation_path>/<premis_agent_id>-PREMIS%3AAGENT-amd.xml`.
    Import object PREMIS event metadata references are written to
    `<sip_creation_path>/import-object-md-references.jsonl`.

    The file format specific metadata is copied from Metax if it is
    available. It is written to
    `<sip_creation_path>/<url_encoded_filepath>-<metadata_type>-amd.xml`,
    where <metadata_type> is NISOIMG, ADDML, AudioMD, or VideoMD.
    File format specific metadata references are written to a JSON file
    depending on the file format. For example, references to NISOIMG
    metadata are written to `<sip_creation_path>/create-mix-md-references`.

    List of PREMIS event references is written to
    `<workspace>/create-technical-metadata.jsonl`

    The task requires workspace to be created, dataset metadata to be
    validated and dataset files to be downloaded.
    """

    success_message = 'Technical metadata for objects created'
    failure_message = 'Technical metadata for objects could not be created'

    def __init__(self, *args, **kwargs):
        """Initialize Task."""
        super(CreateTechnicalMetadata, self).__init__(*args, **kwargs)
        self.config_object = Configuration(self.config)
        self.metax_client = Metax(
            self.config_object.get('metax_url'),
            self.config_object.get('metax_user'),
            self.config_object.get('metax_password'),
            verify=self.config_object.getboolean('metax_ssl_verification'))

    def requires(self):
        """List the Tasks that this Task depends on.

        :returns: dictionary of required tasks
        """
        return {
            'workspace':
            CreateWorkspace(workspace=self.workspace,
                            dataset_id=self.dataset_id,
                            config=self.config),
            'validation':
            ValidateMetadata(workspace=self.workspace,
                             dataset_id=self.dataset_id,
                             config=self.config),
            'files':
            GetFiles(workspace=self.workspace,
                     dataset_id=self.dataset_id,
                     config=self.config)
        }

    def output(self):
        """Return output target of this Task.

        :returns: `<workspace>/create-technical-metadata.jsonl`
        :rtype: LocalTarget
        """
        return LocalTarget(
            os.path.join(self.workspace, 'create-technical-metadata.jsonl'))

    def run(self):
        """Create techincal metadta.

        Creates PREMIS technical metadata files and technical attribute
        files.

        :returns: ``None``
        """
        files = self.metax_client.get_dataset_files(self.dataset_id)

        # Create one timestamp for import_object events to avoid
        # creating new events each time import_object is iterated
        event_datetime = datetime.datetime.utcnow().isoformat()

        tmp = os.path.join(self.config_object.get('packaging_root'), 'tmp/')
        with TemporaryDirectory(prefix=tmp) as temporary_workspace:
            for file_ in files:

                filepath = os.path.join('dataset_files',
                                        file_['file_path'].strip('/'))

                # Create METS document that contains PREMIS metadata
                self.create_objects(file_, filepath, event_datetime,
                                    temporary_workspace)

                # Create METS documents that contain technical
                # attributes
                self.create_technical_attributes(file_, filepath,
                                                 temporary_workspace)

            # Move created files to sip creation directory. PREMIS event
            # reference file is moved to output target path after
            # everything else is done.
            with self.output().temporary_path() as target_path:
                shutil.move(
                    os.path.join(temporary_workspace,
                                 'premis-event-md-references.jsonl'),
                    target_path)
                for file_ in os.listdir(temporary_workspace):
                    shutil.move(os.path.join(temporary_workspace, file_),
                                self.sip_creation_path)

    def create_objects(self, metadata, filepath, event_datetime, output):
        """Create PREMIS metadata for file.

        Reads file metadata from Metax. Technical metadata is generated
        by siptools import_object script.

        :param metadata: file metadata dictionary
        :param filepath: file path in SIP
        :param event_datetime: the timestamp for the import_object
                               events
        :param output: output directory for import_object script
        :returns: ``None``
        """
        # Read character set if it is defined for this file
        try:
            charset = metadata["file_characteristics"]["encoding"]
        except KeyError:
            charset = None

        # Read format version if it is defined for this file
        try:
            formatversion = metadata["file_characteristics"]["format_version"]
        except KeyError:
            formatversion = ""

        digest_algorithm = metadata["checksum"]["algorithm"]

        # figure out the checksum algorithm
        if digest_algorithm in ["md5", "sha2"]:
            digest_algorithm = algorithm_name(digest_algorithm,
                                              metadata["checksum"]["value"])

        # Read file creation date if it is defined for this file
        try:
            date_created = metadata["file_characteristics"]["file_created"]
        except KeyError:
            date_created = None

        # Create PREMIS file metadata XML
        siptools.scripts.import_object.import_object(
            filepaths=[filepath],
            base_path=self.workspace,
            workspace=output,
            skip_wellformed_check=True,
            file_format=(metadata["file_characteristics"]["file_format"],
                         formatversion),
            checksum=(digest_algorithm, metadata["checksum"]["value"]),
            charset=charset,
            date_created=date_created,
            event_datetime=event_datetime,
            event_target='.')

    def create_technical_attributes(self, metadata, filepath, output):
        """Create technical metadata for a file

        Create METS TechMD files for each metadata type based on previously
        scraped file characteristics

        :param file_identifier: file identifier
        :param filepath: path of file in SIP
        :param output: Path to the temporary workspace
        :returns: ``None``
        """
        creator = siptools.mdcreator.MetsSectionCreator(output)
        metadata_generator = XMLMetadataGenerator(file_path=os.path.join(
            self.input()['files'].path, metadata['file_path'].strip('/')),
                                                  file_metadata=metadata)

        md_elems = metadata_generator.create()

        for md_elem in md_elems:
            # Retrieve the wrapped MD document
            md_namespace = md_elem.nsmap[md_elem.prefix]

            mdtype = TECH_ATTR_TYPES[md_namespace]["mdtype"]
            mdtypeversion = TECH_ATTR_TYPES[md_namespace]["mdtypeversion"]
            othermdtype = TECH_ATTR_TYPES[md_namespace].get(
                "othermdtype", None)
            ref_file = TECH_ATTR_TYPES[md_namespace]["ref_file"]

            # Create METS TechMD file
            techmd_id, _ = creator.write_md(metadata=md_elem,
                                            mdtype=mdtype,
                                            mdtypeversion=mdtypeversion,
                                            othermdtype=othermdtype)

            # Add reference from fileSec to TechMD
            creator.add_reference(techmd_id, filepath)
            creator.write(ref_file=ref_file)
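For context, a hedged sketch of what one TECH_ATTR_TYPES entry presumably looks like, inferred from the lookups above; the MIX entry shown is illustrative, not the actual table:

# Inferred shape only: metadata namespace URI -> METS techMD parameters.
TECH_ATTR_TYPES = {
    "http://www.loc.gov/mix/v20": {
        "mdtype": "NISOIMG",
        "mdtypeversion": "2.0",
        # "othermdtype" is optional; the .get() above tolerates its absence
        "ref_file": "create-mix-md-references.jsonl",
    },
}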
def test_dataset_directories_caching_works(requests_mock):
    """Test is_consistent_for_file method.

    Checks that caching of dataset directories in ``DatasetConsistency``
    works and no extra calls are made to Metax. In this test the dataset
    contains only one entry in dataset directories, which is the root
    directory of the dataset files:

    /root_dir/second_par_dir/first_par_dir/file1
    /root_dir/second_par_dir/first_par_dir/file2

    :returns: ``None``
    """
    file_metadata = {
        'file_path': "/path/to/file1",
        'parent_directory': {
            'identifier': 'first_par_dir'
        },
        "checksum": {
            "algorithm": "md5",
            "value": "foobar"
        },
        "file_characteristics": {
            "file_format": "text/csv"
        },
        "file_storage": {
            "identifier": "foobar",
            "id": 1
        }
    }
    file_1 = copy.deepcopy(file_metadata)
    file_1['identifier'] = 'file_identifier1'
    file_2 = copy.deepcopy(file_metadata)
    file_2['identifier'] = 'file_identifier2'

    # Init metax client
    configuration = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(
        configuration.get('metax_url'),
        configuration.get('metax_user'),
        configuration.get('metax_password'),
        verify=configuration.getboolean('metax_ssl_verification')
    )
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': {
            'files': [],
            'directories': [{'identifier': 'root_dir'}]
        }
    }

    first_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/first_par_dir',
        json={
            'identifier': 'first_par_dir',
            'parent_directory': {
                'identifier': 'second_par_dir'
            }
        },
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        tests.conftest.METAX_URL + '/directories/second_par_dir',
        json={
            'identifier': 'second_par_dir',
            'parent_directory': {
                'identifier': 'root_dir'
            }
        },
        status_code=200
    )
    try:
        dirs = DatasetConsistency(client, dataset)
        dirs.is_consistent_for_file(file_1)
        dirs.is_consistent_for_file(file_2)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            '_verify_file_contained_by_dataset raised exception: ' + str(exc)
        )
    # verify that dataset directory caching works. Metax is called only once
    # for the parent directories for the two files.
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1
Example no. 25
class MetaxClient(object):
    """Class for handling Metax metadata."""

    def __init__(self, url=None, user=None, password=None, verify=None):
        """Init MetaxClient instances."""
        # If any of the params is not provided read them from app.config
        if url is None or user is None or password is None:
            url = CONFIG.get("METAX_URL")
            user = CONFIG.get("METAX_USER")
            password = CONFIG.get("METAX_PASSWORD")

        if verify is None:
            verify = CONFIG.get("METAX_SSL_VERIFICATION", True)

        self.client = Metax(url, user, password, verify=verify)
        # dataset_id => preservation_state dict
        self.dataset_cache = {}

    def get_files_dict(self, project):
        """Return dict {fpath: id} of all the files of a given project.
        """
        return self.client.get_files_dict(project)

    def post_metadata(self, fpaths, root_upload_path, username, storage_id):
        """Generate file metadata and POST it to Metax in 5k chunks.

        :param fpaths: List of files for which to generate the metadata
        :param root_upload_path: root upload directory
        :param username: current user
        :param storage_id: pas storage identifier in Metax
        :returns: Stripped HTTP response returned by Metax.
                  Success list contains successfully generated file
                  metadata in format:
                  [
                      {
                          "object": {
                              "identifier": identifier,
                              "file_path": file_path,
                              "checksum": {"value": checksum},
                              "parent_directory": {
                                  "identifier": identifier
                              }
                          }
                      },
                      .
                      .
                      .
                  ]
        """
        database = db.Database()
        project = database.user(username).get_project()
        checksums = database.checksums.get_checksums()
        metadata = []
        responses = []

        i = 0
        for fpath in fpaths:
            metadata.append(_generate_metadata(
                fpath, root_upload_path,
                project, storage_id, checksums
            ))

            # POST metadata to Metax every 5k steps
            i += 1
            if i % 5000 == 0:
                response = self.client.post_file(metadata)
                responses.append(_strip_metax_response(response))
                # Add created identifiers to Mongo
                if "success" in response and response["success"]:
                    database.store_identifiers(
                        response["success"], root_upload_path, username
                    )

                metadata = []

        # POST remaining metadata
        if metadata:
            response = self.client.post_file(metadata)
            responses.append(_strip_metax_response(response))
            # Add created identifiers to Mongo
            if "success" in response and response["success"]:
                database.store_identifiers(
                    response["success"], root_upload_path, username
                )

        # Merge all responses into one response
        response = {"success": [], "failed": []}
        for metax_response in responses:
            if "success" in metax_response:
                response["success"].extend(metax_response["success"])
            if "failed" in metax_response:
                response["failed"].extend(metax_response["failed"])

        return response

    def delete_metadata(self, project, fpaths):
        """DELETE metadata from Metax.

        :param project: Project identifier
        :param fpaths: List of file_paths to remove
        :returns: HTTP response returned by Metax
        """
        files_dict = self.client.get_files_dict(project)

        # Retrieve "file -> dataset" association map
        file_ids = [
            file_["identifier"] for file_ in six.itervalues(files_dict)
        ]
        file2datasets = {}
        if file_ids:
            file2datasets = self.client.get_file2dataset_dict(file_ids)

        # Delete metadata if file exists in fpaths AND it doesn't have
        # any datasets
        file_ids_to_delete = []
        for metax_path, file_ in six.iteritems(files_dict):
            path_exists = metax_path in fpaths
            dataset_exists = file2datasets.get(file_["identifier"], None)

            if path_exists and not dataset_exists:
                file_ids_to_delete.append(file_["identifier"])

        if not file_ids_to_delete:
            return {"deleted_files_count": 0}

        return self.client.delete_files(file_ids_to_delete)

    def delete_file_metadata(self, project, fpath, root_upload_path=None,
                             force=False):
        """Delete file metadata from Metax if file is not associated
        with any dataset.

        If force parameter is True metadata is deleted if the file
        belongs to a dataset not accepted to preservation.
        """
        self.dataset_cache.clear()
        files_dict = self.client.get_files_dict(project)
        metax_path = get_metax_path(fpath, root_upload_path)

        if metax_path not in files_dict:
            raise MetaxClientError("Metadata not found in Metax")

        file_metadata = files_dict[metax_path]
        if file_metadata["storage_identifier"] != PAS_FILE_STORAGE_ID:
            raise MetaxClientError("Incorrect file storage")
        if not force and self.file_has_dataset(metax_path, files_dict):
            raise MetaxClientError("Metadata is part of a dataset")
        if self.file_has_accepted_dataset(metax_path, files_dict):
            raise MetaxClientError(
                "Metadata is part of an accepted dataset"
            )

        file_id = six.text_type(file_metadata["id"])
        return self.client.delete_file(file_id)

    def delete_all_metadata(self, project, fpath, root_upload_path,
                            force=False):
        """Delete all file metadata from Metax found under dir fpath,
        which is not associated with any dataset and is stored in PAS
        file storage.

        If force parameter is True metadata is deleted if file belongs
        to a dataset not accepted to preservation.
        """
        self.dataset_cache.clear()
        files_dict = self.client.get_files_dict(project)
        files_to_delete = {}

        # Iterate through all files under dir fpath
        for dirpath, _, files in os.walk(fpath):
            for _file in files:
                fpath = os.path.join(dirpath, _file)
                metax_path = get_metax_path(fpath, root_upload_path)
                if metax_path not in files_dict:
                    continue
                storage_id = files_dict[metax_path]["storage_identifier"]
                if storage_id != PAS_FILE_STORAGE_ID:
                    continue

                files_to_delete[metax_path] = files_dict[metax_path]

        if force:
            # Delete metadata for files which don't belong to accepted
            # datasets
            # FIXME: Deleting all file metadata when 'force' is in use
            # is inefficient at the moment due to each check requiring
            # an API call.
            file_ids_to_delete = [
                file_["identifier"] for metax_path, file_
                in six.iteritems(files_to_delete)
                if not self.file_has_accepted_dataset(metax_path, files_dict)
            ]
        else:
            # Delete metadata for files that don't belong to datasets
            file_ids = [
                file_["identifier"] for file_
                in six.itervalues(files_to_delete)
            ]
            # Retrieve related datasets in a single bulk operation
            file2datasets = {}
            if file_ids:
                file2datasets = self.client.get_file2dataset_dict(file_ids)

            file_ids_to_delete = [
                file_["identifier"] for metax_path, file_
                in six.iteritems(files_to_delete)
                if not file2datasets.get(file_["identifier"], None)
            ]

        if not file_ids_to_delete:
            return {"deleted_files_count": 0}

        return self.client.delete_files(file_ids_to_delete)

    def get_all_ids(self, project_list):
        """Get a set of all identifiers of files in any of the projects
        in project_list.
        """
        id_set = set()

        # Iterate all projects
        for project in project_list:
            # Find all identifiers in one project
            files_dict = self.get_files_dict(project)
            project_id_set = {
                _file["identifier"] for _file in files_dict.values()
            }

            # Add the identifiers to id_set
            id_set |= project_id_set

        return id_set

    def file_has_dataset(self, metax_path, files_dict):
        """Check if file belongs to any dataset."""
        if metax_path not in files_dict:
            return False

        file_id = files_dict[metax_path]["id"]
        datasets = self.client.get_file_datasets(file_id)
        return len(datasets) != 0

    def file_has_accepted_dataset(self, metax_path, files_dict):
        """Check if file belongs to dataset accepted to preservation."""
        if metax_path in files_dict:
            file_id = files_dict[metax_path]["id"]
            dataset_ids = self.client.get_file_datasets(file_id)
            for dataset_id in dataset_ids:
                if dataset_id not in self.dataset_cache:
                    dataset = self.client.get_dataset(dataset_id)
                    self.dataset_cache[dataset_id] = \
                        dataset['preservation_state']
                dataset_state = self.dataset_cache[dataset_id]
                if (DS_STATE_ACCEPTED_TO_DIGITAL_PRESERVATION <=
                        dataset_state <=
                        DS_STATE_IN_DIGITAL_PRESERVATION):
                    return True
        return False
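A minimal usage sketch of MetaxClient; the project name and paths are hypothetical:

# Credentials fall back to app CONFIG when not given explicitly.
client = MetaxClient()
files_dict = client.get_files_dict("example_project")

# Delete metadata for one file unless a dataset still references it.
client.delete_file_metadata("example_project",
                            "/upload/example_project/data.csv",
                            root_upload_path="/upload")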