def get_reference_lists(**attributes):
    """
    Fill the attributes with various lists.

    A list is read from the workspace only when its key is missing from
    the given attributes. Values supplied by the caller are kept as they
    are, and no reference file is read for them.

    :attributes: The following keys:
                 workspace: Workspace path. Needed only when at least
                            one of the lists below has to be created.
                 object_refs: XML tree of digital objects.
                              Will be created if missing.
                 filelist: ID list of objects. Will be created if missing.
                 all_amd_refs: All administrative metadata references.
                               Will be created if missing.
                 all_dmd_refs: All descriptive metadata references.
                               Will be created if missing.
    :returns: Attributes filled with the lists listed above.
    """
    # NOTE: dict.get(key, default) would evaluate the default eagerly,
    # i.e. read the reference files even when the key is present, so the
    # membership is tested explicitly instead.
    if "object_refs" not in attributes:
        attributes["object_refs"] = read_md_references(
            attributes["workspace"], "import-object-md-references.jsonl")

    # The file list is derived from object_refs, so it must come after it
    if "filelist" not in attributes:
        attributes["filelist"] = get_objectlist(attributes["object_refs"])

    if "all_amd_refs" not in attributes:
        attributes["all_amd_refs"] = read_all_amd_references(
            attributes["workspace"])

    if "all_dmd_refs" not in attributes:
        attributes["all_dmd_refs"] = read_md_references(
            attributes["workspace"],
            "import-description-md-references.jsonl")

    return attributes
Example #2
0
def test_paths(testpath, directory, base_path, run_cli):
    """Run import_description with different path arguments.

    The following cases are covered:
    (1) Path without base path
    (2) Path without base path, but with "./"
    (3) Path with base path
    (4) Path with base path and with "./"
    (5) Absolute base path
    (6) Cases (1)-(5) with and without ending "/"

    The normalized target must be found in the reference file and it
    must be an existing directory under the base path.
    """
    if "absolute" in base_path:
        base_path = os.path.join(os.getcwd(), "tests")

    # The --base_path option is passed only when a base path is defined
    arguments = ["--workspace", testpath]
    if base_path != "":
        arguments += ["--base_path", base_path]
    arguments += [
        "--dmdsec_target", directory, "--remove_root",
        "tests/data/import_description/metadata/dc_description.xml"
    ]
    run_cli(import_description.main, arguments)

    references = read_md_references(
        testpath, 'import-description-md-references.jsonl')
    normalized_target = os.path.normpath(directory)
    assert normalized_target in references

    target_on_disk = os.path.normpath(os.path.join(base_path, directory))
    assert os.path.isdir(target_on_disk)
Example #3
0
def test_create_mix_techmdfile(testpath):
    """Test MIX techMD creation with ``create_mix.MixCreator``.

    MIX metadata is added for three image files. Two of them share the
    same MIX metadata, so only two MIX techMD files are expected in the
    workspace, while the reference file should still contain one entry
    per image.
    """
    mix_creator = create_mix.MixCreator(testpath)

    os.makedirs(os.path.join(testpath, 'data'))
    sample_images = ['tiff1.tif', 'tiff2.tif', 'tiff1_compressed.tif']
    for image_name in sample_images:
        # Place the sample image into the data directory of the workspace
        # and register its metadata
        workspace_image = os.path.join(testpath, 'data/%s' % image_name)
        shutil.copy('tests/data/images/%s' % image_name, workspace_image)
        mix_creator.add_mix_md(workspace_image)

    # Write the collected metadata to the workspace
    mix_creator.write()

    # tiff1.tif and tiff2.tif share the same MIX metadata, so two files
    # with the "NISOIMG-amd.xml" ending are expected
    mix_file_count = sum(
        1 for name in os.listdir(testpath)
        if name.endswith('NISOIMG-amd.xml'))
    assert mix_file_count == 2

    # One reference per image file is expected in the reference file
    references = read_md_references(testpath,
                                    'create-mix-md-references.jsonl')
    assert len(references) == 3
Example #4
0
def test_compile_structmap_dmdsecid(testpath, run_cli):
    """Test that compile_structmap links the structMap to the dmdSec.

    An object and a descriptive metadata file are imported into the
    workspace before the structure map is compiled. The first metadata
    ID referenced for the root path "." must then appear as the DMDID of
    exactly one root div of the structMap.
    """
    # Import a digital object and descriptive metadata into the workspace
    import_object_arguments = [
        '--workspace', testpath, '--skip_wellformed_check',
        'tests/data/structured/Software files/koodi.java'
    ]
    run_cli(import_object.main, import_object_arguments)
    import_description.import_description(
        dmdsec_location='tests/data/import_description/metadata/'
        'dc_description.xml',
        workspace=testpath)

    # Compile the structure map
    run_cli(compile_structmap.main, ['--workspace', testpath])

    # Look up the dmdSec ID that was referenced for the root path
    description_refs = read_md_references(
        testpath, 'import-description-md-references.jsonl')
    dmdsecid = description_refs['.']['md_ids'][0]

    # The root div of the structMap should carry that ID
    structmap = lxml.etree.parse(os.path.join(testpath, 'structmap.xml'))
    root_div_query = (
        '/mets:mets/mets:structMap/mets:div[@DMDID="%s"]' % dmdsecid)
    matching_divs = structmap.xpath(root_div_query, namespaces=NAMESPACES)
    assert len(matching_divs) == 1
Example #5
0
def test_paths(testpath, file_, base_path, run_cli):
    """Run create_addml with different path arguments.

    The following cases are covered:
    (1) Path without base path
    (2) Path without base path, but with './'
    (3) Path with base path
    (4) Path with base path and with './'
    (5) Absolute base path

    The normalized file path must be found in the reference file and the
    file must exist under the base path.
    """
    if 'absolute' in base_path:
        base_path = os.path.join(os.getcwd(), 'tests')

    arguments = [
        '--delim', DELIMITER, '--charset', CHARSET,
        '--sep', RECORDSEPARATOR, '--quot', QUOTINGCHAR,
        '--workspace', testpath]
    # The --base_path option is passed only when a base path is defined
    if base_path != '':
        arguments.extend(['--base_path', base_path])
    arguments.append(file_)
    run_cli(create_addml.main, arguments)

    addml_references = read_md_references(
        testpath, 'create-addml-md-references.jsonl')
    assert os.path.normpath(file_) in addml_references

    file_on_disk = os.path.normpath(os.path.join(base_path, file_))
    assert os.path.isfile(file_on_disk)
Example #6
0
def test_paths(testpath, file_, base_path, run_cli):
    """Run premis_event with different path arguments.

    The following cases are covered:
    (1) Path without base path
    (2) Path without base path, but with "./"
    (3) Path with base path
    (4) Path with base path and with "./"
    (5) Absolute base path

    The normalized event target must be found in the reference file and
    the file must exist under the base path.
    """
    if "absolute" in base_path:
        base_path = os.path.join(os.getcwd(), "tests")

    # Select the command line arguments depending on whether a base path
    # is defined
    if not base_path:
        arguments = [
            "--workspace", testpath, "--event_target", file_,
            "--event_outcome_detail", "Test ok", "--event_detail", "foo",
            "--event_outcome", "success", "creation", "2020-02-02T20:20:20"
        ]
    else:
        arguments = [
            "--workspace", testpath, "--base_path", base_path,
            "--event_target", file_, "--event_detail", "foo",
            "--event_outcome", "success", "creation", "--event_outcome_detail",
            "Test ok", "2020-02-02T20:20:20"
        ]
    run_cli(premis_event.main, arguments)

    event_references = read_md_references(
        testpath, 'premis-event-md-references.jsonl')
    assert os.path.normpath(file_) in event_references

    file_on_disk = os.path.normpath(os.path.join(base_path, file_))
    assert os.path.isfile(file_on_disk)
Example #7
0
def get_md_file(path,
                input_target,
                ref_file='import-description-md-references.jsonl',
                output_suffix='-dmdsec.xml'):
    """Find an existing metadata file referenced for the given target.

    :path: Directory that contains the reference file and metadata files
    :input_target: Target whose references are looked up
    :ref_file: Name of the reference file
    :output_suffix: File name suffix of the metadata file
    :returns: Path of the first existing metadata file, or ``None`` if
              none of the referenced files exist
    """
    references = read_md_references(path, ref_file)
    md_ids = references[fsdecode_path(input_target)]['md_ids']

    # The file name is the metadata ID without its first character,
    # followed by the suffix
    candidates = (os.path.join(path, md_id[1:] + output_suffix)
                  for md_id in md_ids)
    return next(
        (candidate for candidate in candidates
         if os.path.exists(candidate)),
        None)
Example #8
0
def test_amd_links_root(testpath, run_cli):
    """Test that premis_event writes a reference for the root path.

    The script is run without an event target. The reference file must
    exist and contain the path "." with path type "directory".
    """
    references_name = 'premis-event-md-references.jsonl'
    arguments = [
        'creation', '2016-10-13T12:30:55', '--event_detail', 'Testing',
        '--event_outcome', 'success', '--event_outcome_detail', 'Test ok',
        '--workspace', testpath
    ]
    run_cli(premis_event.main, arguments)

    assert os.path.isfile(os.path.join(testpath, references_name))

    event_references = read_md_references(testpath, references_name)
    assert '.' in event_references
    assert event_references['.']['path_type'] == 'directory'
Example #9
0
def copy_objects(workspace, data_dir):
    """
    Copy digital objects to workspace.

    The objects to copy are listed from the
    "import-object-md-references.jsonl" reference file in the workspace.
    Each object keeps its relative path in the workspace.

    :workspace: Workspace path
    :data_dir: Path to digital objects
    """
    object_refs = read_md_references(
        workspace, "import-object-md-references.jsonl")

    for relative_path in get_objectlist(object_refs):
        destination = os.path.join(workspace, relative_path)

        # Create the parent directory of the object when it is missing
        parent_directory = os.path.dirname(destination)
        if not os.path.exists(parent_directory):
            os.makedirs(parent_directory)

        copyfile(os.path.join(data_dir, relative_path), destination)
Example #10
0
def test_amd_links_dir(testpath, run_cli):
    """Test that premis_event writes a reference for a directory target.

    The script is run with a directory as the event target. The
    reference file must exist and contain the target path.
    """
    references_name = 'premis-event-md-references.jsonl'
    target = 'tests/data'
    arguments = [
        'creation', '2016-10-13T12:30:55', '--event_detail', 'Testing',
        '--event_outcome', 'success', '--event_outcome_detail', 'Test ok',
        '--workspace', testpath, '--event_target', target
    ]
    run_cli(premis_event.main, arguments)

    assert os.path.isfile(os.path.join(testpath, references_name))

    event_references = read_md_references(testpath, references_name)
    assert target in event_references
Example #11
0
def test_compile_structmap_ok(testpath, run_cli):
    """Test the compile_structmap script.

    Checks that the file section refers to the koodi.java sample file,
    that the structure map has a div of type "Software files", and that
    a PREMIS event of type "creation" describing the structure map
    creation is referenced for the root path ".".
    """
    create_test_data(testpath, run_cli)
    run_cli(compile_structmap.main, ['--workspace', testpath])

    output_structmap = os.path.join(testpath, 'structmap.xml')
    sm_root = lxml.etree.parse(output_structmap).getroot()

    output_filesec = os.path.join(testpath, 'filesec.xml')
    fs_root = lxml.etree.parse(output_filesec).getroot()

    assert len(
        fs_root.xpath(
            '/mets:mets/mets:fileSec/mets:fileGrp/mets:file/'
            'mets:FLocat[@xlink:href="file://tests/data/structured/'
            'Software+files/koodi.java"]',
            namespaces=NAMESPACES)) == 1

    assert len(
        sm_root.xpath('//mets:div[@TYPE="Software files"]',
                      namespaces=NAMESPACES)) == 1

    # Assert that an event has been created
    references = read_md_references(testpath,
                                    'premis-event-md-references.jsonl')

    # Start from None so that a missing event fails with a clear
    # assertion instead of an UnboundLocalError
    found_root = None
    for amdref in references['.']['md_ids']:
        # The path already includes the workspace, so it must not be
        # joined with the workspace a second time
        event_path = os.path.join(testpath,
                                  amdref[1:] + '-PREMIS%3AEVENT-amd.xml')
        if not os.path.exists(event_path):
            continue
        event_root = lxml.etree.parse(event_path).getroot()
        if premis.parse_event_type(event_root) == 'creation':
            found_root = event_root

    assert found_root is not None, \
        'No PREMIS event of type "creation" found in workspace'
    assert found_root.xpath('./*/*/*/*/*')[0].tag == ('{info:lc/xmlns/'
                                                      'premis-v2}event')
    assert found_root.xpath(
        '//premis:eventDetail',
        namespaces=NAMESPACES)[0].text == ('Creation of structural metadata '
                                           'with the compile-structmap script')
    assert found_root.xpath(
        '//premis:eventOutcomeDetailNote',
        namespaces=NAMESPACES)[0].text == ('Created METS structural map of '
                                           'type directory')
    def get_provenance_ids(self):
        """List identifiers of provenance events.

        Fetches the dataset metadata from Metax and reads the event
        references from the reference file produced by the
        ``create_provenance_information`` input. The type of each event
        is parsed from the corresponding PREMIS event file in the SIP
        creation directory; referenced events without a file are
        skipped. The provenance events of the dataset are then mapped to
        event IDs by their localized event type.

        :returns: list of provenance IDs
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        metadata = metax_client.get_dataset(self.dataset_id)
        languages = get_dataset_languages(metadata)

        # The Luigi task input path already contains the workspace path,
        # so only the file name is used here.
        reference_file = os.path.basename(
            self.input()['create_provenance_information'].path)
        event_ids = get_md_references(
            read_md_references(self.workspace, reference_file))

        # Map each event type to the ID of the event of that type
        ids_by_event_type = {}
        for event_id in event_ids:
            event_file_path = os.path.join(
                self.sip_creation_path,
                event_id[1:] + "-PREMIS%3AEVENT-amd.xml")
            if os.path.exists(event_file_path):
                event_xml = ET.parse(encode_path(event_file_path)).getroot()
                type_elements = event_xml.xpath("//premis:eventType",
                                                namespaces=NAMESPACES)
                ids_by_event_type[type_elements[0].text] = event_id

        # Pick the event ID for each provenance event of the dataset
        return [
            ids_by_event_type[get_localized_value(
                provenance["preservation_event"]["pref_label"],
                languages=languages)]
            for provenance in metadata["research_dataset"]["provenance"]
        ]
Example #13
0
def test_create_techmd_without_charset(testpath, requests_mock):
    """Test techmd creation for files without defined charset.

    The encoding is removed from the file characteristics of the mocked
    text file before the task is run. The format name in the created
    PREMIS object is expected to be 'text/plain; charset=UTF-8'.

    :param requests_mock: Mocker object
    :param testpath: Temporary directory fixture
    :returns: ``None``
    """
    # Mock a dataset with a text file that has no encoding defined
    file_metadata = copy.deepcopy(tests.metax_data.files.TXT_FILE)
    del file_metadata['file_characteristics']['encoding']
    tests.utils.add_metax_dataset(requests_mock, files=[file_metadata])

    # Create workspace that contains a textfile
    workspace = os.path.join(testpath, 'workspaces', 'workspace')
    sipdirectory = os.path.join(workspace, 'sip-in-progress')
    os.makedirs(sipdirectory)
    text_file_path = os.path.join(
        os.path.join(workspace, 'dataset_files'), 'path', 'to', 'file')
    os.makedirs(os.path.dirname(text_file_path))
    with open(text_file_path, 'w') as text_file:
        text_file.write('foo')

    # Init and run task
    CreateTechnicalMetadata(
        workspace=workspace,
        dataset_id='dataset_identifier',
        config=tests.conftest.UNIT_TEST_CONFIG_FILE
    ).run()

    # Metadata reference file and premis object XML file should be
    # created in SIP directory
    object_refs = read_md_references(sipdirectory,
                                     'import-object-md-references.jsonl')
    assert len(object_refs) == 1
    object_id = object_refs['dataset_files/path/to/file']['md_ids'][0][1:]
    object_file = '{}-PREMIS%3AOBJECT-amd.xml'.format(object_id)
    premis_object_xml = lxml.etree.parse(
        os.path.join(sipdirectory, object_file))

    # If charset is not defined the siptools.import_objects default
    # value is used. Siptools recognizes ASCII text files as UTF-8 text
    # files.
    format_names = premis_object_xml.xpath("//premis:formatName",
                                           namespaces=NAMESPACES)
    assert format_names[0].text == 'text/plain; charset=UTF-8'
def test_main_utf8_files(testpath, run_cli):
    """Test ``create_audiomd.main`` with a filename that contains
    non-ascii characters.

    The filename is passed to the script encoded with the filesystem
    encoding, and it must be found in the reference file afterwards.
    """
    # Copy a sample audio file into the data directory using a filename
    # with non-ascii characters
    data_directory = os.path.join(testpath, 'data')
    os.makedirs(data_directory)
    relative_path = os.path.join('data', 'äöå.wav')
    shutil.copy('tests/data/audio/valid__wav.wav',
                os.path.join(testpath, relative_path))

    # Call main function with encoded filename as parameter
    encoded_path = relative_path.encode(sys.getfilesystemencoding())
    arguments = [
        '--workspace', testpath, '--base_path', testpath, encoded_path
    ]
    run_cli(create_audiomd.main, arguments)

    # Check that filename is found in amd-reference file.
    audiomd_references = read_md_references(
        testpath, 'create-audiomd-md-references.jsonl')
    assert audiomd_references["data/äöå.wav"]
def get_amd_file(path,
                 input_file,
                 stream=None,
                 ref_file='import-object-md-references.jsonl',
                 suffix='-PREMIS%3AOBJECT-amd.xml'):
    """List existing metadata files referenced for the given file.

    :path: Directory that contains the reference file and metadata files
    :input_file: File whose references are looked up
    :stream: Key of a stream of the file. If it is not given (or is
             falsy), the references of the file itself are used.
    :ref_file: Name of the reference file
    :suffix: File name suffix of the metadata files
    :returns: List of paths of the referenced metadata files that exist
    """
    references = read_md_references(path, ref_file)
    file_reference = references[fsdecode_path(input_file)]

    if stream:
        md_ids = file_reference['streams'][stream]
    else:
        md_ids = file_reference['md_ids']

    # The file name is the metadata ID without its first character,
    # followed by the suffix. Only existing files are returned.
    candidates = (os.path.join(path, md_id[1:] + suffix) for md_id in md_ids)
    return [candidate for candidate in candidates
            if os.path.exists(candidate)]
Example #16
0
def test_paths(testpath, file_, base_path, run_cli):
    """Run create_mix with different path arguments.

    The following cases are covered:
    (1) Path without base path
    (2) Path without base path, but with './'
    (3) Path with base path
    (4) Path with base path and with './'
    (5) Absolute base path

    The normalized file path must have a non-empty entry in the
    reference file and the file must exist under the base path.
    """
    if 'absolute' in base_path:
        base_path = os.path.join(os.getcwd(), 'tests')

    # The --base_path option is passed only when a base path is defined
    arguments = ['--workspace', testpath]
    if base_path != '':
        arguments += ['--base_path', base_path]
    arguments.append(file_)
    run_cli(create_mix.main, arguments)

    mix_references = read_md_references(testpath,
                                        'create-mix-md-references.jsonl')
    assert mix_references[os.path.normpath(file_)]

    file_on_disk = os.path.normpath(os.path.join(base_path, file_))
    assert os.path.isfile(file_on_disk)
def test_add_fptrs_div_ead(testpath, run_cli, hrefs, length, child_elem,
                           order):
    """Tests the add_fptrs_div_ead function by asserting that the c_div
    element has been modified with fptrs and divs correctly according to
    the test cases.

    :testpath: Workspace directory of the test
    :run_cli: Fixture passed on to ``create_test_data``
    :hrefs: Hrefs given to ``add_fptrs_div_ead``
    :length: Expected number of child elements and fptr elements
    :child_elem: Expected local tag name of the first child element
    :order: Passed to ``create_test_data``; when truthy, an ORDER
            attribute is expected in the result
    """
    create_test_data(testpath, run_cli, order=order)
    div_elem = '<mets:div xmlns:mets="http://www.loc.gov/METS/"></mets:div>'

    xml = ET.fromstring(div_elem)

    # The same references are used both as the administrative metadata
    # references and as the object references
    amd_refs = read_md_references(
        testpath, "import-object-md-references.jsonl")
    attrs = {
        "all_amd_refs": amd_refs,
        "object_refs": amd_refs,
        "workspace": testpath,
        "filelist": [
            'tests/data/structured/Publication files/publication.txt',
            'tests/data/structured/Software files/koodi.java'
        ],
    }
    filegrp = mets.filegrp()
    c_div = compile_structmap.add_fptrs_div_ead(xml, hrefs, filegrp, attrs)

    children = c_div.xpath('./*')

    # Child elements are either new divs or fptrs
    assert children[0].tag == '{http://www.loc.gov/METS/}%s' % child_elem

    # Number of child elements should equal the number of valid hrefs
    assert len(children) == length

    # Number of fptr elements should equal the number of valid hrefs
    assert len(c_div.findall('.//{http://www.loc.gov/METS/}fptr')) == length

    # If file properties exist, it is written to the divs
    if order and length == 1:
        assert 'ORDER' in c_div.attrib
    elif order:
        assert 'ORDER' in children[0].attrib
        assert children[0].get('TYPE') == 'dao'
    else:
        assert 'ORDER' not in c_div.attrib
def test_import_object_multiple(testpath, run_cli):
    """Test that import_object works for multiple files when the given
    path is a directory.

    The number of entries in the reference file and the number of
    created PREMIS object metadata files must both equal the expected
    number of imported files.
    """
    expected_files = 9

    run_cli(import_object.main, [
        '--workspace', testpath, '--skip_wellformed_check',
        'tests/data/structured'
    ])

    # One reference entry per imported file
    object_refs = read_md_references(testpath,
                                     'import-object-md-references.jsonl')
    assert len(object_refs) == expected_files

    # One PREMIS object metadata file per imported file
    premis_object_files = [
        name for name in os.listdir(testpath)
        if name.endswith('-PREMIS%3AOBJECT-amd.xml')
    ]
    assert len(premis_object_files) == expected_files
Example #19
0
def test_add_mdreference(testpath, references, expected):
    """Test add_reference and write_references functions.

    The add_reference function is called for each item in references,
    the references are written to a file, and the file content is
    compared with the expected output.

    :testpath: Workspace directory of the test
    :references: Items whose first three elements are used as the
                 metadata ID, the file path and the stream
    :expected: Expected references keyed by path, each with 'md_ids' and
               'streams'
    """
    section_creator = MetsSectionCreator(testpath)

    for item in references:
        section_creator.add_reference(md_id=item[0],
                                      filepath=item[1],
                                      stream=item[2])

    section_creator.write_references('md-references.json')

    written = read_md_references(testpath, 'md-references.json')

    assert len(written) == len(expected)

    for path in expected:
        assert path in written
        written_entry = written[path]
        expected_entry = expected[path]

        # Same amount of metadata IDs and streams
        assert len(written_entry['md_ids']) == len(expected_entry['md_ids'])
        assert len(written_entry['streams']) == len(
            expected_entry['streams'])

        # Every expected metadata ID has been written
        for md_id in expected_entry['md_ids']:
            assert md_id in written_entry['md_ids']

        # Every expected stream has been written with its IDs
        for stream in expected_entry['streams']:
            assert stream in written_entry['streams']
            assert len(written_entry['streams'][stream]) == len(
                expected_entry['streams'][stream])
            for stream_id in expected_entry['streams'][stream]:
                assert stream_id in written_entry['streams'][stream]
def test_create_structmap_ok(testpath):
    """Test the workflow task CreateStructMap.

    Three files are created in a data directory and its subdirectory
    inside the SIP creation directory. After the task has been run, the
    content of filesec.xml and structmap.xml, and the list of files
    created in the SIP creation directory, are validated.

    :param testpath: Temporary directory fixture
    :returns: ``None``
    """
    # Create clean workspace directory for dataset that contains many
    # files in directories and subdirectories in sip creation directory
    workspace = os.path.join(testpath, 'workspaces', 'workspace')
    sip_creation_path = os.path.join(workspace, "sip-in-progress")
    data_directory_path = os.path.join(sip_creation_path, 'data')
    subdirectory_path = os.path.join(data_directory_path, 'subdirectory')
    os.makedirs(subdirectory_path)
    sample_files = (
        (data_directory_path, 'file1', 'foo'),
        (data_directory_path, 'file2', 'bar'),
        (subdirectory_path, 'file3', 'baz'),
    )
    for directory, name, content in sample_files:
        with open(os.path.join(directory, name), 'w') as sample_file:
            sample_file.write(content)

    # Create required metadata in workspace directory
    _create_metadata(workspace, 'data')

    # Init and run CreateStructMap task
    sip_content_before_run = os.listdir(sip_creation_path)
    task = CreateStructMap(workspace=workspace,
                           dataset_id='create_structmap_test_dataset',
                           config=tests.conftest.UNIT_TEST_CONFIG_FILE)
    task.run()
    assert task.complete()

    # Validate logical filesec XML-file
    filesec_xml = lxml.etree.parse(
        os.path.join(sip_creation_path, 'filesec.xml'))
    hrefs = filesec_xml.xpath(
        '/mets:mets/mets:fileSec/mets:fileGrp/mets:file/mets:FLocat/'
        '@xlink:href',
        namespaces=NAMESPACES)
    assert len(hrefs) == 3
    assert set(hrefs) == {
        'file://data/file1', 'file://data/file2',
        'file://data/subdirectory/file3'
    }

    # Validate directory structure in structmap XML-file.
    structmap_xml = lxml.etree.parse(
        os.path.join(sip_creation_path, 'structmap.xml'))
    root_div = "/mets:mets/mets:structMap/mets:div"
    expected_div_types = (
        (root_div + "/@TYPE", 'directory'),
        (root_div + "/mets:div/@TYPE", 'data'),
        (root_div + "/mets:div/mets:div/@TYPE", 'subdirectory'),
    )
    for type_query, expected_type in expected_div_types:
        found_types = structmap_xml.xpath(type_query, namespaces=NAMESPACES)
        assert found_types[0] == expected_type

    # Two files should be found in data directory
    data_file_ids = structmap_xml.xpath(
        '/mets:mets/mets:structMap/mets:div/mets:div/mets:fptr/@FILEID',
        namespaces=NAMESPACES)
    assert len(data_file_ids) == 2

    # One file should be found in subdirectory of data directory
    subdirectory_file_ids = structmap_xml.xpath(
        '/mets:mets/mets:structMap/mets:div/mets:div/mets:div'
        '/mets:fptr/@FILEID',
        namespaces=NAMESPACES)
    assert len(subdirectory_file_ids) == 1

    # Structure map should be linked to descriptive metadata creation
    # event
    description_refs = read_md_references(
        workspace, 'create-descriptive-metadata.jsonl')
    creation_event_id = description_refs['.']['md_ids'][0]
    root_admids = structmap_xml.xpath(
        "/mets:mets/mets:structMap/mets:div/@ADMID", namespaces=NAMESPACES)
    assert creation_event_id in root_admids[0]

    # Only premis-event-md-references.jsonl, filesec.xml and
    # structmap.xml should be created into SIP directory
    created_by_task = ['filesec.xml',
                       'structmap.xml',
                       'premis-event-md-references.jsonl']
    assert set(os.listdir(sip_creation_path)) \
        == set(sip_content_before_run + created_by_task)
def test_compile_structmap_ok(testpath, run_cli):
    """Tests the successful compilation of mets:structmap
    by using ead3 metadata as basis. Test that a leading slash
    in the ead3 metadata is removed since only relative paths
    are allowed.

    Also checks that a PREMIS event of type "creation" describing the
    structure map creation is referenced for the root path ".".
    """
    create_test_data(testpath, run_cli)
    arguments = [
        '--structmap_type', 'EAD3-logical', '--dmdsec_loc',
        'tests/data/import_description/metadata/ead3_test.xml', '--workspace',
        testpath
    ]
    run_cli(compile_structmap.main, arguments)

    output_structmap = os.path.join(testpath, 'structmap.xml')
    sm_root = ET.parse(output_structmap).getroot()

    output_filesec = os.path.join(testpath, 'filesec.xml')
    fs_root = ET.parse(output_filesec).getroot()

    # File section: two entries in the file group, one for each file
    assert len(
        fs_root.xpath(('/mets:mets/mets:fileSec/mets:fileGrp/*'),
                      namespaces=NAMESPACES)) == 2
    assert len(
        fs_root.xpath(
            ('/mets:mets/mets:fileSec/mets:fileGrp/mets:file/mets:FLocat'
             '[@xlink:href="file://tests/data/structured/Software+'
             'files/koodi.java"]'),
            namespaces=NAMESPACES)) == 1
    assert len(
        fs_root.xpath(
            ('/mets:mets/mets:fileSec/mets:fileGrp/mets:file/mets:FLocat'
             '[@xlink:href="file://tests/data/structured/Publication+'
             'files/publication.txt"]'),
            namespaces=NAMESPACES)) == 1

    # Structure map: nested divs labeled by level
    assert len(
        sm_root.xpath('//mets:div/mets:div[@LABEL="fonds"]',
                      namespaces=NAMESPACES)) == 1
    assert len(
        sm_root.xpath('//mets:div/mets:div/mets:div[@LABEL="subseries"]',
                      namespaces=NAMESPACES)) == 1
    assert len(
        sm_root.xpath('//mets:div/mets:div/mets:div/mets:div[@LABEL="item"]',
                      namespaces=NAMESPACES)) == 1
    assert len(
        sm_root.xpath(
            '//mets:div/mets:div/mets:div/mets:div/mets:div[@LABEL="file"]',
            namespaces=NAMESPACES)) == 2

    # The file divs contain fptr elements and are ordered
    assert sm_root.xpath(
        '//mets:div[@LABEL="file"]/*',
        namespaces=NAMESPACES)[0].tag == '{http://www.loc.gov/METS/}fptr'
    assert 'FILEID' in sm_root.xpath('//mets:div[@LABEL="file"]/*',
                                     namespaces=NAMESPACES)[0].attrib
    assert sm_root.xpath('//mets:div[@LABEL="file"]',
                         namespaces=NAMESPACES)[0].get('ORDER') == '1'
    assert sm_root.xpath('//mets:div[@LABEL="file"]',
                         namespaces=NAMESPACES)[1].get('ORDER') == '2'

    # Assert that an event has been created
    references = read_md_references(testpath,
                                    'premis-event-md-references.jsonl')

    # Start from None so that a missing event fails with a clear
    # assertion instead of an UnboundLocalError
    found_root = None
    for amdref in references['.']['md_ids']:
        # The path already includes the workspace, so it must not be
        # joined with the workspace a second time
        event_path = os.path.join(testpath,
                                  amdref[1:] + '-PREMIS%3AEVENT-amd.xml')
        if not os.path.exists(event_path):
            continue
        event_root = ET.parse(event_path).getroot()
        if premis.parse_event_type(event_root) == 'creation':
            found_root = event_root

    assert found_root is not None, \
        'No PREMIS event of type "creation" found in workspace'
    assert found_root.xpath('./*/*/*/*/*')[0].tag == ('{info:lc/xmlns/'
                                                      'premis-v2}event')
    assert found_root.xpath(
        '//premis:eventDetail',
        namespaces=NAMESPACES)[0].text == ('Creation of structural metadata '
                                           'with the compile-structmap script')
    assert found_root.xpath(
        '//premis:eventOutcomeDetailNote',
        namespaces=NAMESPACES)[0].text == ('Created METS structural map of '
                                           'type EAD3-logical')
Example #22
0
    def run(self):
        """Create structural map.

        The metadata IDs referenced for the root path "." by the
        provenance, descriptive metadata and technical metadata inputs
        are first merged into `premis-event-md-references.jsonl` in the
        SIP creation directory. Then a METS fileSec element is created
        and written to the first output, and a structure map of type
        `Fairdata-physical` is created from it and written to the second
        output.

        :returns: ``None``
        """
        # Merge premis event reference files
        merged_md_ids = []
        input_targets = ('create_provenance_information',
                         'create_descriptive_metadata',
                         'create_technical_metadata')
        for input_target in input_targets:
            target_refs = read_md_references(
                self.workspace, self.input()[input_target].path)
            merged_md_ids += target_refs['.']['md_ids']

        merged_references = {
            ".": {
                "path_type": "directory",
                "streams": {},
                "md_ids": merged_md_ids
            }
        }
        merged_references_path = os.path.join(
            self.sip_creation_path, 'premis-event-md-references.jsonl')
        with open(merged_references_path, 'w') as references:
            references.write(json.dumps(merged_references))

        # Setup required reference list and supplementary files information.
        (all_amd_refs, all_dmd_refs, object_refs, filelist,
         file_properties) = get_reference_lists(
             workspace=self.sip_creation_path)
        (supplementary_files, supplementary_types) = iter_supplementary(
            file_properties=file_properties)

        # Keyword arguments shared by the fileSec and structMap creation
        shared_arguments = {
            'all_amd_refs': all_amd_refs,
            'file_properties': file_properties,
            'supplementary_files': supplementary_files,
            'supplementary_types': supplementary_types,
        }

        # Create fileSec
        (filesec, file_ids) = compile_structmap.create_filesec(
            object_refs=object_refs, **shared_arguments)
        with self.output()[0].open('wb') as filesecxml:
            filesecxml.write(serialize(filesec))

        # Create physical structmap
        structmap = compile_structmap.create_structmap(
            filesec=filesec,
            structmap_type='Fairdata-physical',
            file_ids=file_ids,
            all_dmd_refs=all_dmd_refs,
            filelist=filelist,
            workspace=self.sip_creation_path,
            **shared_arguments)
        with self.output()[1].open('wb') as structmapxml:
            structmap.write(structmapxml,
                            pretty_print=True,
                            xml_declaration=True,
                            encoding='UTF-8')
Example #23
0
def test_create_techmd_ok(testpath, requests_mock):
    """Run CreateTechnicalMetadata for a one-file dataset and verify output.

    A single TIFF file is placed into the workspace, the task is executed,
    and the PREMIS object/event/agent files, the scraper JSON, the MIX
    metadata and the final content of the SIP directory are checked.

    :param testpath: Temporary directory fixture
    :param requests_mock: Mocker object
    :returns: ``None``
    """
    # Key under which the sample file appears in the reference files
    file_key = 'dataset_files/path/to/file'

    # Metax is mocked to serve a dataset that consists of one TIFF file
    tests.utils.add_metax_dataset(
        requests_mock, files=[tests.metax_data.files.TIFF_FILE]
    )

    # Prepare a workspace where the dataset file is already in place
    workspace = os.path.join(testpath, 'workspaces', 'workspace')
    sipdirectory = os.path.join(workspace, 'sip-in-progress')
    os.makedirs(sipdirectory)
    tiff_path = os.path.join(workspace, 'dataset_files', 'path/to/file')
    os.makedirs(os.path.dirname(tiff_path))
    shutil.copy('tests/data/sample_files/valid_tiff.tiff', tiff_path)

    # The task must not be complete before it has been run
    task = CreateTechnicalMetadata(
        workspace=workspace,
        dataset_id='dataset_identifier',
        config=tests.conftest.UNIT_TEST_CONFIG_FILE,
    )
    assert not task.complete()

    # ... and must be complete afterwards
    task.run()
    assert task.complete()

    # PREMIS object references should be written to file: exactly one
    # entry, holding exactly one metadata ID
    object_refs = read_md_references(sipdirectory,
                                     'import-object-md-references.jsonl')
    assert len(object_refs) == 1
    object_md_ids = object_refs[file_key]['md_ids']
    assert len(object_md_ids) == 1

    # The metadata ID without its first character is used as the prefix
    # of the PREMIS object file name
    premis_object_identifier = object_md_ids[0][1:]
    premis_object_file \
        = '{}-PREMIS%3AOBJECT-amd.xml'.format(premis_object_identifier)
    premis_object_xml = lxml.etree.parse(
        os.path.join(sipdirectory, premis_object_file))

    def find(expression):
        """Evaluate XPath expression against the PREMIS object file."""
        return premis_object_xml.xpath(expression, namespaces=NAMESPACES)

    # Check that the PREMIS object file has the desired properties
    assert len(find('//mets:amdSec')) == 1
    assert len(find("//premis:object")) == 1
    assert find("//premis:object/@*")[0] == 'premis:file'
    assert find("//premis:formatName")[0].text == 'image/tiff'
    assert find("//premis:formatVersion")[0].text == '6.0'

    # The file properties of the PREMIS object should be written to a
    # JSON file
    file_properties_file = '{}-scraper.json'.format(premis_object_identifier)
    with open(os.path.join(sipdirectory, file_properties_file)) as handle:
        file_properties = json.load(handle)
    first_stream = file_properties['0']
    assert first_stream['mimetype'] == 'image/tiff'
    assert first_stream['version'] == '6.0'

    # Exactly one PREMIS event file should be created
    event_suffix = '-PREMIS%3AEVENT-amd.xml'
    premis_event_files = []
    for name in os.listdir(sipdirectory):
        if name.endswith(event_suffix):
            premis_event_files.append(name)
    assert len(premis_event_files) == 1
    premis_event_id = premis_event_files[0].rsplit(event_suffix)[0]

    # Every referenced metadata ID other than the event ID is expected
    # to have a PREMIS agent file
    premis_references = read_md_references(workspace,
                                           'create-technical-metadata.jsonl')
    premis_agent_files = []
    for md_id in premis_references['.']['md_ids']:
        bare_id = md_id[1:]
        if bare_id != premis_event_id:
            premis_agent_files.append(
                '{}-PREMIS%3AAGENT-amd.xml'.format(bare_id))
    assert len(premis_agent_files) == 8
    for agent_file in premis_agent_files:
        assert os.path.isfile(os.path.join(sipdirectory, agent_file))

    # MIX references should be written to file
    mix_references \
        = read_md_references(sipdirectory, 'create-mix-md-references.jsonl')
    assert len(mix_references) == 1
    mix_md_ids = mix_references[file_key]["md_ids"]
    assert len(mix_md_ids) == 1
    assert mix_md_ids[0] == '_dd0f489d6e47cc2dca598beb608cc78d'

    # Compare the MIX metadata in the techMD file to the original MIX
    # metadata in Metax
    mix_techmd_file = 'dd0f489d6e47cc2dca598beb608cc78d-NISOIMG-amd.xml'
    mets = lxml.etree.parse(os.path.join(sipdirectory, mix_techmd_file))
    mdwrap = mets.xpath('/mets:mets/mets:amdSec/mets:techMD/mets:mdWrap',
                        namespaces=NAMESPACES)[0]
    mix = mdwrap.xpath('mets:xmlData/mix:mix', namespaces=NAMESPACES)[0]

    with open("tests/data/mix_sample_tiff.xml", "rb") as mix_file:
        sample_document = lxml.etree.fromstring(mix_file.read())
    original_mix = sample_document.xpath(
        "/mets:mets/mets:amdSec/mets:techMD/mets:mdWrap/mets:xmlData/*",
        namespaces=NAMESPACES)[0]
    assert xml2simpledict(mix) == xml2simpledict(original_mix)

    # The SIP directory should contain all technical metadata and
    # related files, and nothing else
    found_files = set(os.listdir(sipdirectory))
    expected_files = {
        'import-object-md-references.jsonl',
        premis_object_file,
        file_properties_file,
        'create-mix-md-references.jsonl',
        mix_techmd_file,
        'import-object-extraction-AGENTS-amd.json',
    }
    expected_files |= set(premis_agent_files)
    expected_files |= set(premis_event_files)
    assert found_files == expected_files