Example #1
0
def test_index_aipfile_fileuuid(
    dummy_try_to_index,
    dummy_wait_for_cluster_yellow_status,
    dummy_get_dashboard_uuid,
    metsfile,
    fileuuid_dict,
    aipuuid,
    aipname,
):
    """Check AIP file uuids are being correctly parsed from METS files.

    Mock _try_to_index() with a function that populates a dict
    indexed_data, with the fileuuids that _index_aip_files() obtained
    from the METS
    """

    dummy_get_dashboard_uuid.return_value = "test-uuid"

    indexed_data = {}

    def get_fileuuid(client, indexData, index, printfn):
        indexed_data[indexData["filePath"]] = indexData["FILEUUID"]

    dummy_try_to_index.side_effect = get_fileuuid

    elasticSearchFunctions._index_aip_files(
        client=None,
        uuid=aipuuid,
        mets_path=os.path.join(THIS_DIR, "fixtures", metsfile),
        name="{}-{}".format(aipname, aipuuid),
        identifiers=[],
    )

    for file_uuid in fileuuid_dict:
        assert indexed_data[file_uuid["filePath"]] == file_uuid["FILEUUID"]
def test_index_aipfile_fileuuid(
    dummy_helpers_bulk,
    dummy_get_dashboard_uuid,
    metsfile,
    fileuuid_dict,
    aipuuid,
    aipname,
):
    """Check AIP file uuids are being correctly parsed from METS files.

    Mock _try_to_index() with a function that populates a dict
    indexed_data, with the fileuuids that _index_aip_files() obtained
    from the METS
    """

    dummy_get_dashboard_uuid.return_value = "test-uuid"

    indexed_data = {}

    def _bulk(client, actions, stats_only=False, *args, **kwargs):
        for item in actions:
            indexed_data[item["_source"]["filePath"]] = item["_source"]["FILEUUID"]

    dummy_helpers_bulk.side_effect = _bulk

    elasticSearchFunctions._index_aip_files(
        client=None,
        uuid=aipuuid,
        mets=etree.parse(os.path.join(THIS_DIR, "fixtures", metsfile)).getroot(),
        name="{}-{}".format(aipname, aipuuid),
        identifiers=[],
    )

    for file_uuid in fileuuid_dict:
        assert indexed_data[file_uuid["filePath"]] == file_uuid["FILEUUID"]
Example #3
0
def test_index_aipfile_dmdsec(
    dummy_try_to_index,
    dummy_wait_for_cluster_yellow_status,
    dummy_get_dashboard_uuid,
    metsfile,
    dmdsec_dict,
):
    """Check AIP file dmdSec is correctly parsed from METS files.

    Mock _try_to_index() with a function that populates a dict
    indexed_data, with the dmdSec data that _index_aip_files() obtained
    from the METS
    """

    dummy_get_dashboard_uuid.return_value = "test-uuid"

    indexed_data = {}

    def get_dublincore_metadata(client, indexData, index, printfn):
        try:
            dmd_section = indexData["METS"]["dmdSec"]
            metadata_container = dmd_section["ns0:xmlData_dict_list"][0]
            dc = metadata_container["ns1:dublincore_dict_list"][0]
        except (KeyError, IndexError):
            dc = None
        indexed_data[indexData["filePath"]] = dc

    dummy_try_to_index.side_effect = get_dublincore_metadata

    elasticSearchFunctions._index_aip_files(
        client=None,
        uuid="DUMMYUUID",
        mets_path=os.path.join(THIS_DIR, "fixtures", metsfile),
        name="{}-{}".format("DUMMYNAME", "DUMMYUUID"),
        identifiers=[],
    )

    for key, value in dmdsec_dict["dublincore_dict"].iteritems():
        assert indexed_data[dmdsec_dict["filePath"]][key] == value
def test_index_aipfile_dmdsec(
    dummy_helpers_bulk, dummy_get_dashboard_uuid, metsfile, dmdsec_dict
):
    """Check AIP file dmdSec is correctly parsed from METS files.

    Mock _try_to_index() with a function that populates a dict
    indexed_data, with the dmdSec data that _index_aip_files() obtained
    from the METS
    """

    dummy_get_dashboard_uuid.return_value = "test-uuid"

    indexed_data = {}

    def _bulk(client, actions, stats_only=False, *args, **kwargs):
        for item in actions:
            try:
                dmd_section = item["_source"]["METS"]["dmdSec"]
                metadata_container = dmd_section["mets:xmlData_dict_list"][0]
                dc = metadata_container["dcterms:dublincore_dict_list"][0]
            except (KeyError, IndexError):
                dc = None
            indexed_data[item["_source"]["filePath"]] = dc

    dummy_helpers_bulk.side_effect = _bulk

    elasticSearchFunctions._index_aip_files(
        client=None,
        uuid="DUMMYUUID",
        mets=etree.parse(os.path.join(THIS_DIR, "fixtures", metsfile)).getroot(),
        name="{}-{}".format("DUMMYNAME", "DUMMYUUID"),
        identifiers=[],
    )

    for key, value in dmdsec_dict["dublincore_dict"].iteritems():
        assert indexed_data[dmdsec_dict["filePath"]][key] == value
Example #5
0
    def test_index_mets_file_metadata(
        self,
        dummy_try_to_index,
        dummy_wait_for_cluster_yellow_status,
        dummy_get_dashboard_uuid,
    ):
        # Set up mocked functions
        dummy_get_dashboard_uuid.return_value = 'test-uuid'
        indexed_data = {}

        def get_dublincore_metadata(client, indexData, index, printfn):
            try:
                dmd_section = indexData['METS']['dmdSec']
                metadata_container = dmd_section['ns0:xmlData_dict_list'][0]
                dc = metadata_container['ns1:dublincore_dict_list'][0]
            except (KeyError, IndexError):
                dc = None
            indexed_data[indexData['filePath']] = dc

        dummy_try_to_index.side_effect = get_dublincore_metadata

        # This METS file is a cut-down version of the AIP METS produced
        # using the SampleTransfers/DemoTransfer
        mets_file_path = os.path.join(THIS_DIR, 'fixtures',
                                      'test_index_metadata-METS.xml')
        mets_object_id = '771aa252-7930-4e68-b73e-f91416b1d4a4'
        uuid = 'f42a260a-9b53-4555-847e-8a4329c81662'
        sipName = 'DemoTransfer-{}'.format(uuid)
        identifiers = []
        indexed_files_count = elasticSearchFunctions._index_aip_files(
            client=self.client,
            uuid=uuid,
            mets_path=mets_file_path,
            name=sipName,
            identifiers=identifiers,
        )

        # ES should have indexed 12 files
        # - 5 content files
        # - 5 checksum and csv files in the metadata directory
        # - 2 files generated in the transfer process
        assert indexed_files_count == 12
        assert dummy_try_to_index.call_count == 12

        # Metadata should have been indexed only for these content
        # files because they are listed in the metadata.csv file
        content_files_with_metadata = (
            {
                'path': ('objects/View_from_lookout_over_Queenstown_'
                         'towards_the_Remarkables_in_spring.jpg'),
                'title': ('Morning view from lookout over Queenstown '
                          'towards the Remarkables in spring'),
                'creator':
                'Pseudopanax at English Wikipedia',
            },
            {
                'path':
                'objects/beihai.tif',
                'title':
                'Beihai, Guanxi, China, 1988',
                'creator': ('NASA/GSFC/METI/ERSDAC/JAROS and U.S./Japan '
                            'ASTER Science Team'),
            },
            {
                'path': 'objects/bird.mp3',
                'title': '14000 Caen, France - Bird in my garden',
                'creator': 'Nicolas Germain',
            },
            {
                'path': 'objects/ocr-image.png',
                'title': 'OCR image',
                'creator': 'Tesseract',
            },
        )
        for file_metadata in content_files_with_metadata:
            dc = indexed_data[file_metadata['path']]
            assert dc['dc:title'] == file_metadata['title']
            assert dc['dc:creator'] == file_metadata['creator']

        # There is no metadata for this content file because
        # it was not listed in the metadata.csv file
        assert indexed_data['objects/piiTestDataCreditCardNumbers.txt'] is None

        # Checksum and csv files in the metadata directory
        # won't have dublin core metadata indexed
        files_in_metadata_directory = (
            'checksum.md5',
            'checksum.sha1',
            'checksum.sha256',
            'metadata.csv',
            'rights.csv',
        )
        for filename in files_in_metadata_directory:
            path = 'objects/metadata/transfers/DemoTransfer-{}/{}'.format(
                mets_object_id, filename)
            assert indexed_data[path] is None

        # Neither will the generated files during the transfer process
        generated_files = (
            'dc.json',
            'directory_tree.txt',
        )
        for filename in generated_files:
            path = 'objects/metadata/transfers/DemoTransfer-{}/{}'.format(
                mets_object_id, filename)
            assert indexed_data[path] is None
Example #6
0
    def test_index_mets_file_metadata(
        self,
        dummy_try_to_index,
        dummy_wait_for_cluster_yellow_status,
        dummy_get_dashboard_uuid,
    ):
        # Set up mocked functions
        dummy_get_dashboard_uuid.return_value = "test-uuid"
        indexed_data = {}

        def get_dublincore_metadata(client, indexData, index, printfn):
            try:
                dmd_section = indexData["METS"]["dmdSec"]
                metadata_container = dmd_section["ns0:xmlData_dict_list"][0]
                dc = metadata_container["ns1:dublincore_dict_list"][0]
            except (KeyError, IndexError):
                dc = None
            indexed_data[indexData["filePath"]] = dc

        dummy_try_to_index.side_effect = get_dublincore_metadata

        # This METS file is a cut-down version of the AIP METS produced
        # using the SampleTransfers/DemoTransfer
        mets_file_path = os.path.join(
            THIS_DIR, "fixtures", "test_index_metadata-METS.xml"
        )
        mets_object_id = "771aa252-7930-4e68-b73e-f91416b1d4a4"
        uuid = "f42a260a-9b53-4555-847e-8a4329c81662"
        sipName = "DemoTransfer-{}".format(uuid)
        identifiers = []
        indexed_files_count = elasticSearchFunctions._index_aip_files(
            client=self.client,
            uuid=uuid,
            mets_path=mets_file_path,
            name=sipName,
            identifiers=identifiers,
        )

        # ES should have indexed 12 files
        # - 5 content files
        # - 5 checksum and csv files in the metadata directory
        # - 2 files generated in the transfer process
        assert indexed_files_count == 12
        assert dummy_try_to_index.call_count == 12

        # Metadata should have been indexed only for these content
        # files because they are listed in the metadata.csv file
        content_files_with_metadata = (
            {
                "path": (
                    "objects/View_from_lookout_over_Queenstown_"
                    "towards_the_Remarkables_in_spring.jpg"
                ),
                "title": (
                    "Morning view from lookout over Queenstown "
                    "towards the Remarkables in spring"
                ),
                "creator": "Pseudopanax at English Wikipedia",
            },
            {
                "path": "objects/beihai.tif",
                "title": "Beihai, Guanxi, China, 1988",
                "creator": (
                    "NASA/GSFC/METI/ERSDAC/JAROS and U.S./Japan " "ASTER Science Team"
                ),
            },
            {
                "path": "objects/bird.mp3",
                "title": "14000 Caen, France - Bird in my garden",
                "creator": "Nicolas Germain",
            },
            {
                "path": "objects/ocr-image.png",
                "title": "OCR image",
                "creator": "Tesseract",
            },
        )
        for file_metadata in content_files_with_metadata:
            dc = indexed_data[file_metadata["path"]]
            assert dc["dc:title"] == file_metadata["title"]
            assert dc["dc:creator"] == file_metadata["creator"]

        # There is no metadata for this content file because
        # it was not listed in the metadata.csv file
        assert indexed_data["objects/piiTestDataCreditCardNumbers.txt"] is None

        # Checksum and csv files in the metadata directory
        # won't have dublin core metadata indexed
        files_in_metadata_directory = (
            "checksum.md5",
            "checksum.sha1",
            "checksum.sha256",
            "metadata.csv",
            "rights.csv",
        )
        for filename in files_in_metadata_directory:
            path = "objects/metadata/transfers/DemoTransfer-{}/{}".format(
                mets_object_id, filename
            )
            assert indexed_data[path] is None

        # Neither will the generated files during the transfer process
        generated_files = ("dc.json", "directory_tree.txt")
        for filename in generated_files:
            path = "objects/metadata/transfers/DemoTransfer-{}/{}".format(
                mets_object_id, filename
            )
            assert indexed_data[path] is None