Beispiel #1
0
    def write_doc(self, xml_file_path):
        """
        Serialize the document tree (self.ted) to an XML file.

        Before serializing, set_lastchange() is called with the value of
        current_timestamp(). An existing file at the target path is
        overwritten.

        @type xml_file_path: str
        @param xml_file_path: XML file path
        """
        # update timestamp
        self.set_lastchange(current_timestamp())
        xml_content = Etree.tostring(self.ted, encoding='UTF-8')
        # tostring() with encoding='UTF-8' returns an encoded byte string
        # (ElementTree/lxml behaviour), so the file is opened in binary mode.
        # The with-statement closes the file; no explicit close() is needed.
        with open(xml_file_path, 'wb') as output_file:
            output_file.write(xml_content)
Beispiel #2
0
 def make_mdref(self, path, file, id, mdtype):
     """
     Build the attribute dictionary for an mdRef entry of a metadata file.

     The MIME type is guessed through self.mime, the link target is the
     file location relative to self.root_path (prefixed with "file://./"),
     and the checksum is obtained from get_sha256_hash().

     @param path: directory that contains the metadata file
     @param file: name of the metadata file inside path
     @param id: value stored under the "ID" key
     @param mdtype: value stored under the "MDTYPE" key
     @return: dict mapping attribute names to their values
     """
     # The joined location is needed for the MIME guess, the relative link
     # and the checksum, so it is computed once up front.
     full_path = os.path.join(path, file)
     mimetype, _encoding = self.mime.guess_type(full_path)
     href = "file://./%s" % os.path.relpath(full_path, self.root_path)
     return {
         "LOCTYPE": "URL",
         "MIMETYPE": mimetype,
         "CREATED": current_timestamp(),
         q(XLINK_NS, "type"): "simple",
         q(XLINK_NS, "href"): href,
         "CHECKSUMTYPE": "SHA-256",
         "CHECKSUM": get_sha256_hash(full_path),
         "ID": id,
         "MDTYPE": mdtype
     }
Beispiel #3
0
    def addEvent(self, premispath, info):
        '''
        Add an event to an existing Premis file (DefaultTask finalize method).

        The file is parsed, a new event element is inserted in front of the
        last child of the root element, and the file is rewritten in place.

        @param premispath: path of the Premis file; joined onto self.root_path
        @param info: dict providing the keys 'outcome', 'task_name',
                     'event_type' and 'linked_object'
        @return: None
        @raise KeyError: if one of the required keys is missing from info
        '''
        outcome = info['outcome']
        agent = info['task_name']
        event_type = info['event_type']
        linked_object = info['linked_object']

        premis_path = os.path.join(self.root_path, premispath)
        premis_root = etree.parse(premis_path).getroot()

        event_id = 'ID%s' % uuid.uuid4()
        event = P.event(
            P.eventIdentifier(P.eventIdentifierType('local'),
                              P.eventIdentifierValue(event_id)),
            P.eventType(event_type), P.eventDateTime(current_timestamp()),
            P.eventOutcomeInformation(P.eventOutcome(outcome)),
            P.linkingAgentIdentifier(
                P.linkingAgentIdentifierType('software'),
                P.linkingAgentIdentifierValue(
                    'E-ARK Web %s (task: %s)' %
                    (configuration.earkweb_version, agent))),
            P.linkingObjectIdentifier(
                P.linkingObjectIdentifierType('repository'),
                P.linkingObjectIdentifierValue(linked_object)))
        # Index len - 1 places the event directly before the current last
        # child of the root element.
        premis_root.insert(len(premis_root) - 1, event)

        # Named xml_bytes (not "str") so the builtin is not shadowed.
        xml_bytes = etree.tostring(premis_root,
                                   encoding='UTF-8',
                                   pretty_print=True,
                                   xml_declaration=True)
        # tostring() with an explicit encoding returns bytes, hence 'wb'.
        with open(premis_path, 'wb') as output_file:
            output_file.write(xml_bytes)
Beispiel #4
0
    def set_additional_data(self, task_name, additional_data_dict):
        """
        Record a task execution, with additional data, under 'provenance'.

        A 'provenance' element is looked up in self.ted and created if it
        does not exist. A new 'task_execution' child is appended to it,
        holding a 'datetime' element (current timestamp), a 'task' element
        (the task name) and the XML produced by ConvertDictToXml() for
        {'additional_data': additional_data_dict}.

        @param task_name: name of the task; stored as str(task_name)
        @type additional_data_dict: dict
        @param additional_data_dict: data wrapped under the key
                                     'additional_data' and converted to XML
        """
        provenance = self.ted.find('.//provenance')
        if provenance is None:
            provenance = SubElement(self.ted, 'provenance')

        execution = SubElement(provenance, 'task_execution')

        timestamp_node = SubElement(execution, 'datetime')
        timestamp_node.text = str(current_timestamp())

        task_node = SubElement(execution, 'task')
        task_node.text = str(task_name)

        execution.append(
            ConvertDictToXml({'additional_data': additional_data_dict}))
Beispiel #5
0
    def createDeliveryMets(self, input_archive, output_mets):
        """
        Create a delivery METS document for a packaged archive and write it.

        The document consists of a METS root (TYPE "SIP"), a header with four
        agents and the document ID (file name of output_mets), a fileSec with
        one "PACKAGES" file group containing the archive (registered through
        self.addFile), and a physical structMap whose content div points at
        that file.

        @param input_archive: path of the archive file passed to self.addFile
        @param output_mets: path of the METS file to write (overwritten)
        @return: None
        """
        # create delivery METS skeleton
        mets_attributes = {
            "OBJID": "UUID:" + uuid.uuid4().__str__(),
            "TYPE": "SIP",
            "LABEL": "Delivery METS",
            "PROFILE": "http://webb.eark/package/METS/IP_CS.xml",
            "ID": "ID" + uuid.uuid4().__str__()
        }
        root = M.mets(mets_attributes)
        root.attrib['{%s}schemaLocation' %
                    XSI_NS] = "http://www.loc.gov/METS/ schemas/IP.xsd"

        mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp()})
        root.append(mets_hdr)

        mets_hdr.append(
            self.createAgent("ARCHIVIST", "ORGANIZATION", "", "Institution",
                             "Note"))
        mets_hdr.append(
            self.createAgent("CREATOR", "ORGANIZATION", "", "Institution",
                             "Note"))
        mets_hdr.append(
            self.createAgent("CREATOR", "OTHER", "SOFTWARE",
                             "E-ARK SIP Creator", "VERSION=0.0.1"))
        mets_hdr.append(
            self.createAgent("PRESERVATION", "ORGANIZATION", "", "Institution",
                             "Note"))
        _, fname = os.path.split(output_mets)
        mets_hdr.append(M.metsDocumentID(fname))

        mets_fileSec = M.fileSec()
        root.append(mets_fileSec)

        mets_filegroup = M.fileGrp({
            "USE": "PACKAGES",
            "ID": "ID" + uuid.uuid4().__str__()
        })
        mets_fileSec.append(mets_filegroup)

        content_id = self.addFile(input_archive, mets_filegroup)

        mets_structmap = M.structMap({
            "ID": "ID%s" % uuid.uuid4(),
            "TYPE": "physical",
            "LABEL": "Profilestructmap"
        })
        root.append(mets_structmap)
        mets_structmap_div = M.div({"LABEL": "Package"})
        mets_structmap.append(mets_structmap_div)
        mets_structmap_content_div = M.div({"LABEL": "Content"})
        mets_structmap_div.append(mets_structmap_content_div)
        # The fptr references the value returned by addFile (createMets uses
        # addFile's return value as FILEID in the same way). Previously a
        # freshly generated UUID was used here, which matched no file entry.
        fptr = M.fptr({"FILEID": content_id})
        mets_structmap_content_div.append(fptr)

        # Named xml_bytes (not "str") so the builtin is not shadowed.
        xml_bytes = etree.tostring(root,
                                   encoding='UTF-8',
                                   pretty_print=True,
                                   xml_declaration=True)
        # tostring() with an explicit encoding returns bytes, hence 'wb'.
        with open(output_mets, 'wb') as output_file:
            output_file.write(xml_bytes)
Beispiel #6
0
    def createMets(self, mets_data):
        """
        Build the root METS document for the package under self.root_path
        and write it to <self.root_path>/METS.xml.

        Steps performed:
          1. Create the METS skeleton: root element, header with one agent
             and document ID, an amdSec, a fileSec with one file group, a
             physical structMap ('E-ARK structural map') and a logical
             structMap with divs for metadata, schema and content files.
          2. If mets_data['parent'] is not the empty string, add a further
             logical structMap holding an mptr to the parent package.
          3. Walk self.root_path and register what is found:
             - directories ending in 'submission/metadata' or
               'submission/schemas' are skipped entirely;
             - <root>/metadata is handled specially (earkweb.log and the
               entries of the metadata folder become dmdSec / digiprovMD
               references built with self.make_mdref);
             - files directly in the root folder are ignored;
             - a 'mets.xml' file (case-insensitive) gets an mptr plus a file
               entry and stops the walk from descending further;
             - files in directories ending in 'schemas' go to the schema
               div, all remaining files to the content div
               (both registered through self.addFile).

        @param mets_data: dict providing the keys 'packageid', 'type',
                          'schemas' (schema folder path) and 'parent';
                          it is also stored as self.mets_data
        @return: None
        """
        self.mets_data = mets_data
        packageid = mets_data['packageid']
        packagetype = mets_data['type']
        schemafolder = mets_data['schemas']
        parent = mets_data['parent']

        print 'creating Mets'
        ###########################
        # create METS skeleton
        ###########################

        # create Mets root
        METS_ATTRIBUTES = {
            "OBJID": "urn:uuid:" + packageid,
            "LABEL":
            "METS file describing the %s matching the OBJID." % packagetype,
            "PROFILE": "http://www.ra.ee/METS/v01/IP.xml",
            "TYPE": packagetype
        }
        root = M.mets(METS_ATTRIBUTES)

        # Schema locations are written relative to the package root; the
        # literal 'empty' is used when a schema file is not present.
        if os.path.isfile(os.path.join(schemafolder, 'mets_1_11.xsd')):
            mets_schema_location = os.path.relpath(
                os.path.join(schemafolder, 'mets_1_11.xsd'), self.root_path)
        else:
            mets_schema_location = 'empty'
        if os.path.isfile(os.path.join(schemafolder, 'xlink.xsd')):
            xlink_schema_loaction = os.path.relpath(
                os.path.join(schemafolder, 'xlink.xsd'), self.root_path)
        else:
            xlink_schema_loaction = 'empty'

        root.attrib[
            '{%s}schemaLocation' %
            XSI_NS] = "http://www.loc.gov/METS/ %s http://www.w3.org/1999/xlink %s" % (
                mets_schema_location, xlink_schema_loaction)

        # create Mets header
        mets_hdr = M.metsHdr({
            "CREATEDATE": current_timestamp(),
            "RECORDSTATUS": "NEW"
        })
        root.append(mets_hdr)

        # add an agent
        mets_hdr.append(
            self.createAgent("CREATOR", "OTHER", "SOFTWARE", "E-ARK earkweb",
                             "VERSION=0.0.1"))

        # add document ID
        mets_hdr.append(M.metsDocumentID("METS.xml"))

        # create amdSec
        mets_amdSec = M.amdSec({"ID": "ID" + uuid.uuid4().__str__()})
        root.append(mets_amdSec)

        # create fileSec
        mets_fileSec = M.fileSec()
        root.append(mets_fileSec)

        # general filegroup
        mets_filegroup = M.fileGrp({
            "ID": "ID" + uuid.uuid4().__str__(),
            "USE": "general filegroup"
        })
        mets_fileSec.append(mets_filegroup)

        # structMap 'E-ARK structural map' - default, physical structure
        mets_earkstructmap = M.structMap({
            "LABEL": "E-ARK structural map",
            "TYPE": "physical"
        })
        root.append(mets_earkstructmap)
        package_div = M.div({"LABEL": packageid})
        # append physical structMap
        mets_earkstructmap.append(package_div)

        # structMap and div for the whole package (metadata, schema and /data)
        mets_structmap = M.structMap({
            "LABEL": "Simple %s structuring" % packagetype,
            "TYPE": "logical"
        })
        root.append(mets_structmap)
        mets_structmap_div = M.div({"LABEL": "Package structure"})
        mets_structmap.append(mets_structmap_div)

        # metadata structmap - IP root level!
        mets_structmap_metadata_div = M.div({"LABEL": "metadata files"})
        mets_structmap_div.append(mets_structmap_metadata_div)

        # structmap for schema files
        mets_structmap_schema_div = M.div({"LABEL": "schema files"})
        mets_structmap_div.append(mets_structmap_schema_div)

        # content structmap - all representations! (is only filled if no separate METS exists for the rep)
        mets_structmap_content_div = M.div({"LABEL": "content files"})
        mets_structmap_div.append(mets_structmap_content_div)

        # create structmap and div for Mets files from representations
        # mets_structmap_reps = M.structMap({"TYPE": "logical", "LABEL": "representations"})
        # root.append(mets_structmap_reps)
        # mets_div_reps = M.div({"LABEL": "representations", "TYPE": "type"})
        # mets_structmap_reps.append(mets_div_reps)

        # create structmap for parent/child relation, if applicable
        if parent != '':
            print 'creating link to parent %s' % packagetype
            mets_structmap_relation = M.structMap({
                'TYPE': 'logical',
                'LABEL': 'parent'
            })
            root.append(mets_structmap_relation)
            mets_div_rel = M.div(
                {'LABEL': '%s parent identifier' % packagetype})
            mets_structmap_relation.append(mets_div_rel)
            parent_pointer = M.mptr({
                "LOCTYPE":
                "OTHER",
                "OTHERLOCTYPE":
                "UUID",
                q(XLINK_NS, "title"):
                ("Referencing the parent %s of this (urn:uuid:%s) %s." %
                 (packagetype, packageid, packagetype)),
                q(XLINK_NS, "href"):
                "urn:uuid:" + parent,
                "ID":
                "ID" + uuid.uuid4().__str__()
            })
            mets_div_rel.append(parent_pointer)

        ###########################
        # add to Mets skeleton
        ###########################

        # add the package content to the Mets skeleton
        for directory, subdirectories, filenames in os.walk(self.root_path):
            # build the earkstructmap
            path = os.path.relpath(directory, self.root_path)
            # physical_div stays '' only for the root folder itself; the
            # branches below that call physical_div.append() are not reached
            # for the root folder (its files are ignored further down).
            physical_div = ''
            if path != '.':
                physical_div = M.div({"LABEL": path})
                package_div.append(physical_div)
            # if directory.endswith('metadata/earkweb'):
            #     # Ignore temp files only needed for IP processing with earkweb
            #     del filenames[:]
            #     del subdirectories[:]
            # Clearing both lists in place makes os.walk skip the files and
            # the subtree of these directories.
            if directory.endswith('submission/metadata') or directory.endswith(
                    'submission/schemas'):
                del filenames[:]
                del subdirectories[:]
            if directory == os.path.join(self.root_path, 'metadata'):
                # Metadata on IP root level - if there are folders for representation-specific metadata,
                # check if the corresponding representation has a Mets file. If yes, skip; if no, add to IP root Mets.
                for filename in filenames:
                    if filename == 'earkweb.log':
                        mets_digiprovmd = M.digiprovMD(
                            {"ID": "ID" + uuid.uuid4().__str__()})
                        mets_amdSec.append(mets_digiprovmd)
                        # NOTE(review): 'id' shadows the builtin of the same name.
                        id = "ID" + uuid.uuid4().__str__()
                        ref = self.make_mdref(directory, filename, id, 'OTHER')
                        mets_mdref = M.mdRef(ref)
                        mets_digiprovmd.append(mets_mdref)
                        mets_structmap_metadata_div.append(
                            M.fptr({"FILEID": id}))
                        physical_div.append(M.fptr({"FILEID": id}))
                del subdirectories[:]  # prevent loop to iterate subfolders outside of this if statement
                dirlist = os.listdir(os.path.join(self.root_path, 'metadata'))
                for dirname in dirlist:
                    if fnmatch.fnmatch(dirname, '*_mig-*'):
                        # TODO: maybe list it all the time?
                        # this folder contains metadata for a representation/migration, currently:
                        # only listed if no representation Mets file exists
                        # NOTE(review): the '%' formatting is applied to the
                        # result of os.path.join, so a '%' character inside
                        # self.root_path would break it - confirm root paths
                        # never contain '%'.
                        if os.path.isfile(
                                os.path.join(self.root_path,
                                             'representations/%s/METS.xml') %
                                dirname):
                            pass
                        else:
                            # NOTE(review): 'dir' shadows the builtin of the same name.
                            for dir, subdir, files in os.walk(
                                    os.path.join(self.root_path, 'metadata/%s')
                                    % dirname):
                                for filename in files:
                                    if dir.endswith('descriptive'):
                                        mets_dmd = M.dmdSec({
                                            "ID":
                                            "ID" + uuid.uuid4().__str__()
                                        })
                                        # dmdSec elements are inserted at index 1 of the root
                                        root.insert(1, mets_dmd)
                                        id = "ID" + uuid.uuid4().__str__()
                                        ref = self.make_mdref(
                                            dir, filename, id, 'OTHER')
                                        mets_mdref = M.mdRef(ref)
                                        mets_dmd.append(mets_mdref)
                                        mets_structmap_metadata_div.append(
                                            M.fptr({"FILEID": id}))
                                        physical_div.append(
                                            M.fptr({"FILEID": id}))
                                    elif dir.endswith('preservation'):
                                        mets_digiprovmd = M.digiprovMD({
                                            "ID":
                                            "ID" + uuid.uuid4().__str__()
                                        })
                                        mets_amdSec.append(mets_digiprovmd)
                                        id = "ID" + uuid.uuid4().__str__()
                                        mdtype = ''
                                        if filename.startswith(
                                                'premis') or filename.endswith(
                                                    'premis.xml'):
                                            mdtype = 'PREMIS'
                                        else:
                                            mdtype = 'OTHER'
                                        ref = self.make_mdref(
                                            dir, filename, id, mdtype)
                                        mets_mdref = M.mdRef(ref)
                                        mets_digiprovmd.append(mets_mdref)
                                        mets_structmap_metadata_div.append(
                                            M.fptr({"FILEID": id}))
                                        physical_div.append(
                                            M.fptr({"FILEID": id}))
                                    elif filename:
                                        print 'Unclassified metadata file %s in %s.' % (
                                            filename, dir)
                    else:
                        # metadata that should be listed in the Mets
                        # Classification here uses the top-level entry name
                        # (dirname), so every file below e.g. 'descriptive'
                        # is treated alike regardless of nesting depth.
                        for dir, subdir, files in os.walk(
                                os.path.join(self.root_path, 'metadata/%s') %
                                dirname):
                            if len(files) > 0:
                                for filename in files:
                                    #if dir.endswith('descriptive'):
                                    if dirname == 'descriptive':
                                        mets_dmd = M.dmdSec({
                                            "ID":
                                            "ID" + uuid.uuid4().__str__()
                                        })
                                        root.insert(1, mets_dmd)
                                        id = "ID" + uuid.uuid4().__str__()
                                        # TODO: change MDTYPE
                                        ref = self.make_mdref(
                                            dir, filename, id, 'OTHER')
                                        mets_mdref = M.mdRef(ref)
                                        mets_dmd.append(mets_mdref)
                                        mets_structmap_metadata_div.append(
                                            M.fptr({"FILEID": id}))
                                        physical_div.append(
                                            M.fptr({"FILEID": id}))
                                    #elif dir.endswith('preservation'):
                                    elif dirname == 'preservation' or dirname == 'earkweb':
                                        mets_digiprovmd = M.digiprovMD({
                                            "ID":
                                            "ID" + uuid.uuid4().__str__()
                                        })
                                        mets_amdSec.append(mets_digiprovmd)
                                        id = "ID" + uuid.uuid4().__str__()
                                        mdtype = ''
                                        if filename.startswith(
                                                'premis') or filename.endswith(
                                                    'premis.xml'):
                                            mdtype = 'PREMIS'
                                        elif filename:
                                            mdtype = 'OTHER'
                                        ref = self.make_mdref(
                                            dir, filename, id, mdtype)
                                        mets_mdref = M.mdRef(ref)
                                        mets_digiprovmd.append(mets_mdref)
                                        mets_structmap_metadata_div.append(
                                            M.fptr({"FILEID": id}))
                                        physical_div.append(
                                            M.fptr({"FILEID": id}))
                                    elif filename:
                                        print 'Unclassified metadata file %s in %s.' % (
                                            filename, dir)
            else:
                # Any other folder outside of /<root>/metadata
                for filename in filenames:
                    if directory == self.root_path:
                        # ignore files on IP root level
                        del filename
                    else:
                        # TODO: list rep metadata only in the rep Mets?
                        rel_path_file = "file://./%s" % os.path.relpath(
                            os.path.join(directory, filename), self.root_path)
                        if filename.lower() == 'mets.xml':
                            # delete the subdirectories list to stop os.walk from traversing further;
                            # mets file should be added as <mets:mptr> to <structMap> for corresponding rep
                            del subdirectories[:]
                            # NOTE(review): splitting on '/' assumes POSIX path
                            # separators - confirm if other platforms matter.
                            rep_name = directory.rsplit('/', 1)[1]
                            # create structMap div and append to representations structMap
                            # mets_structmap_rep_div = M.div({"LABEL": rep_name, "TYPE": "representation mets", "ID": "ID" + uuid.uuid4().__str__()})
                            # mets_div_reps.append(mets_structmap_rep_div)
                            # add mets file as <mets:mptr>
                            metspointer = M.mptr({
                                "LOCTYPE":
                                "URL",
                                q(XLINK_NS, "title"):
                                ("Mets file describing representation: %s of %s: urn:uuid:%s."
                                 % (rep_name, packagetype, packageid)),
                                q(XLINK_NS, "href"):
                                rel_path_file,
                                "ID":
                                "ID" + uuid.uuid4().__str__()
                            })
                            #mets_structmap_rep_div.append(metspointer)
                            #mets_structmap_rep_div.append(M.fptr({"FILEID": id}))
                            physical_div.append(
                                metspointer
                            )  # IMPORTANT: The <mptr> element needs to be the first entry in a <div>, or the Mets will be invalid!
                            # also create a <fptr> for the Mets file
                            id = self.addFile(
                                os.path.join(directory, filename),
                                mets_filegroup)
                            physical_div.append(M.fptr({"FILEID": id}))
                        elif filename and directory.endswith('schemas'):
                            # schema files
                            id = self.addFile(
                                os.path.join(directory, filename),
                                mets_filegroup)
                            mets_structmap_schema_div.append(
                                M.fptr({'FILEID': id}))
                            physical_div.append(M.fptr({'FILEID': id}))
                        elif filename:
                            # everything else is listed as package content
                            id = self.addFile(
                                os.path.join(directory, filename),
                                mets_filegroup)
                            mets_structmap_content_div.append(
                                M.fptr({'FILEID': id}))
                            physical_div.append(M.fptr({'FILEID': id}))

        # NOTE(review): the local name 'str' shadows the builtin for the rest
        # of this method.
        str = etree.tostring(root,
                             encoding='UTF-8',
                             pretty_print=True,
                             xml_declaration=True)

        path_mets = os.path.join(self.root_path, 'METS.xml')
        # NOTE(review): the serialized document is written through a
        # text-mode handle; presumably fine on Python 2 - confirm before
        # porting to Python 3, where tostring() with an encoding yields bytes.
        with open(path_mets, 'w') as output_file:
            output_file.write(str)