Example #1
0
    def _getDescriptiveMetadata(self, lxmlNode):
        """Build the normalized descriptiveMetadata ``didl:Item`` as an XML string.

        The descriptiveMetadata items are looked up with three XPath variants,
        tried in order until one of them matches:

        1. ``rdf:type/@rdf:resource`` (nothing is logged),
        2. ``rdf:type/@resource``, i.e. without the rdf namespace
           (LOGGER3 is logged when this variant matches),
        3. ``dip:ObjectType`` text (LOGGER4 is logged when this variant matches).

        Whichever variant matched, the item that is written out always uses
        the ``rdf:type/@rdf:resource`` form.  The first matching item that
        contains a ``mods:mods`` element supplies the identifier descriptor,
        the date-modified descriptor and the MODS payload.

        Raises:
            ValidateException: with EXCEPTION7 when no descriptiveMetadata
                item is found at all, with EXCEPTION6 when none of the items
                found contains MODS.
        """
        ns = self._nsMap

        candidates = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/descriptiveMetadata"]',
            namespaces=ns)
        if not candidates:
            # Fallback: @resource without the rdf namespace.
            candidates = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/descriptiveMetadata"]',
                namespaces=ns)
            if candidates:
                self.do.logMsg(self._identifier, LOGGER3, prefix=STR_DIDL)
        if not candidates:
            # Fallback: dip namespace.
            candidates = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/descriptiveMetadata"]',
                namespaces=ns)
            if candidates:
                self.do.logMsg(self._identifier, LOGGER4, prefix=STR_DIDL)
        if not candidates:
            raise ValidateException(
                formatExceptionLine(EXCEPTION7, prefix=STR_DIDL))

        # Pick the first descriptiveMetadata item that actually holds MODS.
        for metadata_item in candidates:
            mods_nodes = metadata_item.xpath('self::didl:Item//mods:mods',
                                             namespaces=ns)
            if mods_nodes:
                mods_node = mods_nodes[0]
                break
        else:
            raise ValidateException(
                formatExceptionLine(EXCEPTION6, prefix=STR_DIDL))

        item_template = """<didl:Item>
                                    <didl:Descriptor>
                                        <didl:Statement mimeType="application/xml">
                                            <rdf:type rdf:resource="info:eu-repo/semantics/descriptiveMetadata"/>
                                        </didl:Statement>
                                    </didl:Descriptor>
                                    %s%s<didl:Component>
                                        <didl:Resource mimeType="application/xml">
                                           %s 
                                        </didl:Resource>
                                    </didl:Component>
                                </didl:Item>""" % (
            self._getIdentifierDescriptor(metadata_item),
            self._getDateModifiedDescriptor(metadata_item),
            tostring(mods_node))
        return item_template
Example #2
0
    def _normalizeRecord(self, lxmlNode):
        """Normalize the first MODS element of the record in place.

        The first ``mods:mods`` element in ``lxmlNode`` is handed to every
        normalisation function, the returned strings are concatenated, parsed
        and put back in the tree at the position of the original element.

        Returns:
            The same ``lxmlNode``, now containing the normalized MODS.

        Raises:
            ValidateException: with EXCEPTION1 when the record holds no MODS.
        """
        mods_found = lxmlNode.xpath('(//mods:mods)[1]',
                                    namespaces=self._nsMap)

        # The normalisation functions to apply, in order.
        normalizers = (self._convertFullMods2GHMods, )

        if not mods_found:
            # Not expected at runtime: the record should have been validated
            # up front.
            raise ValidateException(
                formatExceptionLine(EXCEPTION1, prefix=STR_MODS))

        original_mods = mods_found[0]
        normalized_xml = ''.join(
            normalize(original_mods) for normalize in normalizers)

        # Swap the original MODS for its normalized counterpart.
        replacement = etree.fromstring(normalized_xml)
        original_mods.getparent().replace(original_mods, replacement)

        return lxmlNode
Example #3
0
    def _tlOrigininfo(self, childNode):
        hasDateIssued = False
        ## Select all children from originInfo having 'encoding' attribute:
        children = childNode.xpath(
            "self::mods:originInfo/child::*[@encoding='w3cdtf' or @encoding='iso8601']",
            namespaces=self._nsMap)
        if len(children) > 0:
            for child in children:
                if self._validateISO8601(child.text):
                    child.text = self._granulateDate(child.text)
                    child.set('encoding', 'w3cdtf')
                    if child.tag == ('{%s}dateIssued') % self._nsMap['mods']:
                        hasDateIssued = True
                else:
                    child.getparent().remove(child)
        if not hasDateIssued:
            raise ValidateException(
                formatExceptionLine(EXCEPTION7, prefix=STR_MODS))

        for child in childNode.xpath("self::mods:originInfo/mods:publisher",
                                     namespaces=self._nsMap):
            if not child.text:
                child.getparent().remove(child)

        return childNode if len(childNode) > 0 else None
Example #4
0
 def _isValidTitleInfoTag(self, lxmlNode):
     for title in lxmlNode.iterfind(('{%s}title') % self._nsMap['mods']):
         if not title.text:
             raise ValidateException(
                 formatExceptionLine(EXCEPTION3, prefix=STR_MODS))
     for subtitle in lxmlNode.iterfind(
         ('{%s}subTitle') % self._nsMap['mods']):
         if not subtitle.text:
             subtitle.getparent().remove(subtitle)
     return True
Example #5
0
 def _normalizeTitleinfo(self, modsNode):
     ## Select all titleInfo's
     hasTitleInfo = False
     for child in modsNode.iterfind(
         ('{%s}titleInfo') % self._nsMap['mods']):
         hasTitleInfo = True
         if not self._isValidTitleInfoTag(child):
             modsNode.remove(child)
     if not hasTitleInfo:
         raise ValidateException(
             formatExceptionLine(EXCEPTION2, prefix=STR_MODS))
Example #6
0
    def _getHumanStartPage(self, lxmlNode):
        """Build the normalized humanStartPage ``didl:Item`` as an XML string.

        The humanStartPage item is looked up with three XPath variants, tried
        in order until one of them matches:

        1. ``rdf:type/@rdf:resource`` (nothing is logged),
        2. ``rdf:type/@resource`` (LOGGER9 is logged on a match),
        3. ``dip:ObjectType`` text (LOGGER10 is logged on a match).

        Only the first matching item is used.

        Returns:
            The humanStartPage item, or an empty string (after logging
            LOGGER11) when the record has no humanStartPage.

        Raises:
            ValidateException: with EXCEPTION11 when the resource has no
                ``@ref`` or its first ``@ref`` is not a URL.
        """
        ns = self._nsMap

        hsp_items = lxmlNode.xpath(
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]',
            namespaces=ns)
        if not hsp_items:
            hsp_items = lxmlNode.xpath(
                '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]',
                namespaces=ns)
            if hsp_items:
                self.do.logMsg(self._identifier, LOGGER9, prefix=STR_DIDL)
        if not hsp_items:
            hsp_items = lxmlNode.xpath(
                '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]',
                namespaces=ns)
            if hsp_items:
                self.do.logMsg(self._identifier, LOGGER10, prefix=STR_DIDL)
        if not hsp_items:
            self.do.logMsg(self._identifier, LOGGER11, prefix=STR_DIDL)
            return ""

        hsp_item = hsp_items[0]
        uriref = hsp_item.xpath(
            'self::didl:Item/didl:Component/didl:Resource/@ref',
            namespaces=ns)
        mimetype = hsp_item.xpath(
            'self::didl:Item/didl:Component/didl:Resource/@mimeType',
            namespaces=ns)

        if mimetype:
            resource_mimetype = mimetype[0]
            if not comm.isMimeType(resource_mimetype):
                # An unrecognised mimeType is only reported, not rejected.
                self.do.logMsg(self._identifier,
                               LOGGER12 + resource_mimetype,
                               prefix=STR_DIDL)
        else:
            # A missing mimeType is reported but must not abort: previously
            # the code went on to read mimetype[0] and died with IndexError.
            # NOTE(review): 'text/html' is chosen as the fallback because a
            # humanStartPage is a web page -- confirm it matches what
            # LOGGER13 announces.
            self.do.logMsg(self._identifier, LOGGER13, prefix=STR_DIDL)
            resource_mimetype = 'text/html'

        if not uriref or not comm.isURL(uriref[0]):
            raise ValidateException(
                formatExceptionLine(EXCEPTION11, prefix=STR_DIDL))

        return """<didl:Item>
                    <didl:Descriptor>
                        <didl:Statement mimeType="application/xml">
                            <rdf:type rdf:resource="info:eu-repo/semantics/humanStartPage"/>
                        </didl:Statement>
                    </didl:Descriptor>
                    <didl:Component>
                        <didl:Resource ref="%s" mimeType="%s"/>
                    </didl:Component>
                </didl:Item>""" % (escapeXml(comm.urlQuote(
            uriref[0].strip())), escapeXml(resource_mimetype))
Example #7
0
 def _validateNames(self, modsNode):
     """Clean up and validate the direct ``name`` children of the MODS element.

     For every ``name``: the text of all descendant ``roleTerm`` elements is
     stripped, ``namePart`` children without text are removed, and the
     ``name`` itself is removed when it has no marcrelator code roleTerm
     text or no ``namePart`` left.

     Raises:
         ValidateException: with EXCEPTION4 plus the role when the first
             marcrelator role of a kept ``name`` is rejected by
             ``__isValidRoleTerm``; with EXCEPTION5 when no
             ``mods:mods/mods:name`` is left afterwards.
     """
     mods_ns = self._nsMap['mods']
     name_tag = '{%s}name' % mods_ns
     role_term_path = './/{%s}roleTerm' % mods_ns
     name_part_tag = '{%s}namePart' % mods_ns

     for name in modsNode.iterfind(name_tag):
         for role_term in name.iterfind(role_term_path):
             if role_term.text:
                 role_term.text = role_term.text.strip()

         marc_roles = name.xpath(
             "self::mods:name/mods:role/mods:roleTerm[@type='code' and @authority='marcrelator']/text()",
             namespaces=self._nsMap)

         # Remove empty nameParts.
         for name_part in name.iterfind(name_part_tag):
             if not name_part.text:
                 name.remove(name_part)

         # No roleTerm found (or an empty string for type code and authority
         # marcrelator), or no nameParts left: remove this name element.
         if not marc_roles or name.find(name_part_tag) is None:
             modsNode.remove(name)
         elif not self.__isValidRoleTerm(marc_roles[0]):
             raise ValidateException(
                 formatExceptionLine(EXCEPTION4 + marc_roles[0],
                                     prefix=STR_MODS))

     remaining_names = modsNode.xpath("//mods:mods/mods:name",
                                      namespaces=self._nsMap)
     if not remaining_names:
         raise ValidateException(
             formatExceptionLine(EXCEPTION5, prefix=STR_MODS))
Example #8
0
    def _validateGenre(self, modsNode):
        """Keep exactly one recognised ``genre`` child and normalize its text.

        The direct ``genre`` children are visited in document order.  The
        first one whose stripped, lower-cased text contains a key of
        ``GENRES_SEMANTIEK`` gets the mapped value as its new text; every
        other ``genre`` (unrecognised, empty, or following the first valid
        one) is removed from ``modsNode``.

        Raises:
            ValidateException: with EXCEPTION6 when no ``genre`` was
                recognised.
        """
        genre_tag = '{' + self._nsMap.get('mods') + '}genre'

        has_valid = False
        # Snapshot the matches first: genres are removed from modsNode while
        # looping.
        for genre in list(modsNode.iterfind(genre_tag)):
            fq_genre = None
            if not has_valid:
                # genre.text is None for an empty element; treat it as
                # "no match" instead of failing on None.strip().
                text = (genre.text or '').strip().lower()
                for key, value in GENRES_SEMANTIEK.items():
                    if key in text:
                        fq_genre = value
                        break

            if fq_genre is not None:
                has_valid = True
                genre.text = fq_genre
            else:
                modsNode.remove(genre)

        if not has_valid:
            raise ValidateException(
                formatExceptionLine(EXCEPTION6, prefix=STR_MODS))
Example #9
0
    def _getObjectfiles(self, lxmlNode):
        of_container = ''
        objectfiles = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]',
            namespaces=self._nsMap)
        if len(objectfiles) == 0:
            objectfiles = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]',
                namespaces=self._nsMap)
            if len(objectfiles) > 0:
                self.do.logMsg(self._identifier, LOGGER6, prefix=STR_DIDL)
        if len(objectfiles) == 0:
            objectfiles = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]',
                namespaces=self._nsMap)
            if len(objectfiles) > 0:
                self.do.logMsg(self._identifier, LOGGER7, prefix=STR_DIDL)
        for objectfile in objectfiles:
            #1:Define correct ObjectFile descriptor:
            of_container += '<didl:Item><didl:Descriptor><didl:Statement mimeType="application/xml"><rdf:type rdf:resource="info:eu-repo/semantics/objectFile"/></didl:Statement></didl:Descriptor>'

            #2: Check geldige Identifier (feitelijk verplicht, hoewel vaak niet geimplemeteerd...)
            pi = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
                namespaces=self._nsMap)
            if len(pi) > 0:
                of_container += descr_templ % ('<dii:Identifier>' + escapeXml(
                    pi[0].strip()) + '</dii:Identifier>')

        #3: Check op geldige AccessRights:
            arights = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:accessRights/text()',
                namespaces=self._nsMap)
            if len(arights) > 0:
                for key, value in accessRights.iteritems():
                    if arights[0].strip().lower().find(key) >= 0:
                        of_container += descr_templ % (
                            '<dcterms:accessRights>' + value +
                            '</dcterms:accessRights>')
                        break
                else:
                    raise ValidateException(
                        formatExceptionLine(arights[0] + EXCEPTION12,
                                            prefix=STR_DIDL))
            else:
                raise ValidateException(
                    formatExceptionLine(EXCEPTION8, prefix=STR_DIDL))

        #4: Check geldige datemodified (feitelijk verplicht, hoewel vaak niet geimplemeteerd...)
            modified = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
                namespaces=self._nsMap)
            if len(modified) > 0 and comm.isISO8601(modified[0]):
                of_container += descr_templ % ('<dcterms:modified>' +
                                               modified[0].strip() +
                                               '</dcterms:modified>')

        #5: Check for 'file' description:
            descr = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dc:description/text()',
                namespaces=self._nsMap)
            if len(descr) > 0:
                of_container += descr_templ % ('<dc:description>' + escapeXml(
                    descr[0].strip()) + '</dc:description>')

        ## SKIPPING: Not in EduStandaard.
        #6.0: Check for embargo:
        #    embargo = objectfile.xpath('self::didl:Item/didl:Descriptor/didl:Statement/dcterms:available/text()', namespaces=self._nsMap)
        #    if len(embargo) > 0 and comm.isISO8601(embargo[0]):
        #        of_container += descr_templ % ('<dcterms:available>'+embargo[0].strip()+'</dcterms:available>')

        ## SKIPPING: Not in EduStandaard.
        #6.1: Check for dateSubmitted:
        #    dembargo = objectfile.xpath('self::didl:Item/didl:Descriptor/didl:Statement/dcterms:dateSubmitted/text()', namespaces=self._nsMap)
        #    if len(dembargo) > 0 and comm.isISO8601(dembargo[0]):
        #        of_container += descr_templ % ('<dcterms:dateSubmitted>'+dembargo[0].strip()+'</dcterms:dateSubmitted>')
        #    else:
        #        #6.2: Check for issued (depricated, normalize to dateSubmitted):
        #        issued = objectfile.xpath('self::didl:Item/didl:Descriptor/didl:Statement/dcterms:issued/text()', namespaces=self._nsMap)
        #        if len(issued) > 0 and comm.isISO8601(issued[0]):
        #            of_container += descr_templ % ('<dcterms:dateSubmitted>'+issued[0].strip()+'</dcterms:dateSubmitted>')

        #7: Check for published version(author/publisher):
            pubVersion = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/rdf:type/@rdf:resource',
                namespaces=self._nsMap)
            if len(
                    pubVersion
            ) > 0:  ## Both (author/publisher) may be available: we'll take the first one...
                for key, value in pubVersions.iteritems():
                    if pubVersion[0].strip().lower().find(key) >= 0:
                        of_container += descr_templ % (
                            '<rdf:type rdf:resource="' + value + '"/>')
                        break

        #8:Check for MANDATORY resources and mimetypes:
            didl_resources = objectfile.xpath(
                'self::didl:Item/didl:Component/didl:Resource[@mimeType and @ref]',
                namespaces=self._nsMap)
            resources = ''
            _url_list = []
            for resource in didl_resources:
                mimeType = resource.xpath('self::didl:Resource/@mimeType',
                                          namespaces=self._nsMap)
                uri = resource.xpath('self::didl:Resource/@ref',
                                     namespaces=self._nsMap)
                ## We need both mimeType and URI: (MIMETYPE is required by DIDL schema, @ref not).
                if len(mimeType) > 0 and len(uri) > 0:
                    if not comm.isMimeType(mimeType[0]):
                        self.do.logMsg(self._identifier,
                                       LOGGER8 + mimeType[0],
                                       prefix=STR_DIDL)
                    if comm.isURL(uri[0].strip()):
                        resources += """<didl:Resource mimeType="%s" ref="%s"/>""" % (
                            escapeXml(mimeType[0].strip()),
                            escapeXml(comm.urlQuote(uri[0].strip())))
                        _url_list.append(
                            """<didl:Resource mimeType="%s" ref="%s"/>""" %
                            (escapeXml(mimeType[0].strip()),
                             escapeXml(comm.urlQuote(uri[0].strip()))))
                    else:
                        raise ValidateException(
                            formatExceptionLine(EXCEPTION9 + uri[0],
                                                prefix=STR_DIDL))

            if resources != '':
                of_container += """<didl:Component>
                %s
            </didl:Component>""" % (resources)
            else:
                raise ValidateException(
                    formatExceptionLine(EXCEPTION10, prefix=STR_DIDL))
            of_container += '</didl:Item>'
        return of_container
Example #10
0
    def _getTopItem(self, lxmlNode):
        """Build the opening part of the normalized top-level ``didl:Item``.

        The returned string holds the identifier descriptor, the modified
        descriptor and the component of the top-level item.  It does not
        contain the closing ``</didl:Item>`` tag (presumably the caller
        appends the nested items and closes it -- verify against the caller).

        Steps:
            1. The persistent identifier is mandatory and must satisfy
               ``comm.isURNNBN``.
            2. The top-level ``dcterms:modified`` is mandatory and must be
               ISO 8601.  The value written out is the most recent valid
               modified date of all items; LOGGER1 is logged when that is
               not the top-level one.
            3. The mimeType of the top-level resource defaults to
               ``application/xml``; an unrecognised value only logs LOGGER2.
            4. The resource location is the first hit of a list of
               candidate XPaths handed to ``_findAndBindFirst``.

        Raises:
            ValidateException: EXCEPTION1 / EXCEPTION0 for a missing /
                invalid identifier, EXCEPTION3 / EXCEPTION2 for a missing /
                invalid top-level modified date, EXCEPTION4 / EXCEPTION5 for
                a missing / invalid resource location.
        """
        ns = self._nsMap
        modified, mimetype = '', "application/xml"

        # 1: Get the persistent identifier.
        pidlist = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
            namespaces=ns)
        if not pidlist:
            raise ValidateException(
                formatExceptionLine(EXCEPTION1, prefix=STR_DIDL))
        pid = pidlist[0].strip()
        if not comm.isURNNBN(pid):
            raise ValidateException(
                formatExceptionLine(EXCEPTION0 + pid, prefix=STR_DIDL))

        # 2: Get the top-level modification date; it must be present and
        #    valid, otherwise an exception is raised.
        tl_modified = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
            namespaces=ns)
        if not tl_modified:
            raise ValidateException(
                formatExceptionLine(EXCEPTION3, prefix=STR_DIDL))
        if not comm.isISO8601(tl_modified[0]):
            raise ValidateException(
                formatExceptionLine(EXCEPTION2 + tl_modified[0],
                                    prefix=STR_DIDL))

        # Collect the modified dates of all items, keyed by a sortable
        # "date time" string, to find the most recent one.
        all_modified = lxmlNode.xpath(
            '//didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
            namespaces=ns)
        datedict = {}
        for date in all_modified:
            stripped = date.strip()
            if comm.isISO8601(stripped):
                pd = parseDate(stripped)
                datedict["%s %s" %
                         (str(pd.date()), str(pd.time()))] = stripped
        if datedict:
            modified = datedict[max(datedict)]
        if not tl_modified[0].strip() == modified:
            self.do.logMsg(self._identifier, LOGGER1, prefix=STR_DIDL)

        # 3: Get the mimeType of the top-level resource.
        mimetypelist = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@mimeType',
            namespaces=ns)
        if mimetypelist:
            mimetype = mimetypelist[0].strip()
            if not comm.isMimeType(mimetype):
                self.do.logMsg(self._identifier,
                               LOGGER2 + mimetype,
                               prefix=STR_DIDL)

        # 4: Get the location of the top-level resource.  Every candidate
        #    must be its own argument: a missing comma after the text()
        #    path used to glue it to the next literal, silently dropping
        #    both the text() and the DIDL 3.0 humanStartPage candidates.
        pidlocation = self._findAndBindFirst(
            lxmlNode,
            '%s',
            '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@ref',
            '//didl:DIDL/didl:Item/didl:Component/didl:Resource/text()',
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0, without @rdf:resource
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 2.3.1
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0, without @rdf:resource
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref'  #fallback DIDL 2.3.1
        ).strip()

        if pidlocation == '':
            raise ValidateException(
                formatExceptionLine(EXCEPTION4, prefix=STR_DIDL))
        if not comm.isURL(pidlocation):
            raise ValidateException(
                formatExceptionLine(EXCEPTION5 + pidlocation, prefix=STR_DIDL))

        # The quoted URL is XML-escaped as well, like every other @ref that
        # is written out, so that e.g. '&' cannot break the attribute.
        return """<didl:Item>
        <didl:Descriptor><didl:Statement mimeType="application/xml"><dii:Identifier>%s</dii:Identifier></didl:Statement></didl:Descriptor>
        <didl:Descriptor><didl:Statement mimeType="application/xml"><dcterms:modified>%s</dcterms:modified></didl:Statement></didl:Descriptor>
        <didl:Component><didl:Resource mimeType="%s" ref="%s"/></didl:Component>""" % (
            escapeXml(pid), modified, escapeXml(mimetype),
            escapeXml(comm.urlQuote(pidlocation)))
Example #11
0
 def _checkOriginInfoDateIssued(self, modsNode):
     if len(
             modsNode.xpath("//mods:mods/mods:originInfo/mods:dateIssued",
                            namespaces=self._nsMap)) <= 0:
         raise ValidateException(
             formatExceptionLine(EXCEPTION7, prefix=STR_MODS))