def _getDescriptiveMetadata(self, lxmlNode):
    """Return the normalised descriptiveMetadata ``didl:Item`` as a string.

    The descriptiveMetadata items are located with three XPath variants,
    tried in order: ``rdf:type/@rdf:resource``, ``rdf:type/@resource``
    (LOGGER3 is logged when this one matches) and ``dip:ObjectType``
    (LOGGER4 is logged when this one matches). The first matching item
    that contains a ``mods:mods`` element is used.

    Raises:
        ValidateException: with EXCEPTION7 when no descriptiveMetadata
            item is found at all, with EXCEPTION6 when none of the found
            items contains a ``mods:mods`` element.
    """
    # The emitted item always uses the rdf namespace, whichever variant
    # matched, and no message is logged for the primary variant.
    candidates = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/descriptiveMetadata"]',
        namespaces=self._nsMap)

    if len(candidates) == 0:
        fallbacks = (
            # @resource without the rdf namespace.
            ('//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/descriptiveMetadata"]',
             LOGGER3),
            # dip namespace.
            ('//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/descriptiveMetadata"]',
             LOGGER4),
        )
        for expression, notice in fallbacks:
            candidates = lxmlNode.xpath(expression, namespaces=self._nsMap)
            if len(candidates) > 0:
                self.do.logMsg(self._uploadid, notice, prefix=STR_DIDL)
                break

    if len(candidates) == 0:
        raise ValidateException(
            formatExceptionLine(EXCEPTION7, prefix=STR_DIDL))

    # Look for the first descriptiveMetadata item that carries MODS.
    for candidate in candidates:
        mods_nodes = candidate.xpath('self::didl:Item//mods:mods',
                                     namespaces=self._nsMap)
        if len(mods_nodes) > 0:
            break
    else:
        raise ValidateException(
            formatExceptionLine(EXCEPTION6, prefix=STR_DIDL))

    item_template = (
        '<didl:Item> '
        '<didl:Descriptor> '
        '<didl:Statement mimeType="application/xml"> '
        '<rdf:type rdf:resource="info:eu-repo/semantics/descriptiveMetadata"/> '
        '</didl:Statement> '
        '</didl:Descriptor> '
        '%s%s<didl:Component> '
        '<didl:Resource mimeType="application/xml"> '
        '%s '
        '</didl:Resource> '
        '</didl:Component> '
        '</didl:Item>'
    )
    return item_template % (
        self._getIdentifierDescriptor(candidate),
        self._getDateModifiedDescriptor(candidate),
        tostring(mods_nodes[0]))
def _validateNames(self, modsNode):
    """Clean up and validate the ``mods:name`` children of ``modsNode``.

    For every direct ``mods:name`` child: roleTerm texts are stripped,
    empty ``mods:namePart`` children are removed, and the name itself is
    removed when it has no marcrelator code roleTerm text or no namePart
    left.

    Raises:
        ValidateException: with EXCEPTION4 plus the role when the first
            marcrelator role code is rejected by ``__isValidRoleTerm``;
            with EXCEPTION5 when the ``//mods:mods/mods:name`` query
            yields nothing afterwards.
    """
    mods_ns = self._nsMap['mods']
    name_tag = '{%s}name' % mods_ns
    role_term_path = './/{%s}roleTerm' % mods_ns
    name_part_tag = '{%s}namePart' % mods_ns

    for name in modsNode.iterfind(name_tag):
        # Normalise whitespace in all roleTerms before reading the code.
        for role_term in name.iterfind(role_term_path):
            if role_term.text:
                role_term.text = role_term.text.strip()

        role = name.xpath(
            "self::mods:name/mods:role/mods:roleTerm[@type='code' and @authority='marcrelator']/text()",
            namespaces=self._nsMap)

        # Remove empty nameParts.
        for name_part in name.iterfind(name_part_tag):
            if not (name_part.text and name_part.text.strip()):
                name.remove(name_part)

        if not role or name.find(name_part_tag) is None:
            # No roleTerm found, or an empty string for type 'code' with
            # authority 'marcrelator', or no nameParts left: drop this
            # name element.
            modsNode.remove(name)
        elif not self.__isValidRoleTerm(role[0]):
            raise ValidateException(formatExceptionLine(
                EXCEPTION4 + role[0], prefix=STR_MODS))

    remaining = modsNode.xpath("//mods:mods/mods:name", namespaces=self._nsMap)
    if len(remaining) <= 0:
        raise ValidateException(formatExceptionLine(EXCEPTION5, prefix=STR_MODS))
def _normaliseRecord(self, lxmlNode):
    """Replace the first ``mods:mods`` element by its normalised form.

    Steps:
      1. Select the first ``mods:mods`` element in the document.
      2. Run it through the normalisation functions, which return strings.
      3. Parse the result and put it in place of the original element.
      4. Return ``lxmlNode``, which now holds the normalised MODS.

    Raises:
        ValidateException: with EXCEPTION1 when no ``mods:mods`` element
            is present.
    """
    # 1: Get MODS from the lxmlNode.
    found = lxmlNode.xpath('(//mods:mods)[1]', namespaces=self._nsMap)

    # Our normalisation functions to call.
    normalisers = [self._convertFullMods2GHMods]

    if len(found) == 0:
        # Should never happen at runtime: the record is expected to have
        # been validated up front.
        raise ValidateException(formatExceptionLine(EXCEPTION1, prefix=STR_MODS))

    original_mods = found[0]

    # 2: Normalise it.
    normalised_xml = ''.join(
        normalise(original_mods) for normalise in normalisers)

    # 3: Put it back in place.
    original_mods.getparent().replace(
        original_mods, etree.fromstring(normalised_xml))

    # 4: Return the lxmlNode containing the normalised MODS.
    return lxmlNode
def _isValidTitleInfoTag(self, lxmlNode):
    """Validate the titles of a titleInfo element and drop empty subtitles.

    Direct ``mods:subTitle`` children without text are removed from their
    parent.

    Returns:
        True when no exception was raised.

    Raises:
        ValidateException: with EXCEPTION3 when a direct ``mods:title``
            child has no (non-whitespace) text.
    """
    mods_ns = self._nsMap['mods']

    def is_blank(element):
        # True for missing, empty or whitespace-only text.
        return not (element.text and element.text.strip())

    if any(is_blank(title)
           for title in lxmlNode.iterfind('{%s}title' % mods_ns)):
        raise ValidateException(formatExceptionLine(EXCEPTION3, prefix=STR_MODS))

    for subtitle in lxmlNode.iterfind('{%s}subTitle' % mods_ns):
        if is_blank(subtitle):
            subtitle.getparent().remove(subtitle)

    return True
def _normalizeTitleinfo(self, modsNode):
    """Validate every direct ``mods:titleInfo`` child of ``modsNode``.

    Each titleInfo is passed to ``_isValidTitleInfoTag``; a titleInfo for
    which that call returns a false value is removed.

    Raises:
        ValidateException: with EXCEPTION2 when ``modsNode`` has no
            titleInfo child at all; whatever ``_isValidTitleInfoTag``
            raises is propagated.
    """
    title_info_tag = '{%s}titleInfo' % self._nsMap['mods']
    seen = 0

    for title_info in modsNode.iterfind(title_info_tag):
        seen += 1
        if not self._isValidTitleInfoTag(title_info):
            modsNode.remove(title_info)

    if seen == 0:
        raise ValidateException(formatExceptionLine(EXCEPTION2, prefix=STR_MODS))
def _getHumanStartPage(self, lxmlNode):
    """Return the normalised humanStartPage ``didl:Item`` as a string.

    The humanStartPage item is located with three XPath variants, tried
    in order: ``rdf:type/@rdf:resource``, ``rdf:type/@resource`` (LOGGER9
    is logged when this one matches) and ``dip:ObjectType`` (LOGGER10 is
    logged when this one matches). Only the first matching item is used.

    Returns:
        The item as an XML string, or ``""`` (after logging LOGGER11)
        when the record has no humanStartPage item.

    Raises:
        ValidateException: with EXCEPTION11 when the resource has no
            ``@ref`` or ``comm.isURL`` rejects it.
    """
    didl_hsp_item = lxmlNode.xpath(
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]',
        namespaces=self._nsMap)

    if len(didl_hsp_item) == 0:
        # Fallback: @resource without the rdf namespace.
        didl_hsp_item = lxmlNode.xpath(
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]',
            namespaces=self._nsMap)
        if len(didl_hsp_item) > 0:
            self.do.logMsg(self._uploadid, LOGGER9, prefix=STR_DIDL)

    if len(didl_hsp_item) == 0:
        # Fallback: dip namespace.
        didl_hsp_item = lxmlNode.xpath(
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]',
            namespaces=self._nsMap)
        if len(didl_hsp_item) > 0:
            self.do.logMsg(self._uploadid, LOGGER10, prefix=STR_DIDL)

    if len(didl_hsp_item) == 0:
        self.do.logMsg(self._uploadid, LOGGER11, prefix=STR_DIDL)
        return ""

    hsp_item = didl_hsp_item[0]
    uriref = hsp_item.xpath(
        'self::didl:Item/didl:Component/didl:Resource/@ref',
        namespaces=self._nsMap)
    mimetype = hsp_item.xpath(
        'self::didl:Item/didl:Component/didl:Resource/@mimeType',
        namespaces=self._nsMap)

    if len(mimetype) == 0:
        self.do.logMsg(self._uploadid, LOGGER13, prefix=STR_DIDL)
        # A missing mimeType is only logged, not rejected, so a value is
        # still needed for the output. Previously this path crashed with
        # an IndexError on mimetype[0].
        # NOTE(review): 'text/html' is chosen here as the default for a
        # human start page - confirm it matches the LOGGER13 message.
        resource_mimetype = 'text/html'
    else:
        resource_mimetype = mimetype[0]
        if not comm.isMimeType(resource_mimetype):
            self.do.logMsg(self._uploadid, LOGGER12 + resource_mimetype,
                           prefix=STR_DIDL)

    # Validate the stripped value: that is what is emitted below, and it
    # is how _getObjectfiles validates its resource URLs.
    if len(uriref) == 0 or not comm.isURL(uriref[0].strip()):
        raise ValidateException(
            formatExceptionLine(EXCEPTION11, prefix=STR_DIDL))

    return """<didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <rdf:type rdf:resource="info:eu-repo/semantics/humanStartPage"/> </didl:Statement> </didl:Descriptor> <didl:Component> <didl:Resource ref="%s" mimeType="%s"/> </didl:Component> </didl:Item>""" % (
        escapeXml(comm.urlQuote(uriref[0].strip())),
        escapeXml(resource_mimetype))
def _tlOrigininfo(self, childNode):
    """Normalise the dates and publishers of a ``mods:originInfo`` element.

    Children with ``@encoding`` 'w3cdtf' or 'iso8601' are checked with
    ``_validateISO8601``: valid ones get their text rewritten by
    ``_granulateDate`` and their encoding set to 'w3cdtf', invalid ones
    are removed. ``mods:publisher`` children without text are removed.

    Returns:
        ``childNode`` when it still has child elements, otherwise None.

    Raises:
        ValidateException: with EXCEPTION7 when no valid, encoded
            ``mods:dateIssued`` child was found.
    """
    # Select all children of originInfo carrying a supported encoding.
    encoded_dates = childNode.xpath(
        "self::mods:originInfo/child::*[@encoding='w3cdtf' or @encoding='iso8601']",
        namespaces=self._nsMap)

    found_date_issued = False
    for element in encoded_dates:
        if not self._validateISO8601(element.text):
            element.getparent().remove(element)
            continue
        element.text = self._granulateDate(element.text)
        element.set('encoding', 'w3cdtf')
        if element.tag == '{%s}dateIssued' % self._nsMap['mods']:
            found_date_issued = True

    if not found_date_issued:
        raise ValidateException(formatExceptionLine(EXCEPTION7, prefix=STR_MODS))

    publishers = childNode.xpath("self::mods:originInfo/mods:publisher",
                                 namespaces=self._nsMap)
    for publisher in publishers:
        if not (publisher.text and publisher.text.strip()):
            publisher.getparent().remove(publisher)

    if len(childNode) > 0:
        return childNode
    return None
def _validateGenre(self, modsNode):
    """Keep exactly one recognised ``mods:genre`` child and normalise it.

    The first genre whose lowercased, stripped text contains a key of
    GENRES_SEMANTIEK gets the corresponding value as its text. Every
    other genre child, recognised or not, is removed.

    Raises:
        ValidateException: with EXCEPTION6 when no genre was recognised.
    """
    genre_tag = '{' + self._nsMap.get('mods') + '}genre'
    accepted = False

    # Handle every 'genre' element as a separate node.
    for genre in modsNode.iterfind(genre_tag):
        mapped = None
        if not accepted and genre.text:
            needle = genre.text.strip().lower()
            for fragment, fq_genre in GENRES_SEMANTIEK.iteritems():
                if fragment in needle:
                    # Found a (lowercased) genre.
                    mapped = fq_genre
                    break

        if mapped is None:
            modsNode.remove(genre)
        else:
            genre.text = mapped
            accepted = True

    if not accepted:
        raise ValidateException(formatExceptionLine(EXCEPTION6, prefix=STR_MODS))
def _getObjectfiles(self, lxmlNode):
    """Return all normalised objectFile ``didl:Item`` elements as a string.

    The objectFile items are located with three XPath variants, tried in
    order: ``rdf:type/@rdf:resource``, ``rdf:type/@resource`` (LOGGER6 is
    logged when this one matches) and ``dip:ObjectType`` (LOGGER7 is
    logged when this one matches). For each item the identifier, access
    rights, modification date, description, version and resources are
    copied into a freshly built item.

    Returns:
        The concatenated items, or ``''`` when there are no objectFiles.

    Raises:
        ValidateException: with EXCEPTION8 when an item has no
            accessRights; with the value plus EXCEPTION12 when the
            accessRights value matches no known key; with EXCEPTION9 plus
            the URI when ``comm.isURL`` rejects a resource ``@ref``; with
            EXCEPTION10 when an item has no resource carrying both
            ``@mimeType`` and ``@ref``.
    """
    objectfiles = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]',
        namespaces=self._nsMap)

    if len(objectfiles) == 0:
        # Fallback: @resource without the rdf namespace.
        objectfiles = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]',
            namespaces=self._nsMap)
        if len(objectfiles) > 0:
            self.do.logMsg(self._uploadid, LOGGER6, prefix=STR_DIDL)

    if len(objectfiles) == 0:
        # Fallback: dip namespace.
        objectfiles = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]',
            namespaces=self._nsMap)
        if len(objectfiles) > 0:
            self.do.logMsg(self._uploadid, LOGGER7, prefix=STR_DIDL)

    parts = []
    for objectfile in objectfiles:
        #1: Define the correct objectFile descriptor.
        parts.append('<didl:Item><didl:Descriptor><didl:Statement mimeType="application/xml"><rdf:type rdf:resource="info:eu-repo/semantics/objectFile"/></didl:Statement></didl:Descriptor>')

        #2: Identifier (effectively mandatory, although often not
        # implemented); copied only when present.
        pi = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
            namespaces=self._nsMap)
        if len(pi) > 0:
            parts.append(descr_templ % (
                '<dii:Identifier>' + escapeXml(pi[0].strip()) + '</dii:Identifier>'))

        #3: AccessRights are mandatory and must match a known key.
        arights = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:accessRights/text()',
            namespaces=self._nsMap)
        if len(arights) == 0:
            raise ValidateException(
                formatExceptionLine(EXCEPTION8, prefix=STR_DIDL))
        declared_rights = arights[0].strip().lower()
        for key, value in accessRights.iteritems():
            if declared_rights.find(key) >= 0:
                parts.append(descr_templ % (
                    '<dcterms:accessRights>' + value + '</dcterms:accessRights>'))
                break
        else:
            raise ValidateException(
                formatExceptionLine(arights[0] + EXCEPTION12, prefix=STR_DIDL))

        #4: Date modified (effectively mandatory, although often not
        # implemented); copied only when it is valid ISO 8601.
        modified = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
            namespaces=self._nsMap)
        if len(modified) > 0 and comm.isISO8601(modified[0]):
            parts.append(descr_templ % (
                '<dcterms:modified>' + modified[0].strip() + '</dcterms:modified>'))

        #5: 'File' description.
        descr = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dc:description/text()',
            namespaces=self._nsMap)
        if len(descr) > 0:
            parts.append(descr_templ % (
                '<dc:description>' + escapeXml(descr[0].strip()) + '</dc:description>'))

        #6: dcterms:available (embargo), dcterms:dateSubmitted and the
        # deprecated dcterms:issued are deliberately skipped: they are
        # not in EduStandaard.

        #7: Published version (author/publisher). Both may be available:
        # we take the first one.
        # NOTE(review): only the first rdf:type/@rdf:resource value of the
        # item is examined - confirm that is intended when the objectFile
        # type itself is listed first.
        pubVersion = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/rdf:type/@rdf:resource',
            namespaces=self._nsMap)
        if len(pubVersion) > 0:
            declared_version = pubVersion[0].strip().lower()
            for key, value in pubVersions.iteritems():
                if declared_version.find(key) >= 0:
                    parts.append(descr_templ % (
                        '<rdf:type rdf:resource="' + value + '"/>'))
                    break

        #8: MANDATORY resources and mimetypes. mimeType is required by
        # the DIDL schema, @ref is not; we need both.
        didl_resources = objectfile.xpath(
            'self::didl:Item/didl:Component/didl:Resource[@mimeType and @ref]',
            namespaces=self._nsMap)
        resources = ''
        for resource in didl_resources:
            mimeType = resource.xpath('self::didl:Resource/@mimeType',
                                      namespaces=self._nsMap)
            uri = resource.xpath('self::didl:Resource/@ref',
                                 namespaces=self._nsMap)
            if len(mimeType) == 0 or len(uri) == 0:
                continue
            if not comm.isMimeType(mimeType[0]):
                # An unrecognised mimeType is logged but still copied.
                self.do.logMsg(self._uploadid, LOGGER8 + mimeType[0],
                               prefix=STR_DIDL)
            if not comm.isURL(uri[0].strip()):
                raise ValidateException(
                    formatExceptionLine(EXCEPTION9 + uri[0], prefix=STR_DIDL))
            resources += """<didl:Resource mimeType="%s" ref="%s"/>""" % (
                escapeXml(mimeType[0].strip()),
                escapeXml(comm.urlQuote(uri[0].strip())))

        if resources == '':
            raise ValidateException(
                formatExceptionLine(EXCEPTION10, prefix=STR_DIDL))
        parts.append("""<didl:Component> %s </didl:Component>""" % (resources))
        parts.append('</didl:Item>')

    return ''.join(parts)
def _getTopItem(self, lxmlNode):
    """Return the opening part of the normalised top-level ``didl:Item``.

    The fragment holds the persistent identifier, the most recent
    ``dcterms:modified`` date found on any item, and the resource
    (mimeType and location) of the record. The ``didl:Item`` element is
    left open in the returned string.

    Raises:
        ValidateException: with EXCEPTION1 when there is no top-level
            identifier, EXCEPTION0 plus the identifier when
            ``comm.isURNNBN`` rejects it, EXCEPTION3 when there is no
            top-level modified date, EXCEPTION2 plus the date when it is
            not ISO 8601, EXCEPTION4 when no resource location is found,
            and EXCEPTION5 plus the location when ``comm.isURL`` rejects
            it.
    """
    modified = ''
    mimetype = "application/xml"

    #1: Get the persistent identifier.
    pidlist = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
        namespaces=self._nsMap)
    if len(pidlist) == 0:
        raise ValidateException(
            formatExceptionLine(EXCEPTION1, prefix=STR_DIDL))
    pid = pidlist[0].strip()
    if not comm.isURNNBN(pid):
        raise ValidateException(
            formatExceptionLine(EXCEPTION0 + pid, prefix=STR_DIDL))

    #2: Get the top-level modification date; it must be present and valid.
    tl_modified = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
        namespaces=self._nsMap)
    if len(tl_modified) == 0:
        raise ValidateException(
            formatExceptionLine(EXCEPTION3, prefix=STR_DIDL))
    if not comm.isISO8601(tl_modified[0]):
        raise ValidateException(
            formatExceptionLine(EXCEPTION2 + tl_modified[0], prefix=STR_DIDL))

    # Collect the modified dates of all items and pick the most recent
    # one for the top-level item. Keys are 'date time' strings, so the
    # greatest key is the latest date.
    all_modified = lxmlNode.xpath(
        '//didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
        namespaces=self._nsMap)
    datedict = {}
    for date in all_modified:
        stamp = date.strip()
        if comm.isISO8601(stamp):
            pd = parseDate(stamp)
            datedict["%s %s" % (str(pd.date()), str(pd.time()))] = stamp
    if datedict:
        modified = datedict[max(datedict)]

    if tl_modified[0].strip() != modified:
        # The top-level date is not the most recent one; only logged.
        self.do.logMsg(self._uploadid, LOGGER1, prefix=STR_DIDL)

    #3: Get the mimetype of the top-level resource (default kept if absent).
    mimetypelist = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@mimeType',
        namespaces=self._nsMap)
    if len(mimetypelist) > 0:
        mimetype = mimetypelist[0].strip()
        if not comm.isMimeType(mimetype):
            self.do.logMsg(self._uploadid, LOGGER2 + mimetype, prefix=STR_DIDL)

    #4: Get the resource location. Every XPath is a separate argument:
    # a missing comma after the text() expression used to concatenate it
    # with the next one, which disabled both lookups.
    pidlocation = self._findAndBindFirst(
        lxmlNode,
        '%s',
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@ref',
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/text()',
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  # DIDL 3.0
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  # DIDL 3.0, without @rdf:resource
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  # fallback DIDL 2.3.1
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  # fallback DIDL 3.0
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  # fallback DIDL 3.0, without @rdf:resource
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref'  # fallback DIDL 2.3.1
    ).strip()

    if pidlocation == '':
        raise ValidateException(
            formatExceptionLine(EXCEPTION4, prefix=STR_DIDL))
    if not comm.isURL(pidlocation):
        raise ValidateException(
            formatExceptionLine(EXCEPTION5 + pidlocation, prefix=STR_DIDL))

    # The quoted URL is XML-escaped as well, as in the humanStartPage and
    # objectFile items, so that e.g. '&' cannot break the attribute.
    return """<didl:Item> <didl:Descriptor><didl:Statement mimeType="application/xml"><dii:Identifier>%s</dii:Identifier></didl:Statement></didl:Descriptor> <didl:Descriptor><didl:Statement mimeType="application/xml"><dcterms:modified>%s</dcterms:modified></didl:Statement></didl:Descriptor> <didl:Component><didl:Resource mimeType="%s" ref="%s"/></didl:Component>""" % (
        escapeXml(pid),
        modified,
        escapeXml(mimetype),
        escapeXml(comm.urlQuote(pidlocation)))
def _checkOriginInfoDateIssued(self, modsNode):
    """Require at least one ``mods:originInfo/mods:dateIssued`` element.

    Raises:
        ValidateException: with EXCEPTION7 when the
            ``//mods:mods/mods:originInfo/mods:dateIssued`` query yields
            nothing.
    """
    issued_dates = modsNode.xpath(
        "//mods:mods/mods:originInfo/mods:dateIssued",
        namespaces=self._nsMap)
    if not issued_dates:
        raise ValidateException(formatExceptionLine(EXCEPTION7, prefix=STR_MODS))