Example #1
0
    def _detectAndValidate(self, *args, **kwargs):
        allArguments = list(args) + kwargs.values()
        for arg in allArguments:
            if type(arg) == _ElementTree: #Should be only one...
                
                for strName, strXPath, schema in self._xmlSchemas:
                    ## Doe xpath op betreffende XML/argument:
                    # Wij laten hier het volledige upload-record voorbijkomen, want die is later nodig. Echter is de metadata die wij moeten valideren beschikbaar als text en NIET als LXM-object.
                    # Wij gaan deze dus nu eerst opzoeken en converteren naar een LXML node.
                    record_part = arg.xpath("//document:document/document:part[@name='record']/text()", namespaces=self._namespacesMap)
                    record_lxml = fromstring(record_part[0])
                    xml = record_lxml.xpath(strXPath, namespaces=self._namespacesMap)

                    #################
                    # xml = arg.xpath(strXPath, namespaces=self._namespacesMap)
                    if len(xml) > 0:
                        schema.validate(xml[0])
                        if schema.error_log:
                            exception = ValidateException(formatXSDException(strName + " is NOT valid.", None, schema)) #, arg
                            self.do.logException(exception) # Sends ValidateException back to the Harvester, stops processing this record.
                            raise exception
                    else:
                        exception = ValidateException(formatExceptionLine("Mandatory " + strName + " NOT found."))
                        self.do.logException(exception) # Sends ValidateException back to the Harvester, stops processing this record.
                        raise exception
Example #2
0
    def _getDescriptiveMetadata(self, lxmlNode):
        """Return the normalised descriptiveMetadata ``<didl:Item>`` as a string.

        The descriptiveMetadata items are located by ``rdf:type/@rdf:resource``
        first, then by ``rdf:type/@resource`` (no rdf namespace on the
        attribute) and finally by ``dip:ObjectType`` text. A message is logged
        whenever one of the two fallbacks produced a match. The output is
        always written with the rdf namespace form.

        The first located item that contains a ``mods:mods`` element is used;
        its identifier and modification-date descriptors and the serialised
        MODS are substituted into the item template.

        Raises:
            ValidateException: when no descriptiveMetadata item is found
                (EXCEPTION7) or none of the found items contains MODS
                (EXCEPTION6).
        """
        candidates = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/descriptiveMetadata"]',
            namespaces=self._nsMap)

        # Fallback to @resource (no rdf namespace), if available...
        if len(candidates) == 0:
            candidates = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/descriptiveMetadata"]',
                namespaces=self._nsMap)
            if len(candidates) > 0:
                self.do.logMsg(self._uploadid, LOGGER3, prefix=STR_DIDL)

        # Fallback to dip namespace, if available...
        if len(candidates) == 0:
            candidates = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/descriptiveMetadata"]',
                namespaces=self._nsMap)
            if len(candidates) > 0:
                self.do.logMsg(self._uploadid, LOGGER4, prefix=STR_DIDL)

        if len(candidates) == 0:
            raise ValidateException(
                formatExceptionLine(EXCEPTION7, prefix=STR_DIDL))

        # Look for the first descriptiveMetadata item containing MODS:
        for candidate in candidates:
            mods_nodes = candidate.xpath('self::didl:Item//mods:mods',
                                         namespaces=self._nsMap)
            if len(mods_nodes) > 0:
                chosen_item = candidate
                chosen_mods = mods_nodes[0]
                break
        else:
            raise ValidateException(
                formatExceptionLine(EXCEPTION6, prefix=STR_DIDL))

        return """<didl:Item>
                                    <didl:Descriptor>
                                        <didl:Statement mimeType="application/xml">
                                            <rdf:type rdf:resource="info:eu-repo/semantics/descriptiveMetadata"/>
                                        </didl:Statement>
                                    </didl:Descriptor>
                                    %s%s<didl:Component>
                                        <didl:Resource mimeType="application/xml">
                                           %s 
                                        </didl:Resource>
                                    </didl:Component>
                                </didl:Item>""" % (
            self._getIdentifierDescriptor(chosen_item),
            self._getDateModifiedDescriptor(chosen_item),
            tostring(chosen_mods))
Example #3
0
 def _validateNames(self, modsNode):
     """Clean up the ``mods:name`` children of ``modsNode`` in place.

     For every direct ``name`` child: the text of all descendant
     ``roleTerm`` elements is stripped and ``namePart`` children that are
     empty or whitespace-only are removed. A ``name`` without a marcrelator
     code roleTerm, or without any remaining ``namePart``, is removed from
     ``modsNode``.

     Raises:
         ValidateException: when the first marcrelator role code of a kept
             name is rejected by ``__isValidRoleTerm`` (EXCEPTION4), or when
             the document has no ``mods:mods/mods:name`` left afterwards
             (EXCEPTION5).
     """
     mods_ns = self._nsMap['mods']
     name_tag = '{%s}name' % mods_ns
     role_term_path = './/{%s}roleTerm' % mods_ns
     name_part_tag = '{%s}namePart' % mods_ns

     for name_el in modsNode.iterfind(name_tag):
         for term in name_el.iterfind(role_term_path):
             if term.text:
                 term.text = term.text.strip()

         marc_roles = name_el.xpath(
             "self::mods:name/mods:role/mods:roleTerm[@type='code' and @authority='marcrelator']/text()",
             namespaces=self._nsMap)

         # Remove empty nameParts.
         for part in name_el.iterfind(name_part_tag):
             if not (part.text and part.text.strip()):
                 name_el.remove(part)

         # No marcrelator code roleTerm (or an empty one), or no nameParts
         # left: drop this name element altogether.
         if not marc_roles or name_el.find(name_part_tag) is None:
             modsNode.remove(name_el)
             continue

         if not self.__isValidRoleTerm(marc_roles[0]):
             raise ValidateException(formatExceptionLine( EXCEPTION4 + marc_roles[0], prefix=STR_MODS))

     remaining = modsNode.xpath("//mods:mods/mods:name", namespaces=self._nsMap)
     if len(remaining) <= 0:
         raise ValidateException(formatExceptionLine(EXCEPTION5, prefix=STR_MODS))
 def testValidationErrors(self):
     """A ValidateException on 'add' must yield a failed update response.

     The observer is primed to raise ``ValidateException`` for 'add'; the
     response is then expected to carry operation status 'fail' and a
     diagnostic with the expected uri, details and message.
     """
     self.observer.exceptions['add'] = ValidateException('Some <Exception>')
     headers, result = self.performRequest(self.createRequestBody())
     self.assertIn(
         """<ucp:operationStatus>fail</ucp:operationStatus>""",
         result,
         result)
     diag = parse(StringIO(result))
     # assertEqual is used instead of the deprecated assertEquals alias.
     self.assertEqual(
         "info:srw/diagnostic/12/12",
         xpathFirst(
             diag,
             '/srw:updateResponse/srw:diagnostics/diag:diagnostic/diag:uri/text()'
         ))
     self.assertEqual(
         "Some <Exception>",
         xpathFirst(
             diag,
             '/srw:updateResponse/srw:diagnostics/diag:diagnostic/diag:details/text()'
         ))
     self.assertEqual(
         "Invalid data:  record rejected",
         xpathFirst(
             diag,
             '/srw:updateResponse/srw:diagnostics/diag:diagnostic/diag:message/text()'
         ))
Example #5
0
    def _normaliseRecord(self, lxmlNode):
        """Normalise the first MODS element of ``lxmlNode`` in place.

        Steps:
          1. Select the first ``mods:mods`` element in the document.
          2. Run it through the normalisation functions and concatenate the
             strings they return.
          3. Parse that string and put it in the place of the original MODS.
          4. Return ``lxmlNode``, which now holds the normalised MODS.

        Raises:
            ValidateException: when the document contains no ``mods:mods``
                element (EXCEPTION1).
        """
        # 1: Get MODS from the lxmlNode.
        found = lxmlNode.xpath('(//mods:mods)[1]', namespaces=self._nsMap)

        # Our normalisation functions to call:
        normalisers = [ self._convertFullMods2GHMods ]

        if len(found) == 0:
            # This should never happen at runtime: the record should have
            # been validated up front...
            raise ValidateException(formatExceptionLine(EXCEPTION1, prefix=STR_MODS))

        original_mods = found[0]

        # 2: Normalise it.
        normalised = ''.join(
            normalise(original_mods) for normalise in normalisers)

        # 3: Put it back in place.
        parent = original_mods.getparent()
        parent.replace(original_mods, etree.fromstring(normalised))

        # 4: Return the lxmlNode containing the normalised MODS.
        return lxmlNode
    def testCollectLogWithErrors(self):
        """Errors raised during an update must end up in the log collector.

        Three requests are performed: a delete for which the observer raises
        a plain ``Exception``, an add for which it raises
        ``ValidateException``, and a body that is not well-formed XML. After
        each one the content of ``self.logCollector`` is checked.
        """
        self.observer.exceptions['delete'] = Exception('Some <Exception>')
        requestBody = self.createRequestBody(action=DELETE,
                                             recordIdentifier='idDelete')
        headers, result = self.performRequest(requestBody)
        # assertEqual throughout: assertEquals is a deprecated alias and the
        # last check of this test already used assertEqual.
        self.assertEqual(
            dict(sruRecordUpdate=dict(delete=['idDelete'],
                                      errorType=['Exception'],
                                      errorMessage=["Some <Exception>"])),
            self.logCollector)

        self.observer.exceptions['add'] = ValidateException('Nee')
        requestBody = self.createRequestBody(action=CREATE,
                                             recordIdentifier='idAdd')
        headers, result = self.performRequest(requestBody)
        self.assertEqual(
            dict(sruRecordUpdate=dict(add=['idAdd'],
                                      invalid=['idAdd'],
                                      errorType=['ValidateException'],
                                      errorMessage=["Nee"])),
            self.logCollector)

        headers, result = self.performRequest(
            '<srw:updateRequest>Will raise XMLSyntaxError')
        sru_error = self.logCollector['sruRecordUpdate']
        self.assertEqual(['XMLSyntaxError'], sru_error['errorType'])
        self.assertTrue(sru_error['errorMessage'][0].startswith(
            'Namespace prefix srw on updateRequest is not defined, line 1, column 19'
        ))
Example #7
0
    def _tlOrigininfo(self, childNode):
        hasDateIssued = False
        ## Select all children from originInfo having 'encoding' attribute:
        children = childNode.xpath(
            "self::mods:originInfo/child::*[@encoding='w3cdtf' or @encoding='iso8601']",
            namespaces=self._nsMap)
        if len(children) > 0:
            for child in children:
                if self._validateISO8601(child.text):
                    child.text = self._granulateDate(child.text)
                    child.set('encoding', 'w3cdtf')
                    if child.tag == ('{%s}dateIssued') % self._nsMap['mods']:
                        hasDateIssued = True
                else:
                    child.getparent().remove(child)
        if not hasDateIssued:
            raise ValidateException(
                formatExceptionLine(EXCEPTION7, prefix=STR_MODS))

        for child in childNode.xpath("self::mods:originInfo/mods:publisher",
                                     namespaces=self._nsMap):
            if not child.text:
                child.getparent().remove(child)

        return childNode if len(childNode) > 0 else None
Example #8
0
 def _isValidTitleInfoTag(self, lxmlNode):
     for title in lxmlNode.iterfind(('{%s}title') % self._nsMap['mods']):
         if not title.text or not title.text.strip():
             raise ValidateException(formatExceptionLine(EXCEPTION3, prefix=STR_MODS))
     for subtitle in lxmlNode.iterfind(('{%s}subTitle') % self._nsMap['mods']):
         if not subtitle.text or not subtitle.text.strip():
             subtitle.getparent().remove(subtitle)
     return True
Example #9
0
 def _normalizeTitleinfo(self, modsNode):
     ## Select all titleInfo's
     hasTitleInfo = False
     for child in modsNode.iterfind(('{%s}titleInfo') % self._nsMap['mods']):
         hasTitleInfo = True 
         if not self._isValidTitleInfoTag(child):
             modsNode.remove(child)
     if not hasTitleInfo:
         raise ValidateException(formatExceptionLine(EXCEPTION2, prefix=STR_MODS))
    def _detectAndValidate(self, *args, **kwargs):
        allArguments = list(args) + kwargs.values()
        for arg in allArguments:
            if type(arg) == _ElementTree:  #Should be only one...

                for strName, strXPath, schema in self._xmlSchemas:
                    ## Doe xpath op betreffende XML/argument:
                    xml = arg.xpath(strXPath, namespaces=self._namespacesMap)
                    if len(xml) > 0:
                        schema.validate(xml[0])
                        if schema.error_log:
                            exception = ValidateException(
                                formatXSDException(strName + " is NOT valid.",
                                                   None, schema))  #, arg
                            self.do.logException(exception)
                            raise exception
                    else:
                        exception = ValidateException(
                            formatExceptionLine("Mandatory " + strName +
                                                " NOT found."))
                        self.do.logException(exception)
                        raise exception
Example #11
0
    def _getHumanStartPage(self, lxmlNode):
        """Return the normalised humanStartPage ``<didl:Item>`` as a string.

        The humanStartPage item is located by ``rdf:type/@rdf:resource``
        first, then by ``rdf:type/@resource`` and finally by
        ``dip:ObjectType`` text; a message is logged whenever a fallback
        matched. Only the first located item is used.

        Returns:
            The item markup, or an empty string when the document has no
            humanStartPage item (which is logged).

        Raises:
            ValidateException: when the Resource has no ``@ref`` or the ref
                is not accepted by ``comm.isURL`` (EXCEPTION11).
        """
        didl_hsp_item = lxmlNode.xpath(
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]',
            namespaces=self._nsMap)

        # Fallback to @resource (no rdf namespace on the attribute).
        if len(didl_hsp_item) == 0:
            didl_hsp_item = lxmlNode.xpath(
                '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]',
                namespaces=self._nsMap)
            if len(didl_hsp_item) > 0:
                self.do.logMsg(self._uploadid, LOGGER9, prefix=STR_DIDL)

        # Fallback to the dip namespace.
        if len(didl_hsp_item) == 0:
            didl_hsp_item = lxmlNode.xpath(
                '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]',
                namespaces=self._nsMap)
            if len(didl_hsp_item) > 0:
                self.do.logMsg(self._uploadid, LOGGER10, prefix=STR_DIDL)

        if len(didl_hsp_item) == 0:
            self.do.logMsg(self._uploadid, LOGGER11, prefix=STR_DIDL)
            return ""

        hsp_item = didl_hsp_item[0]
        uriref = hsp_item.xpath(
            'self::didl:Item/didl:Component/didl:Resource/@ref',
            namespaces=self._nsMap)
        mimetype = hsp_item.xpath(
            'self::didl:Item/didl:Component/didl:Resource/@mimeType',
            namespaces=self._nsMap)

        if len(mimetype) == 0:
            self.do.logMsg(self._uploadid, LOGGER13, prefix=STR_DIDL)
            # A missing mimeType is only logged, so it must not crash the
            # template below with an IndexError.
            # NOTE(review): 'text/html' is an assumed default for a human
            # start page -- confirm it matches what LOGGER13 announces.
            hsp_mimetype = 'text/html'
        else:
            hsp_mimetype = mimetype[0]
            if not comm.isMimeType(hsp_mimetype):
                self.do.logMsg(self._uploadid,
                               LOGGER12 + hsp_mimetype,
                               prefix=STR_DIDL)

        if len(uriref) == 0 or not comm.isURL(uriref[0]):
            raise ValidateException(
                formatExceptionLine(EXCEPTION11, prefix=STR_DIDL))

        return """<didl:Item>
                    <didl:Descriptor>
                        <didl:Statement mimeType="application/xml">
                            <rdf:type rdf:resource="info:eu-repo/semantics/humanStartPage"/>
                        </didl:Statement>
                    </didl:Descriptor>
                    <didl:Component>
                        <didl:Resource ref="%s" mimeType="%s"/>
                    </didl:Component>
                </didl:Item>""" % (escapeXml(comm.urlQuote(
            uriref[0].strip())), escapeXml(hsp_mimetype))
Example #12
0
 def _validateGenre(self, modsNode):
     """Keep exactly one recognised ``genre`` child of ``modsNode``.

     The first ``genre`` element whose stripped, lower-cased text contains
     a key of ``GENRES_SEMANTIEK`` is kept and its text is replaced by the
     mapped value. Every other ``genre`` element is removed.

     Raises:
         ValidateException: when no ``genre`` element was kept (EXCEPTION6).
     """
     genre_tag = '{' + self._nsMap.get('mods') + '}genre'

     qualified_genre = None
     kept_one = False
     # Loop all 'genre' elements as separate nodes:
     for genre_el in modsNode.iterfind(genre_tag):
         for fragment, qualified in GENRES_SEMANTIEK.iteritems():
             label = genre_el.text
             # Match on the lower-cased genre text.
             if label and label.strip().lower().find(fragment) >= 0:
                 qualified_genre = qualified
                 break

         if kept_one or qualified_genre is None:
             modsNode.remove(genre_el)
             continue

         kept_one = True
         genre_el.text = qualified_genre

     if kept_one:
         return
     raise ValidateException(formatExceptionLine(EXCEPTION6, prefix=STR_MODS))
    def add(self, lxmlNode, **kwargs):
        """Collect index fields from a document and pass them on to ``self.all.add``.

        The 'meta' and 'record' parts of ``lxmlNode`` are stored as text and
        are parsed here. Fields are appended to ``self._fieldslist`` from the
        meta part, from the record itself and, for collections in
        ``WCPEDUCOLLECTION``, from name identifiers looked up through
        ``self.all.lookupNameIds``.

        This is a generator: it yields the result of
        ``self.all.add(fieldslist=..., **kwargs)``.

        Raises:
            ValidateException: when the meta part holds no collection.
        """

        self._fieldslist = []  # reset the fieldslist
        self._nids_aut_enriched.clear()  # Empty the set.
        self._record_pids.clear()
        # A complete meresco:document arrives here as an lxml node:
        # self.uploadid = kwargs['identifier']

        # Get meta, header and metadata part(='long') from the normdoc:
        e_metapart = etree.fromstring(
            lxmlNode.xpath(
                '/document:document/document:part[@name="meta"]/text()',
                namespaces=namespacesmap)[0])
        wcp_collection = e_metapart.xpath(
            '/meta:meta/meta:repository/meta:collection/text()',
            namespaces=namespacesmap)
        if not wcp_collection:
            raise ValidateException(
                "Collection is missing from metapart! Please add collection in WCP."
            )
        self._wcp_collection = wcp_collection[0]

        e_recordpart = etree.fromstring(
            lxmlNode.xpath(
                '/document:document/document:part[@name="record"]/text()',
                namespaces=namespacesmap)[0])

        # Add known metapart fields for all records:
        # (the first xpath match is used; an xpath without a match raises
        # IndexError here)
        for field, xpad in metaFieldNamesToXpath.iteritems():
            self._fieldslist.append(
                (field, e_metapart.xpath(xpad, namespaces=namespacesmap)[0]))
            if self._verbose:
                print 'addField:', field.upper(), "-->", e_metapart.xpath(
                    xpad, namespaces=namespacesmap)[0]

        # The root element of the record depends on the collection:
        record = None
        if self._wcp_collection in WCPNODCOLLECTION:
            record = e_recordpart.xpath(
                '//prs:persoon | //prj:activiteit | //org:organisatie',
                namespaces=namespacesmap)
        else:
            record = e_recordpart.xpath('//norm:normalized/long:knaw_long',
                                        namespaces=namespacesmap)

        self._fillFieldslist(record[0], '')  # Add fields by path in xml.
        self._addAuthorsAndNamesFields(record[0])

        for field, xpad in fieldNamesXpathMap.iteritems(
        ):  # Add fields by xPath
            self._findAndAddToFieldslist(record[0], field, xpad)

        if self._wcp_collection in WCPEDUCOLLECTION:

            # lookupNameIds is iterated as an iterable of iterables of
            # name-identifier strings.
            nidlist = self.all.lookupNameIds(pidlist=self._record_pids)

            for generator in nidlist:
                for nid in generator:
                    # The first two ':'-separated pieces are handed to the
                    # factory -- presumably '<type>:<value>'; verify against
                    # NameIdentifierFactory.
                    splitted = nid.split(":", 2)
                    nameId = NameIdentifierFactory.factory(
                        splitted[0], splitted[1])
                    if nameId.is_valid():
                        self._nids_aut_enriched.add(nameId.get_idx_id())
                        self._nids_aut_enriched.add(nameId.get_id())

            for nid in self._nids_aut_enriched:
                self._fieldslist.append(('nids_aut_enriched', nid))
                if self._verbose:
                    print 'addField:', 'nids_aut_enriched'.upper(), "-->", nid

        # Ready filling fieldslist, now call add method:
        yield self.all.add(fieldslist=self._fieldslist, **kwargs)
Example #14
0
 def _checkOriginInfoDateIssued(self, modsNode):
     if len(modsNode.xpath("//mods:mods/mods:originInfo/mods:dateIssued", namespaces=self._nsMap)) <= 0:
         raise ValidateException(formatExceptionLine(EXCEPTION7, prefix=STR_MODS))
Example #15
0
    def __init__(self, lxmlNode, uploadId):
        """Detect the metadata format of ``lxmlNode`` and store it.

        The format is derived from the elements present in the document:

        * with a DIDL container: MODS plus any rdf-namespaced element gives
          DIDLM30, MODS without rdf gives DIDLM23, exactly one ``oai_dc:dc``
          gives DIDLDC;
        * without DIDL, the first match of: MODS (DIDLM36), ``oai_dc:dc``
          (OAIDC), ``org:organisatie`` (ORG), ``proj:activiteit`` (PROJ),
          ``prs:persoon`` (PRS), ``datacite:resource`` (DATACITE).

        Sets ``self._format`` and ``self._namespace``.

        Args:
            lxmlNode: lxml node or tree holding the uploaded metadata.
            uploadId: identifier of the upload; only used in the error
                message.

        Raises:
            ValidateException: when none of the known formats is detected.
        """
        namespaces = Namespaces.NAMESPACEMAP

        def count(path):
            # XPath count() is returned as a float; normalise it to an int.
            return int(
                lxmlNode.xpath("count(%s)" % path, namespaces=namespaces))

        md_format = None
        # Check for a DIDL container, max. 1 according to EduStandaard.
        if len(lxmlNode.xpath('//didl:DIDL[1]', namespaces=namespaces)) > 0:
            if count('//mods:mods') >= 1:  # Check for MODS container.
                # Found MODS: the presence of the rdf namespace
                # differentiates between the known versions.
                if count('//rdf:*') > 0:
                    md_format = MetadataFormat.DIDLM30
                else:
                    md_format = MetadataFormat.DIDLM23
            elif count('//oai_dc:dc') == 1:  # Check for OAI_DC container.
                md_format = MetadataFormat.DIDLDC
        elif count('//mods:mods') >= 1:  # Full MODS (MODS only)
            md_format = MetadataFormat.DIDLM36
        elif count('//oai_dc:dc') > 0:  # Neither DIDL nor MODS: plain DC
            md_format = MetadataFormat.OAIDC
        elif count('//org:organisatie') > 0:  # NOD organization
            md_format = MetadataFormat.ORG
        elif count('//proj:activiteit') > 0:  # NOD project
            md_format = MetadataFormat.PROJ
        elif count('//prs:persoon') > 0:  # NOD person
            md_format = MetadataFormat.PRS
        elif count('//datacite:resource') > 0:  # DataCite
            md_format = MetadataFormat.DATACITE

        if md_format is None:
            raise ValidateException(
                "No known EduStandaard format was found in the metadata for uploadid: %s! This record cannot be processed."
                % (uploadId,))

        self._format = md_format
        # Look the namespace up for the detected format; the builtin
        # ``format`` function was passed here before.
        self._namespace = Namespaces.getNamespace(md_format)
Example #16
0
    def _getObjectfiles(self, lxmlNode):
        of_container = ''
        objectfiles = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]',
            namespaces=self._nsMap)
        if len(objectfiles) == 0:
            objectfiles = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]',
                namespaces=self._nsMap)
            if len(objectfiles) > 0:
                self.do.logMsg(self._uploadid, LOGGER6, prefix=STR_DIDL)
        if len(objectfiles) == 0:
            objectfiles = lxmlNode.xpath(
                '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]',
                namespaces=self._nsMap)
            if len(objectfiles) > 0:
                self.do.logMsg(self._uploadid, LOGGER7, prefix=STR_DIDL)
        for objectfile in objectfiles:
            #1:Define correct ObjectFile descriptor:
            of_container += '<didl:Item><didl:Descriptor><didl:Statement mimeType="application/xml"><rdf:type rdf:resource="info:eu-repo/semantics/objectFile"/></didl:Statement></didl:Descriptor>'

            #2: Check geldige Identifier (feitelijk verplicht, hoewel vaak niet geimplemeteerd...)
            pi = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
                namespaces=self._nsMap)
            if len(pi) > 0:
                of_container += descr_templ % ('<dii:Identifier>' + escapeXml(
                    pi[0].strip()) + '</dii:Identifier>')

        #3: Check op geldige AccessRights:
            arights = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:accessRights/text()',
                namespaces=self._nsMap)
            if len(arights) > 0:
                for key, value in accessRights.iteritems():
                    if arights[0].strip().lower().find(key) >= 0:
                        of_container += descr_templ % (
                            '<dcterms:accessRights>' + value +
                            '</dcterms:accessRights>')
                        break
                else:
                    raise ValidateException(
                        formatExceptionLine(arights[0] + EXCEPTION12,
                                            prefix=STR_DIDL))
            else:
                raise ValidateException(
                    formatExceptionLine(EXCEPTION8, prefix=STR_DIDL))

        #4: Check geldige datemodified (feitelijk verplicht, hoewel vaak niet geimplemeteerd...)
            modified = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
                namespaces=self._nsMap)
            if len(modified) > 0 and comm.isISO8601(modified[0]):
                of_container += descr_templ % ('<dcterms:modified>' +
                                               modified[0].strip() +
                                               '</dcterms:modified>')

        #5: Check for 'file' description:
            descr = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/dc:description/text()',
                namespaces=self._nsMap)
            if len(descr) > 0:
                of_container += descr_templ % ('<dc:description>' + escapeXml(
                    descr[0].strip()) + '</dc:description>')

        ## SKIPPING: Not in EduStandaard.
        #6.0: Check for embargo:
        #    embargo = objectfile.xpath('self::didl:Item/didl:Descriptor/didl:Statement/dcterms:available/text()', namespaces=self._nsMap)
        #    if len(embargo) > 0 and comm.isISO8601(embargo[0]):
        #        of_container += descr_templ % ('<dcterms:available>'+embargo[0].strip()+'</dcterms:available>')

        ## SKIPPING: Not in EduStandaard.
        #6.1: Check for dateSubmitted:
        #    dembargo = objectfile.xpath('self::didl:Item/didl:Descriptor/didl:Statement/dcterms:dateSubmitted/text()', namespaces=self._nsMap)
        #    if len(dembargo) > 0 and comm.isISO8601(dembargo[0]):
        #        of_container += descr_templ % ('<dcterms:dateSubmitted>'+dembargo[0].strip()+'</dcterms:dateSubmitted>')
        #    else:
        #        #6.2: Check for issued (depricated, normalize to dateSubmitted):
        #        issued = objectfile.xpath('self::didl:Item/didl:Descriptor/didl:Statement/dcterms:issued/text()', namespaces=self._nsMap)
        #        if len(issued) > 0 and comm.isISO8601(issued[0]):
        #            of_container += descr_templ % ('<dcterms:dateSubmitted>'+issued[0].strip()+'</dcterms:dateSubmitted>')

        #7: Check for published version(author/publisher):
            pubVersion = objectfile.xpath(
                'self::didl:Item/didl:Descriptor/didl:Statement/rdf:type/@rdf:resource',
                namespaces=self._nsMap)
            if len(
                    pubVersion
            ) > 0:  ## Both (author/publisher) may be available: we'll take the first one...
                for key, value in pubVersions.iteritems():
                    if pubVersion[0].strip().lower().find(key) >= 0:
                        of_container += descr_templ % (
                            '<rdf:type rdf:resource="' + value + '"/>')
                        break

        #8:Check for MANDATORY resources and mimetypes:
            didl_resources = objectfile.xpath(
                'self::didl:Item/didl:Component/didl:Resource[@mimeType and @ref]',
                namespaces=self._nsMap)
            resources = ''
            _url_list = []
            for resource in didl_resources:
                mimeType = resource.xpath('self::didl:Resource/@mimeType',
                                          namespaces=self._nsMap)
                uri = resource.xpath('self::didl:Resource/@ref',
                                     namespaces=self._nsMap)
                ## We need both mimeType and URI: (MIMETYPE is required by DIDL schema, @ref not).
                if len(mimeType) > 0 and len(uri) > 0:
                    if not comm.isMimeType(mimeType[0]):
                        self.do.logMsg(self._uploadid,
                                       LOGGER8 + mimeType[0],
                                       prefix=STR_DIDL)
                    if comm.isURL(uri[0].strip()):
                        resources += """<didl:Resource mimeType="%s" ref="%s"/>""" % (
                            escapeXml(mimeType[0].strip()),
                            escapeXml(comm.urlQuote(uri[0].strip())))
                        _url_list.append(
                            """<didl:Resource mimeType="%s" ref="%s"/>""" %
                            (escapeXml(mimeType[0].strip()),
                             escapeXml(comm.urlQuote(uri[0].strip()))))
                    else:
                        raise ValidateException(
                            formatExceptionLine(EXCEPTION9 + uri[0],
                                                prefix=STR_DIDL))

            if resources != '':
                of_container += """<didl:Component>
                %s
            </didl:Component>""" % (resources)
            else:
                raise ValidateException(
                    formatExceptionLine(EXCEPTION10, prefix=STR_DIDL))
            of_container += '</didl:Item>'
        return of_container
Example #17
0
    def _getTopItem(self, lxmlNode):
        """Return the opening markup of the normalised top-level ``<didl:Item>``.

        The returned string holds the persistent identifier, the most recent
        modification date found on any item, and the top-level resource
        (mimeType and location). The ``<didl:Item>`` element is NOT closed in
        the returned string.

        Raises:
            ValidateException: when the identifier is missing (EXCEPTION1) or
                rejected by ``comm.isURNNBN`` (EXCEPTION0), when the
                top-level modification date is missing (EXCEPTION3) or not
                ISO 8601 (EXCEPTION2), or when no resource location is found
                (EXCEPTION4) or it is not accepted by ``comm.isURL``
                (EXCEPTION5).
        """
        ## Wrappers:
        pid, modified, mimetype, pidlocation = '', '', "application/xml", ''

        #1: Get persistentIdentifier:
        pidlist = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
            namespaces=self._nsMap)
        if len(pidlist) == 0:
            raise ValidateException(
                formatExceptionLine(EXCEPTION1, prefix=STR_DIDL))
        pid = pidlist[0].strip()
        if not comm.isURNNBN(pid):
            raise ValidateException(
                formatExceptionLine(EXCEPTION0 + pid, prefix=STR_DIDL))

        #2: Get toplevel modificationDate: comm.isISO8601()
        tl_modified = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
            namespaces=self._nsMap)
        ## The top-level modified date must be present and valid:
        if len(tl_modified) == 0:
            raise ValidateException(
                formatExceptionLine(EXCEPTION3, prefix=STR_DIDL))
        if not comm.isISO8601(tl_modified[0]):
            raise ValidateException(
                formatExceptionLine(EXCEPTION2 + tl_modified[0],
                                    prefix=STR_DIDL))

        ## Get all modified dates:
        all_modified = lxmlNode.xpath(
            '//didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
            namespaces=self._nsMap)

        ## Get most recent date from all items, to add to toplevelItem.
        ## Keys are "<date> <time>" strings; the original date text is kept
        ## as value so it can be written back unchanged.
        datedict = {}
        for date in all_modified:
            if comm.isISO8601(date.strip()):
                pd = parseDate(date.strip())
                datedict["%s %s" %
                         (str(pd.date()), str(pd.time()))] = date.strip()
        if datedict:
            modified = datedict[max(datedict)]
        if not tl_modified[0].strip() == modified:
            self.do.logMsg(self._uploadid, LOGGER1, prefix=STR_DIDL)

        #3: Get PidResourceMimetype
        mimetypelist = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@mimeType',
            namespaces=self._nsMap)
        if len(mimetypelist) > 0:
            mimetype = mimetypelist[0].strip()
            if not comm.isMimeType(mimetype):
                self.do.logMsg(self._uploadid,
                               LOGGER2 + mimetype,
                               prefix=STR_DIDL)

        #4: Get PidResourceLocation:
        # Every xpath below is a separate argument. A comma was missing after
        # the text() path, which glued it to the next path and produced one
        # xpath that could never match.
        pidlocation = self._findAndBindFirst(
            lxmlNode,
            '%s',
            '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@ref',
            '//didl:DIDL/didl:Item/didl:Component/didl:Resource/text()',
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0, without @rdf:resource
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 2.3.1
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0, without @rdf:resource
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref'  #fallback DIDL 2.3.1
        ).strip()

        if pidlocation == '':
            raise ValidateException(
                formatExceptionLine(EXCEPTION4, prefix=STR_DIDL))
        if not comm.isURL(pidlocation):
            raise ValidateException(
                formatExceptionLine(EXCEPTION5 + pidlocation, prefix=STR_DIDL))

        # The quoted URL is XML-escaped as well, since it is written into an
        # attribute value.
        return """<didl:Item>
        <didl:Descriptor><didl:Statement mimeType="application/xml"><dii:Identifier>%s</dii:Identifier></didl:Statement></didl:Descriptor>
        <didl:Descriptor><didl:Statement mimeType="application/xml"><dcterms:modified>%s</dcterms:modified></didl:Statement></didl:Descriptor>
        <didl:Component><didl:Resource mimeType="%s" ref="%s"/></didl:Component>""" % (
            escapeXml(pid), modified, escapeXml(mimetype),
            escapeXml(comm.urlQuote(pidlocation)))