def testCurieToTagSpeed(self):
    # Speed check: accumulate the wall-clock time spent in 20000 curieToTag
    # calls (string formatting of the curie included) and hand the total to
    # assertTiming together with the two bounds.
    # With xrange this is Python 2 code, where `index / 200` is integer
    # division (assuming no `from __future__ import division`), so the
    # curies cycle through 'dc:0' .. 'dc:99', each requested 200 times.
    from time import time

    elapsed = 0
    for index in xrange(20000):
        started = time()
        namespaces.curieToTag('dc:%s' % (index / 200))
        elapsed += time() - started
    # presumably asserts 0.025 <= elapsed <= 0.035 -- confirm in assertTiming
    self.assertTiming(0.025, elapsed, 0.035)
def testCurieToTagSpeed(self):
    # Speed check: accumulate the wall-clock time spent in 20000 curieToTag
    # calls (string formatting of the curie included) and hand the total to
    # assertTiming together with the two bounds.
    # NOTE(review): under Python 3 `index / 200` is true division, so the
    # curies look like 'dc:0.005' and all 20000 are distinct; under Python 2
    # the same expression produced only 'dc:0' .. 'dc:99'. Confirm which
    # workload the bounds below were calibrated for before switching to `//`.
    from time import time

    elapsed = 0
    for index in range(20000):
        started = time()
        namespaces.curieToTag('dc:%s' % (index / 200))
        elapsed += time() - started
    # presumably asserts 0.015 <= elapsed <= 0.030 -- confirm in assertTiming
    self.assertTiming(0.015, elapsed, 0.030)
def _findFragmentNodesWithAboutUris(self, lxmlNode):
    """Yield (node, uri) pairs for the fragment nodes found under lxmlNode.

    First every node matched by "*[@rdf:about]" is yielded together with
    str() of its rdf:about attribute; then every node matched by
    "rdf:Statement" is yielded together with str() of the first
    'rdf:subject/@rdf:resource' match below it.
    """
    for describedNode in xpath(lxmlNode, "*[@rdf:about]"):
        aboutValue = describedNode.attrib[curieToTag("rdf:about")]
        yield describedNode, str(aboutValue)
    for reifiedNode in xpath(lxmlNode, "rdf:Statement"):
        # NOTE(review): if xpathFirst can return None for a statement
        # without rdf:subject/@rdf:resource, the uri becomes the string
        # 'None' -- confirm against xpathFirst.
        subjectResource = xpathFirst(reifiedNode, 'rdf:subject/@rdf:resource')
        yield reifiedNode, str(subjectResource)
def add(self, lxmlNode, **kwargs):
    """Collect the meta:repository fields of lxmlNode and forward them.

    Every direct child of the document root whose tag equals
    curieToTag('meta:repository') contributes one (curie, text) tuple per
    child element, in document order, where curie is tagToCurie() of that
    child's tag.

    This is a generator: it yields exactly once, the result of
    self.all.add(fieldslist=<collected tuples>, **kwargs).

    lxmlNode must offer getroot(); presumably an lxml ElementTree -- confirm
    against the callers.
    """
    # The tag is loop-invariant, so resolve it once.
    repositoryTag = curieToTag('meta:repository')
    fieldslist = []
    # Iterate the root element directly: lxml deprecates getchildren() in
    # favour of plain iteration, which visits the same children in the same
    # order.
    for child in lxmlNode.getroot():
        if child.tag != repositoryTag:
            continue
        for repokind in child.iterchildren():
            fieldslist.append((tagToCurie(repokind.tag), repokind.text))
    yield self.all.add(fieldslist=fieldslist, **kwargs)

# Sample document (presumably the kind of input handled by add() -- confirm):
# <meta xmlns="http://meresco.org/namespace/harvester/meta">
#   <upload>
#     <id>knaw:record:4</id>
#   </upload>
#   <record>
#     <id>record:4</id>
#     <harvestdate>2016-10-05T10:30:45Z</harvestdate>
#     <metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
#   </record>
#   <repository>
#     <id>knaw</id>
#     <set>oa_publications</set>
#     <baseurl>http://depot.knaw.nl/cgi/oai2</baseurl>
#     <repositoryGroupId>knaw</repositoryGroupId>
#     <metadataPrefix>nl_didl</metadataPrefix>
#     <collection>publication</collection>
#   </repository>
# </meta>
def buildBodyDescription(hasBodyElement, elements):
    """Append an rdf:Description child to hasBodyElement and fill it.

    The new child is created with an nsmap holding, for every prefix
    reported by determinePrefixes(elements), the matching entry from
    `namespaces`; afterwards `elements` are appended to it via extend().
    """
    neededPrefixes = determinePrefixes(elements)
    descriptionTag = namespaces.curieToTag('rdf:Description')
    prefixToNamespace = {}
    for prefix in neededPrefixes:
        prefixToNamespace[prefix] = namespaces[prefix]
    description = SubElement(hasBodyElement, descriptionTag, nsmap=prefixToNamespace)
    description.extend(elements)
# Excerpt; the line below is kept verbatim. It contains, in order:
#
# 1. The tail of a method whose `def` line is not part of this excerpt. For
#    the first rdfType that self.nodePromotedTypes.get() maps to a truthy
#    curie, it removes the (RDF_TYPE, Uri(rdfType)) entry from
#    resourceDescription['relations'] and returns that curie; when no type
#    matches it returns 'rdf:Description'.
#
# 2. _subjectUriOrder(self, (s, resourceDescription)): returns a 3-tuple of
#      - the smallest self.relativeTypePositions value over
#        resourceDescription['types'] (0 for unlisted types and for an empty
#        list),
#      - len() of self._leftHandSides() for the subject, wrapped as BNode
#        when s starts with '_:' and as Uri otherwise,
#      - the negated number of entries in resourceDescription['relations'].
#    Presumably used as a sort key over (subject, description) pairs --
#    confirm at the call site.
#    NOTE(review): the tuple parameter in the signature is Python 2 only
#    syntax (removed by PEP 3113).
#
# 3. Module-level constants: precomputed rdf tags (curieToTag) and URIs
#    (curieToUri), the set of the three reification relation URIs, a set of
#    promoted type curies, and a mapping from type URI to relative position.
for rdfType in rdfTypes: typeTagCurie = self.nodePromotedTypes.get(rdfType) if typeTagCurie: resourceDescription['relations'].remove((RDF_TYPE, Uri(rdfType))) return typeTagCurie return 'rdf:Description' def _subjectUriOrder(self, (s, resourceDescription)): return ( min([self.relativeTypePositions.get(type, 0) for type in resourceDescription['types']] or [0]), len(self._leftHandSides(BNode(s) if s.startswith('_:') else Uri(s))), -len(resourceDescription['relations']) ) RDF_STATEMENT_TAG = curieToTag('rdf:Statement') RDF_ABOUT_TAG = curieToTag('rdf:about') RDF_TYPE = curieToUri('rdf:type') RDF_SUBJECT = curieToUri('rdf:subject') RDF_PREDICATE = curieToUri('rdf:predicate') RDF_OBJECT = curieToUri('rdf:object') REIFICATION_RELATIONS = set([RDF_SUBJECT, RDF_PREDICATE, RDF_OBJECT]) NODE_PROMOTED_TYPES = set(['rdf:Statement', 'oa:Annotation']) RELATIVE_TYPE_POSITIONS = { curieToUri('oa:Annotation'): -10, curieToUri('rdf:Statement'): 100, }
# Excerpt; the line below is kept verbatim. It contains, in order:
#
# 1. The body of a method whose `def` line is not part of this excerpt
#    (presumably _yieldField, since it recurses into self._yieldField --
#    confirm). It derives a field name from the node's tag via tagToCurie:
#      - unless the curie is 'rdf:Description', or is 'rdf:type' with a
#        truthy parent, the name becomes the curie, prefixed with
#        "<parent>." when parent is truthy;
#      - otherwise the parent's name is reused unchanged.
#    It then yields self._modifyField(name + suffix, value) for each
#    non-blank value among the rdf:resource attribute (suffix '.uri'), the
#    rdf:about attribute (suffix '.uri') and the node text (no suffix), and
#    finally yields the result of self._yieldField(child, parent=name) for
#    every child.
#    NOTE(review): the recursive call's return value is yielded as-is, not
#    iterated; presumably the caller flattens nested generators -- confirm.
#    NOTE(review): in the "reuse parent" branch a parent of None combined
#    with a non-blank value would make `fieldname + name` fail -- confirm
#    what parent defaults to.
#
# 2. _valueMethod(max_length=None): returns a one-argument function. With a
#    truthy max_length it truncates the value to max_length items and passes
#    None through; otherwise (including max_length=0) it is the identity.
#
# 3. Module-level constants: tags precomputed with namespaces.curieToTag.
fieldname = self._namespaces.tagToCurie(node.tag) if fieldname != 'rdf:Description' and (fieldname != 'rdf:type' or not parent): if parent: parent += "." fieldname = parent + fieldname else: fieldname = parent for name, value in ( ('.uri', node.attrib.get(RDF_RESOURCE)), ('.uri', node.attrib.get(RDF_ABOUT)), ('', node.text) ): if value is None or value.strip() == '': continue yield self._modifyField(fieldname + name, value) for child in node.iterchildren(): yield self._yieldField(child, parent=fieldname) def _valueMethod(max_length=None): if max_length: return lambda value: None if value is None else value[:max_length] return lambda value: value RDF_RESOURCE = namespaces.curieToTag('rdf:resource') RDF_ABOUT = namespaces.curieToTag('rdf:about') IS_FORMAT_OF = namespaces.curieToTag('dcterms:isFormatOf') MOTIVATED_BY = namespaces.curieToTag('oa:motivatedBy') HAS_TARGET = namespaces.curieToTag('oa:hasTarget') HAS_BODY = namespaces.curieToTag('oa:hasBody')
def testCurieToTag(self):
    # Both namespaces.expandNsTag and namespaces.curieToTag must expand a
    # "prefix:local" name into its "{namespace-uri}local" form.
    # assertEqual is used instead of assertEquals: the latter is a deprecated
    # alias that newer unittest versions no longer provide.
    self.assertEqual('{http://www.loc.gov/zing/srw/}record', namespaces.expandNsTag('srw:record'))
    self.assertEqual('{http://purl.org/dc/elements/1.1/}title', namespaces.curieToTag('dc:title'))
# Excerpt; the line below is kept verbatim. It contains, in order:
#
# 1. The tail of a function whose beginning is not part of this excerpt; the
#    excerpt starts inside the argument list of a call (presumably
#    SubElement, like the calls that follow -- confirm). Under
#    annotationElement it creates an oa:annotatedBy, an oa:motivatedBy and an
#    oa:hasTarget child, each carrying an rdf:resource attribute set to
#    unicode() of the respective uri, plus an oa:hasBody child without
#    attributes. It returns the pair (rdfElement, hasBodyElement).
#    `unicode` is the Python 2 builtin.
#
# 2. Module-level constants: oa/rdf tags precomputed with curieToTag, and
#    NSMAP_RDF / NSMAP_OA obtained from namespaces.select('rdf') and
#    namespaces.select('oa').
annotationElement, OA_ANNOTATED_BY_TAG, attrib={RDF_RESOURCE_TAG: unicode(annotatedByUri)} ) SubElement( annotationElement, OA_MOTIVATED_BY_TAG, attrib={RDF_RESOURCE_TAG: unicode(motiveUri)} ) SubElement( annotationElement, OA_HAS_TARGET_TAG, attrib={RDF_RESOURCE_TAG: unicode(targetUri)} ) hasBodyElement = SubElement( annotationElement, OA_HAS_BODY_TAG ) return rdfElement, hasBodyElement OA_ANNOTATED_BY_TAG = curieToTag('oa:annotatedBy') OA_ANNOTATION_TAG = curieToTag('oa:Annotation') OA_HAS_BODY_TAG = curieToTag('oa:hasBody') OA_HAS_TARGET_TAG = curieToTag('oa:hasTarget') OA_MOTIVATED_BY_TAG = curieToTag('oa:motivatedBy') RDF_ABOUT_TAG = curieToTag('rdf:about') RDF_RDF_TAG = curieToTag('rdf:RDF') RDF_RESOURCE_TAG = curieToTag('rdf:resource') NSMAP_RDF = namespaces.select('rdf') NSMAP_OA = namespaces.select('oa')
# Excerpt; the line below is kept verbatim. It contains, in order:
#
# 1. Four read-only properties of a class whose header is not part of this
#    excerpt. Each one exposes a private attribute of self._processor:
#      path            -> _path
#      metadataPrefix  -> _metadataPrefix
#      set             -> _set
#      nextRequestTime -> _earliestNextRequestTime
#
# 2. Module-level constants:
#      RESUMPTIONTOKEN_STATE - the literal prefix "Resumptiontoken: "
#      VERB_TAGNAME          - maps the verb names 'ListRecords' and
#                              'ListIdentifiers' to the oai:record and
#                              oai:header tags respectively
#      _USER_AGENT           - "Meresco-Oai-DownloadProcessor/" followed by
#                              VERSION
#      HEADER_TAG, IDENTIFIER_TAG, DATESTAMP_TAG - oai tags precomputed with
#                              curieToTag
#
# 3. __all__, which exports only 'OaiDownloadProcessor'.
@property def path(self): return self._processor._path @property def metadataPrefix(self): return self._processor._metadataPrefix @property def set(self): return self._processor._set @property def nextRequestTime(self): return self._processor._earliestNextRequestTime RESUMPTIONTOKEN_STATE = "Resumptiontoken: " VERB_TAGNAME = { 'ListRecords': curieToTag('oai:record'), 'ListIdentifiers': curieToTag('oai:header') } _USER_AGENT = "Meresco-Oai-DownloadProcessor/%s" % VERSION HEADER_TAG = curieToTag('oai:header') IDENTIFIER_TAG = curieToTag('oai:identifier') DATESTAMP_TAG = curieToTag('oai:datestamp') __all__ = ['OaiDownloadProcessor']
# Excerpt; the line below is kept verbatim. It contains, in order:
#
# 1. Members of a class whose header is not part of this excerpt:
#      name            - returns self._processor.observable_name(); any
#                        decorator on it would sit before this excerpt
#      path            - property exposing self._processor._path
#      metadataPrefix  - property exposing self._processor._metadataPrefix
#      set             - property exposing self._processor._set
#      nextRequestTime - property exposing
#                        self._processor._earliestNextRequestTime
#
# 2. Module-level constants:
#      RESUMPTIONTOKEN_STATE - the literal prefix "Resumptiontoken: "
#      VERB_TAGNAME          - maps the verb names 'ListRecords' and
#                              'ListIdentifiers' to the oai:record and
#                              oai:header tags respectively
#      _USER_AGENT           - "Meresco-Oai-DownloadProcessor/" followed by
#                              VERSION
#      HEADER_TAG, IDENTIFIER_TAG, DATESTAMP_TAG - oai tags precomputed with
#                              curieToTag
def name(self): return self._processor.observable_name() @property def path(self): return self._processor._path @property def metadataPrefix(self): return self._processor._metadataPrefix @property def set(self): return self._processor._set @property def nextRequestTime(self): return self._processor._earliestNextRequestTime RESUMPTIONTOKEN_STATE = "Resumptiontoken: " VERB_TAGNAME = { 'ListRecords': curieToTag('oai:record'), 'ListIdentifiers': curieToTag('oai:header') } _USER_AGENT = "Meresco-Oai-DownloadProcessor/%s" % VERSION HEADER_TAG = curieToTag('oai:header') IDENTIFIER_TAG = curieToTag('oai:identifier') DATESTAMP_TAG = curieToTag('oai:datestamp')
def testCurieToTag(self):
    # Both namespaces.expandNsTag and namespaces.curieToTag must expand a
    # "prefix:local" name into its "{namespace-uri}local" form.
    expandedSrwRecord = namespaces.expandNsTag('srw:record')
    self.assertEqual('{http://www.loc.gov/zing/srw/}record', expandedSrwRecord)

    expandedDcTitle = namespaces.curieToTag('dc:title')
    self.assertEqual('{http://purl.org/dc/elements/1.1/}title', expandedDcTitle)
# Excerpt; the line below is kept verbatim. It contains, in order:
#
# 1. The last statement of a method whose beginning is not part of this
#    excerpt: it adds the triple (r, rdf_type_uri, Uri(rdf_Statement_uri)).
#
# 2. getText(node): concatenates node.text (or '' when it is None) with the
#    non-empty tail of every direct child, and returns the result, or None
#    when that result is empty. Text inside the children themselves is not
#    included. As the original inline comment says, call it only with an
#    Element.
#    NOTE(review): getchildren() is deprecated in lxml in favour of
#    iterating the element directly.
#
# 3. Module-level constants: the xml:lang tag and rdf tags precomputed with
#    curieToTag, and the rdf:Statement URI from curieToUri.
self.addTriple(r, rdf_type_uri, Uri(rdf_Statement_uri)) def getText(node): # *Only* call with an Element allText = node.text or '' for c in node.getchildren(): tail = c.tail if tail: allText += tail return allText or None x_lang_tag = curieToTag("xml:lang") rdf_RDF_tag = curieToTag("rdf:RDF") rdf_ID_tag = curieToTag("rdf:ID") rdf_about_tag = curieToTag("rdf:about") rdf_aboutEach_tag = curieToTag("rdf:aboutEach") rdf_aboutEachPrefix_tag = curieToTag("rdf:aboutEachPrefix") rdf_type_tag = curieToTag("rdf:type") rdf_resource_tag = curieToTag("rdf:resource") rdf_Description_tag = curieToTag("rdf:Description") rdf_bagID_tag = curieToTag("rdf:bagID") rdf_parseType_tag = curieToTag("rdf:parseType") rdf_nodeID_tag = curieToTag("rdf:nodeID") rdf_datatype_tag = curieToTag("rdf:datatype") rdf_li_tag = curieToTag("rdf:li") rdf_Statement_uri = curieToUri('rdf:Statement')