Example #1
0
 def testCurieToTagSpeed(self):
     # Accumulates the wall-clock time of 20000 curieToTag calls and hands
     # the total to assertTiming.
     from time import time
     t = 0  # seconds spent inside curieToTag only; loop overhead is excluded
     for i in xrange(20000):
         t0 = time()
         # NOTE(review): with plain Python 2 integer division `i / 200` only
         # takes the values 0..99, so each curie is requested 200 times in a
         # row -- presumably to exercise repeated lookups; confirm.
         namespaces.curieToTag('dc:%s' % (i / 200))
         t += (time() - t0)
     # presumably assertTiming(lowerBound, measured, upperBound) -- verify
     # against the test base class.
     self.assertTiming(0.025, t, 0.035)
Example #2
0
 def testCurieToTagSpeed(self):
     # Accumulates the wall-clock time of 20000 curieToTag calls and hands
     # the total to assertTiming.
     from time import time
     t = 0  # seconds spent inside curieToTag only; loop overhead is excluded
     for i in range(20000):
         t0 = time()
         # NOTE(review): on Python 3 `i / 200` is true division, so the
         # curies are 'dc:0.0', 'dc:0.005', ... (20000 distinct values). If
         # 100 distinct curies repeated 200 times were intended, this should
         # be `i // 200` and the bounds below re-measured -- confirm.
         namespaces.curieToTag('dc:%s' % (i / 200))
         t += (time() - t0)
     # presumably assertTiming(lowerBound, measured, upperBound) -- verify
     # against the test base class.
     self.assertTiming(0.015, t, 0.030)
Example #3
0
 def _findFragmentNodesWithAboutUris(self, lxmlNode):
     """Yield (node, uri) pairs for the nodes selected under lxmlNode.

     First every node matching "*[@rdf:about]" is yielded together with the
     value of its rdf:about attribute. After those, every node matching
     "rdf:Statement" is yielded together with the first result of
     'rdf:subject/@rdf:resource' evaluated on that node. Each uri is passed
     through str() before being yielded.
     """
     for aboutNode in xpath(lxmlNode, "*[@rdf:about]"):
         aboutValue = aboutNode.attrib[curieToTag("rdf:about")]
         yield aboutNode, str(aboutValue)
     for reifiedNode in xpath(lxmlNode, "rdf:Statement"):
         subjectResource = xpathFirst(reifiedNode, 'rdf:subject/@rdf:resource')
         yield reifiedNode, str(subjectResource)
    def add(self, lxmlNode, **kwargs):
        """Collect the meta:repository fields and forward them to self.all.add.

        Every direct child of the root of lxmlNode whose tag equals
        curieToTag('meta:repository') contributes one
        (tagToCurie(field.tag), field.text) tuple per child element, in
        document order.

        lxmlNode -- object offering getroot(); presumably an lxml ElementTree
                    (confirm against callers).
        kwargs   -- forwarded unchanged to self.all.add.

        Yields the result of self.all.add(fieldslist=..., **kwargs).
        """
        # The tag is the same for every child, so resolve it once.
        repositoryTag = curieToTag('meta:repository')
        fieldslist = []
        # Iterating an element yields its direct children; this replaces the
        # deprecated getchildren().
        for child in lxmlNode.getroot():
            if child.tag != repositoryTag:
                continue
            fieldslist.extend(
                (tagToCurie(field.tag), field.text)
                for field in child.iterchildren())

        yield self.all.add(fieldslist=fieldslist, **kwargs)



# <meta xmlns="http://meresco.org/namespace/harvester/meta">
#     <upload>
#         <id>knaw:record:4</id>
#     </upload>
#     <record>
#         <id>record:4</id>
#         <harvestdate>2016-10-05T10:30:45Z</harvestdate>
#         <metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
#     </record>
#     <repository>
#         <id>knaw</id>
#         <set>oa_publications</set>
#         <baseurl>http://depot.knaw.nl/cgi/oai2</baseurl>
#         <repositoryGroupId>knaw</repositoryGroupId>
#         <metadataPrefix>nl_didl</metadataPrefix>
#         <collection>publication</collection>
#     </repository>
# </meta>
Example #5
0
    def add(self, lxmlNode, **kwargs):
        """Collect the meta:repository fields and forward them to self.all.add.

        Every direct child of the root of lxmlNode whose tag equals
        curieToTag('meta:repository') contributes one
        (tagToCurie(field.tag), field.text) tuple per child element, in
        document order.

        lxmlNode -- object offering getroot(); presumably an lxml ElementTree
                    (confirm against callers).
        kwargs   -- forwarded unchanged to self.all.add.

        Yields the result of self.all.add(fieldslist=..., **kwargs).
        """
        # The tag is the same for every child, so resolve it once.
        repositoryTag = curieToTag('meta:repository')
        fieldslist = []
        # Iterating an element yields its direct children; this replaces the
        # deprecated getchildren().
        for child in lxmlNode.getroot():
            if child.tag != repositoryTag:
                continue
            fieldslist.extend(
                (tagToCurie(field.tag), field.text)
                for field in child.iterchildren())

        yield self.all.add(fieldslist=fieldslist, **kwargs)


# <meta xmlns="http://meresco.org/namespace/harvester/meta">
#     <upload>
#         <id>knaw:record:4</id>
#     </upload>
#     <record>
#         <id>record:4</id>
#         <harvestdate>2016-10-05T10:30:45Z</harvestdate>
#         <metadataNamespace>http://www.loc.gov/mods/v3</metadataNamespace>
#     </record>
#     <repository>
#         <id>knaw</id>
#         <set>oa_publications</set>
#         <baseurl>http://depot.knaw.nl/cgi/oai2</baseurl>
#         <repositoryGroupId>knaw</repositoryGroupId>
#         <metadataPrefix>nl_didl</metadataPrefix>
#         <collection>publication</collection>
#     </repository>
# </meta>
Example #6
0
def buildBodyDescription(hasBodyElement, elements):
    """Add an rdf:Description child to hasBodyElement and fill it with elements.

    The new element is created with an nsmap that holds one entry for each
    prefix returned by determinePrefixes(elements), with the value looked up
    in `namespaces`. The given elements are then appended to it via extend().
    """
    neededPrefixes = determinePrefixes(elements)
    descriptionTag = namespaces.curieToTag('rdf:Description')
    nsmap = {}
    for prefix in neededPrefixes:
        nsmap[prefix] = namespaces[prefix]
    bodyDescriptionElement = SubElement(hasBodyElement, descriptionTag, nsmap=nsmap)
    bodyDescriptionElement.extend(elements)
Example #7
0
        for rdfType in rdfTypes:
            typeTagCurie = self.nodePromotedTypes.get(rdfType)
            if typeTagCurie:
                resourceDescription['relations'].remove((RDF_TYPE, Uri(rdfType)))
                return typeTagCurie
        return 'rdf:Description'

    def _subjectUriOrder(self, subjectAndDescription):
        """Return an ordering tuple for one (subject, resourceDescription) pair.

        subjectAndDescription -- 2-item sequence (s, resourceDescription);
            s is the subject string and resourceDescription is a mapping with
            at least the keys 'types' and 'relations'.

        The returned tuple consists of:
          1. the lowest self.relativeTypePositions value among the
             description's types (types without an entry count as 0, and 0
             is used when there are no types at all);
          2. the number of items returned by self._leftHandSides for the
             subject, wrapped as BNode when s starts with '_:' and as Uri
             otherwise;
          3. the negated number of relations.

        Presumably used as a sort key -- confirm against the caller.
        """
        # Unpack inside the body: tuple parameters in the signature were
        # removed from the language by PEP 3113 and no longer compile.
        s, resourceDescription = subjectAndDescription
        typePositions = [
            self.relativeTypePositions.get(rdfType, 0)
            for rdfType in resourceDescription['types']
        ]
        subject = BNode(s) if s.startswith('_:') else Uri(s)
        return (
            min(typePositions or [0]),
            len(self._leftHandSides(subject)),
            -len(resourceDescription['relations'])
        )


# Element/attribute tags in expanded '{namespace-uri}localname' form,
# resolved once when the module is loaded.
RDF_STATEMENT_TAG = curieToTag('rdf:Statement')
RDF_ABOUT_TAG = curieToTag('rdf:about')

# Predicates resolved through curieToUri (presumably the full URI -- confirm).
RDF_TYPE = curieToUri('rdf:type')
RDF_SUBJECT = curieToUri('rdf:subject')
RDF_PREDICATE = curieToUri('rdf:predicate')
RDF_OBJECT = curieToUri('rdf:object')
# The subject/predicate/object predicates grouped for membership tests.
REIFICATION_RELATIONS = set([RDF_SUBJECT, RDF_PREDICATE, RDF_OBJECT])

# NOTE(review): these entries are curie strings, whereas the keys of
# RELATIVE_TYPE_POSITIONS below are curieToUri results -- confirm that the
# code consuming this set expects curies.
NODE_PROMOTED_TYPES = set(['rdf:Statement', 'oa:Annotation'])

# Relative position per rdf type; presumably lower values are ordered first
# and types without an entry count as 0 -- confirm against the code that
# reads this mapping.
RELATIVE_TYPE_POSITIONS = {
    curieToUri('oa:Annotation'): -10,
    curieToUri('rdf:Statement'): 100,
}
        fieldname = self._namespaces.tagToCurie(node.tag)
        if fieldname != 'rdf:Description' and (fieldname != 'rdf:type' or not parent):
            if parent:
                parent += "."
            fieldname = parent + fieldname
        else:
            fieldname = parent

        for name, value in (
                ('.uri', node.attrib.get(RDF_RESOURCE)),
                ('.uri', node.attrib.get(RDF_ABOUT)),
                ('', node.text)
            ):
            if value is None or value.strip() == '':
                continue
            yield self._modifyField(fieldname + name, value)
        for child in node.iterchildren():
            yield self._yieldField(child, parent=fieldname)

def _valueMethod(max_length=None):
    if max_length:
        return lambda value: None if value is None else value[:max_length]
    return lambda value: value

# Tags in expanded '{namespace-uri}localname' form, resolved once when the
# module is loaded so they can be compared against node.tag / node.attrib keys.
RDF_RESOURCE = namespaces.curieToTag('rdf:resource')
RDF_ABOUT = namespaces.curieToTag('rdf:about')
IS_FORMAT_OF = namespaces.curieToTag('dcterms:isFormatOf')
MOTIVATED_BY = namespaces.curieToTag('oa:motivatedBy')
HAS_TARGET = namespaces.curieToTag('oa:hasTarget')
HAS_BODY = namespaces.curieToTag('oa:hasBody')
Example #9
0
 def testCurieToTag(self):
     """expandNsTag and curieToTag turn 'prefix:name' into '{namespace-uri}name'."""
     # assertEqual instead of assertEquals: the latter is a deprecated alias
     # that newer unittest versions no longer provide.
     self.assertEqual('{http://www.loc.gov/zing/srw/}record', namespaces.expandNsTag('srw:record'))
     self.assertEqual('{http://purl.org/dc/elements/1.1/}title', namespaces.curieToTag('dc:title'))
Example #10
0
        annotationElement,
        OA_ANNOTATED_BY_TAG,
        attrib={RDF_RESOURCE_TAG: unicode(annotatedByUri)}
    )
    SubElement(
        annotationElement,
        OA_MOTIVATED_BY_TAG,
        attrib={RDF_RESOURCE_TAG: unicode(motiveUri)}
    )
    SubElement(
        annotationElement,
        OA_HAS_TARGET_TAG,
        attrib={RDF_RESOURCE_TAG: unicode(targetUri)}
    )
    hasBodyElement = SubElement(
        annotationElement,
        OA_HAS_BODY_TAG
    )
    return rdfElement, hasBodyElement


# Element and attribute tags in expanded '{namespace-uri}localname' form,
# resolved once when the module is loaded.
OA_ANNOTATED_BY_TAG = curieToTag('oa:annotatedBy')
OA_ANNOTATION_TAG = curieToTag('oa:Annotation')
OA_HAS_BODY_TAG = curieToTag('oa:hasBody')
OA_HAS_TARGET_TAG = curieToTag('oa:hasTarget')
OA_MOTIVATED_BY_TAG = curieToTag('oa:motivatedBy')
RDF_ABOUT_TAG = curieToTag('rdf:about')
RDF_RDF_TAG = curieToTag('rdf:RDF')
RDF_RESOURCE_TAG = curieToTag('rdf:resource')
# Results of namespaces.select() for a single prefix; presumably a
# prefix -> namespace-uri mapping usable as an lxml nsmap -- confirm.
NSMAP_RDF = namespaces.select('rdf')
NSMAP_OA = namespaces.select('oa')
    @property
    def path(self):
        """Read-only view of self._processor._path."""
        return self._processor._path

    @property
    def metadataPrefix(self):
        """Read-only view of self._processor._metadataPrefix."""
        return self._processor._metadataPrefix

    @property
    def set(self):
        """Read-only view of self._processor._set."""
        return self._processor._set

    @property
    def nextRequestTime(self):
        """Read-only view of self._processor._earliestNextRequestTime."""
        return self._processor._earliestNextRequestTime


# Literal prefix; presumably marks the resumption token in the processor's
# stored state -- confirm against the code that reads and writes it.
RESUMPTIONTOKEN_STATE = "Resumptiontoken: "

# Per OAI-PMH verb, the expanded tag of the element handled for that verb
# (oai:record for ListRecords, oai:header for ListIdentifiers).
VERB_TAGNAME = {
    'ListRecords': curieToTag('oai:record'),
    'ListIdentifiers': curieToTag('oai:header')
}
# VERSION is interpolated once at import time; presumably sent as the
# User-Agent request header -- confirm.
_USER_AGENT = "Meresco-Oai-DownloadProcessor/%s" % VERSION

# Expanded '{namespace-uri}localname' tags of the oai elements used above.
HEADER_TAG = curieToTag('oai:header')
IDENTIFIER_TAG = curieToTag('oai:identifier')
DATESTAMP_TAG = curieToTag('oai:datestamp')

# Only OaiDownloadProcessor is exported by `from <module> import *`.
__all__ = ['OaiDownloadProcessor']
Example #12
0
    def name(self):
        """Return the result of self._processor.observable_name()."""
        return self._processor.observable_name()

    @property
    def path(self):
        """Read-only view of self._processor._path."""
        return self._processor._path

    @property
    def metadataPrefix(self):
        """Read-only view of self._processor._metadataPrefix."""
        return self._processor._metadataPrefix

    @property
    def set(self):
        """Read-only view of self._processor._set."""
        return self._processor._set

    @property
    def nextRequestTime(self):
        """Read-only view of self._processor._earliestNextRequestTime."""
        return self._processor._earliestNextRequestTime

# Literal prefix; presumably marks the resumption token in the processor's
# stored state -- confirm against the code that reads and writes it.
RESUMPTIONTOKEN_STATE = "Resumptiontoken: "

# Per OAI-PMH verb, the expanded tag of the element handled for that verb
# (oai:record for ListRecords, oai:header for ListIdentifiers).
VERB_TAGNAME = {
    'ListRecords': curieToTag('oai:record'),
    'ListIdentifiers': curieToTag('oai:header')
}
# VERSION is interpolated once at import time; presumably sent as the
# User-Agent request header -- confirm.
_USER_AGENT = "Meresco-Oai-DownloadProcessor/%s" % VERSION

# Expanded '{namespace-uri}localname' tags of the oai elements used above.
HEADER_TAG = curieToTag('oai:header')
IDENTIFIER_TAG = curieToTag('oai:identifier')
DATESTAMP_TAG = curieToTag('oai:datestamp')
Example #13
0
 def testCurieToTag(self):
     """Both expansion helpers produce the '{namespace-uri}name' form."""
     srwRecordTag = namespaces.expandNsTag('srw:record')
     self.assertEqual('{http://www.loc.gov/zing/srw/}record', srwRecordTag)

     dcTitleTag = namespaces.curieToTag('dc:title')
     self.assertEqual('{http://purl.org/dc/elements/1.1/}title', dcTitleTag)
Example #14
0
        self.addTriple(r, rdf_type_uri, Uri(rdf_Statement_uri))


def getText(node):
    """Return the character data directly inside node, or None if there is none.

    The result is node.text followed by the tail of every direct child, in
    document order. Text inside the children themselves is not included.

    *Only* call with an Element.
    """
    parts = [node.text or '']
    # Iterating an Element yields its direct children; this replaces the
    # deprecated getchildren(), which is no longer available on every
    # ElementTree implementation.
    parts.extend(child.tail for child in node if child.tail)
    return ''.join(parts) or None


# Element and attribute tags in expanded '{namespace-uri}localname' form,
# resolved once when the module is loaded.
x_lang_tag = curieToTag("xml:lang")
rdf_RDF_tag = curieToTag("rdf:RDF")
rdf_ID_tag = curieToTag("rdf:ID")
rdf_about_tag = curieToTag("rdf:about")
rdf_aboutEach_tag = curieToTag("rdf:aboutEach")
rdf_aboutEachPrefix_tag = curieToTag("rdf:aboutEachPrefix")
rdf_type_tag = curieToTag("rdf:type")
rdf_resource_tag = curieToTag("rdf:resource")
rdf_Description_tag = curieToTag("rdf:Description")
rdf_bagID_tag = curieToTag("rdf:bagID")
rdf_parseType_tag = curieToTag("rdf:parseType")
rdf_nodeID_tag = curieToTag("rdf:nodeID")
rdf_datatype_tag = curieToTag("rdf:datatype")
rdf_li_tag = curieToTag("rdf:li")

# Resolved with curieToUri rather than curieToTag (presumably the plain URI
# without the '{...}' tag wrapping -- confirm).
rdf_Statement_uri = curieToUri('rdf:Statement')