Esempio n. 1
0
    def testListMetadataFormatsForIdentifier(self):
        # ListMetadataFormats restricted to one record identifier must return
        # no OAI error and exactly one metadataFormat, with prefix 'oai_dc'.
        header, body = self._request(verb=['ListMetadataFormats'], identifier=[self.prefix + 'record:id:01'])

        # The pretty-printed response body doubles as the failure message.
        self.assertEqual(0, len(xpath(body, '/oai:OAI-PMH/oai:error')), lxmltostring(body, pretty_print=True))
        formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
        self.assertEqual(1, len(formats), lxmltostring(body, pretty_print=True))
        self.assertEqual(['oai_dc'], xpath(formats[0], 'oai:metadataPrefix/text()'))
Esempio n. 2
0
def html_to_etree(in_str, remove_blank_text=True):
    """
    Parse a tree of possibly malformed HTML5, following WHATWG HTML5 rules.

    The input is NFC-normalised (via `_nfc`) and stripped before parsing.

    Result is one of:
     - the parsed input;
     - if multiple fragments (> 1 top-level tags) are given: the parsed input
       wrapped in either a `div' or a `span';
     - None for None or empty (after stripping) input.

    When `remove_blank_text` is true the tree is serialised, re-parsed with the
    XML parser and parsed once more with the HTML5 parser in order to drop
    (hopefully irrelevant) whitespace; some relevant whitespace will most
    likely be dropped as well.

    Raises ValueError when `in_str` is neither None nor a string.
    """
    if in_str is None:
        return None
    if not isinstance(in_str, basestring):
        raise ValueError('input must be a string')

    normalized = _nfc(in_str).strip()
    if not normalized:
        return None

    # ATTENTION: tag/attribute namespace info is mangled by the HTML5 parse
    # (html5lib bugs), hence the local-name fix-up after every such parse.
    tree = fromstring(normalized, parser=_html5Parser)
    _etree_mutate_fix_localname(tree)

    if remove_blank_text:
        serialized = lxmltostring(tree)
        reparsed = parse(StringIO(serialized), parser=_xmlParser)
        tree = fromstring(lxmltostring(reparsed), parser=_html5Parser)
        # The mangled names come back after fromstring, so fix them again.
        _etree_mutate_fix_localname(tree)

    # Hand back an element whether the parser produced a tree or an element.
    if hasattr(tree, 'getroot'):
        return tree.getroot()
    return tree
Esempio n. 3
0
    def testNamespaces(self):
        # Selecting a node through prefixed XPath must forward that node, and
        # its nsmap must equal the nsmap obtained by re-parsing its serialisation.
        prefixes = {'a': 'aNamespace', 'b': 'bNamespace'}
        xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces=prefixes)
        document = '<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'
        lxmlNode = parse(StringIO(document))
        observer = CallTrace('Observer')
        observable = Observable()
        observable.addObserver(xmlXPath)
        xmlXPath.addObserver(observer)

        observable.do.message(lxmlNode=lxmlNode)

        firstCall = observer.calledMethods[0]
        self.assertEqual('message', firstCall.name)
        selected = firstCall.kwargs['lxmlNode']
        self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(selected))

        nsmapOfSelected = selected.getroot().nsmap
        reparsed = parse(StringIO(lxmltostring(selected)))
        self.assertEqual(reparsed.getroot().nsmap, nsmapOfSelected)
Esempio n. 4
0
 def testHandleWithTwoRecords(self):
     # A ListRecords response holding two records must result in one batch
     # with two 'add' calls, each carrying the record's own identifier,
     # datestamp and lxmlNode.
     observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
     processor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
     processor.addObserver(observer)
     secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
     response = parse(StringIO(LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN)))
     list(compose(processor.handle(response)))

     calledNames = [m.name for m in observer.calledMethods]
     self.assertEqual(['startOaiBatch', 'add', 'add', 'stopOaiBatch'], calledNames)
     firstAdd, secondAdd = observer.calledMethods[1:3]
     self.assertEqual(0, len(firstAdd.args))
     expectations = (
         (firstAdd, ONE_RECORD, '2011-08-22T07:34:00Z', 'oai:identifier:1'),
         (secondAdd, secondRecord, '2011-08-22T07:41:00Z', 'oai:identifier:2'),
     )
     for addCall, expectedXml, expectedDatestamp, expectedIdentifier in expectations:
         self.assertEqualsWS(expectedXml, lxmltostring(addCall.kwargs['lxmlNode']))
         self.assertEqual(expectedDatestamp, addCall.kwargs['datestamp'])
         self.assertEqual(expectedIdentifier, addCall.kwargs['identifier'])
Esempio n. 5
0
    def testQueryTimeInExtraResponse(self):
        handler = SruHandler(includeQueryTimes=True)
        observer = CallTrace('observer', emptyGeneratorMethods=['echoedExtraRequestData', 'extraResponseData'])

        times = [1, 2.5, 3.5]
        def timeNow():
            return times.pop(0)
        handler._timeNow = timeNow

        def executeQuery(**kwargs):
            response = Response(total=0, hits=[])
            response.queryTime=5
            raise StopIteration(response)
            yield
        observer.methods['executeQuery'] = executeQuery
        handler.addObserver(observer)
        arguments = dict(startRecord=11, maximumRecords=15, query='query', recordPacking='string', recordSchema='schema')
        result = "".join(compose(handler.searchRetrieve(sruArguments=arguments, **arguments)))
        sruResponse = parse(StringIO(result))
        extraResponseData = sruResponse.xpath('/srw:searchRetrieveResponse/srw:extraResponseData', namespaces={'srw':"http://www.loc.gov/zing/srw/"})[0]
        self.assertEqualsWS("""<srw:extraResponseData %(xmlns_srw)s %(xmlns_diag)s %(xmlns_xcql)s %(xmlns_dc)s %(xmlns_meresco_srw)s>
        <querytimes xmlns="http://meresco.org/namespace/timing">
            <sruHandling>PT2.500S</sruHandling>
            <sruQueryTime>PT1.500S</sruQueryTime>
            <index>PT0.005S</index>
        </querytimes>
</srw:extraResponseData>""" % namespaces, lxmltostring(extraResponseData))
        queryTimes = lxmltostring(extraResponseData.xpath('//ti:querytimes', namespaces={'ti':"http://meresco.org/namespace/timing"})[0])
        assertValid(queryTimes, join(schemasPath, 'timing-20120827.xsd'))
        self.assertEquals(['executeQuery', 'echoedExtraRequestData', 'extraResponseData', 'handleQueryTimes'], observer.calledMethodNames())
        self.assertEquals({'sru': Decimal("2.500"), 'queryTime': Decimal("1.500"), 'index': Decimal("0.005")}, observer.calledMethods[3].kwargs)
    def testTailTakenCareOfWithoutAffectingOriginal(self):
        observer = CallTrace('observer', methods={'test': lambda *args, **kwargs: (x for x in [])})
        observable = be(
            (Observable(),
                (XmlXPath(
                        ['/myns:root/myns:path'],
                        fromKwarg='lxmlNode',
                        namespaces={'myns': 'http://myns.org/'}
                    ),
                    (observer, ),
                )
            )
        )

        XML = """\
<root xmlns:myns="http://myns.org/" xmlns="http://myns.org/">
    <myns:path>
        <to>me</to>
    </myns:path>\n
</root>"""

        lxmlNode = parse(StringIO(XML))
        self.assertEquals(XML, lxmltostring(lxmlNode))
        list(compose(observable.all.test('een tekst', lxmlNode=lxmlNode)))

        self.assertEquals(1, len(observer.calledMethods))
        method = observer.calledMethods[0]
        self.assertEquals('test', method.name)
        self.assertEqualsWS('<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/"><to>me</to></myns:path>', lxmltostring(method.kwargs['lxmlNode']))
        self.assertEquals("""\
<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/">
        <to>me</to>
    </myns:path>""", lxmltostring(method.kwargs['lxmlNode']))

        self.assertEquals(XML, lxmltostring(lxmlNode))
Esempio n. 7
0
    def testListMetadataFormatsForIdentifier(self):
        # Asking for the metadata formats of one record must give no error and
        # a single format whose prefix is 'oai_dc'.
        recordIdentifier = self.prefix + 'record:id:01'
        header, body = self._request(verb=['ListMetadataFormats'], identifier=[recordIdentifier])

        errors = xpath(body, '/oai:OAI-PMH/oai:error')
        self.assertEqual(0, len(errors), lxmltostring(body, pretty_print=True))
        formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
        self.assertEqual(1, len(formats), lxmltostring(body, pretty_print=True))
        prefixes = xpath(formats[0], 'oai:metadataPrefix/text()')
        self.assertEqual(['oai_dc'], prefixes)
Esempio n. 8
0
    def testNearRealtimeOaiSavesState(self):
        # After a stop/start cycle the harvester must only receive the record
        # added in between ("id1") and not the one it already got ("id0").
        observer = CallTrace("observer",
                             ignoredAttributes=["observer_init"],
                             methods={'add': lambda **kwargs: (x for x in [])})
        oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
        oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
        suspendRegister = SuspendRegister()
        oaiJazz.addObserver(suspendRegister)
        storageComponent = MultiSequentialStorage(join(self.tempdir,
                                                       'storage'))
        self._addOaiRecords(storageComponent, oaiJazz, 1)

        # Thread handles are kept in a dict shared with the nested helpers.
        # The previous `global` declarations did not bind the enclosing locals
        # and leaked the threads into the module namespace instead.
        threads = {}

        def start():
            # NOTE(review): self.run is presumably polled by startOaiPmh /
            # startOaiHarvester as their keep-running flag - confirm.
            self.run = True
            portNumber = randint(50000, 60000)
            threads['oaiPmh'] = Thread(
                None, lambda: self.startOaiPmh(
                    portNumber, oaiJazz, storageComponent, suspendRegister))
            threads['harvest'] = Thread(
                None, lambda: self.startOaiHarvester(portNumber, observer))
            threads['oaiPmh'].start()
            threads['harvest'].start()

        def stop():
            self.run = False
            threads.pop('oaiPmh').join()
            threads.pop('harvest').join()

        start()
        try:
            requests = 1
            sleepWheel(1.0 + 1.0 * requests)
            self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                             [m.name for m in observer.calledMethods])
            kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
            self.assertTrue("id0" in kwarg, kwarg)
        finally:
            # Always stop the threads, also when an assertion above fails.
            stop()
        observer.calledMethods.reset()

        storageComponent.addData(identifier="id1",
                                 name="prefix",
                                 data=b"<a>a1</a>")
        oaiJazz.addOaiRecord(identifier="id1", metadataPrefixes=["prefix"])

        start()
        try:
            requests = 1
            sleepWheel(1.0 + 1.0 * requests)
            self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                             [m.name for m in observer.calledMethods])
            kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
            self.assertFalse("id0" in kwarg, kwarg)
            self.assertTrue("id1" in kwarg, kwarg)
        finally:
            stop()
Esempio n. 9
0
 def assertWaterMarked(**oaiArgs):
     # Issue the OAI request described by oaiArgs and check that the response
     # root carries the Seecr watermark as its first XML comment.
     # `self` comes from the enclosing test method's scope.
     header, body = self._request(**oaiArgs)
     try:
         comment = xpath(body, "/oai:OAI-PMH/comment()")[0]
     except Exception:
         # Dump the response for debugging, then let the failure propagate.
         # print(...) with one argument behaves the same on Python 2 and 3.
         print(lxmltostring(body, pretty_print=True))
         raise
     self.assertEqual(" Watermarked by Seecr ", comment.text)
Esempio n. 10
0
    def testNoLxmlTailOnPart(self):
        # The newlines trailing each part in the input document must not show
        # up when the forwarded part nodes are serialised.
        inputEvent = fromstring("""<document><part name="partone">&lt;some&gt;message&lt;/some&gt;\n\n\n\n</part><part name="parttwo"><second>message</second>\n\n\n\n</part></document>""")
        interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
        partSpecs = [
            {'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'},
            {'partname': 'parttwo', 'xpath': '/document/part/second'},
        ]
        v = createVenturiHelix(partSpecs, [], interceptor)
        list(compose(v.all.add('identifier', 'document', inputEvent)))

        # calledMethods[0] is not inspected here; the two adds follow it.
        firstXml = interceptor.calledMethods[1].kwargs['lxmlNode']
        self.assertEqual('<some>message</some>', lxmltostring(firstXml))
        secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
        self.assertEqual('<second>message</second>', lxmltostring(secondXml))
Esempio n. 11
0
    def testNoLxmlTailOnPart(self):
        # The newlines trailing each part in the input document must not show
        # up when the forwarded part nodes are serialised.
        inputEvent = fromstring("""<document><part name="partone">&lt;some&gt;message&lt;/some&gt;\n\n\n\n</part><part name="parttwo"><second>message</second>\n\n\n\n</part></document>""")
        interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
        v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}, {'partname': 'parttwo', 'xpath': '/document/part/second'}], [], interceptor)
        list(compose(v.all.add('identifier', 'document', inputEvent)))

        self.assertEqual('<some>message</some>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
        secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
        self.assertEqual('<second>message</second>', lxmltostring(secondXml))
    def testFindUsingMultipleXPaths(self):
        # Every configured XPath is applied in order; a path without matches
        # contributes nothing, the others contribute one call per match.
        self.createXmlXPath(['/does/not/exist', '/a/b', '/a/b/c'], {})

        self.observable.do.test(data='<a><b><c>one</c></b><b><d>two</d></b></a>')

        self.assertEqual(3, len(self.observer.calledMethods))
        allResults = [method.kwargs['lxmlNode'] for method in self.observer.calledMethods]
        self.assertEqualsWS('<b><c>one</c></b>', lxmltostring(allResults[0]))
        self.assertEqualsWS('<b><d>two</d></b>', lxmltostring(allResults[1]))
        self.assertEqualsWS('<c>one</c>', lxmltostring(allResults[2]))
Esempio n. 13
0
    def testPartsWithUnicodeChars(self):
        # Non-ASCII characters must survive both part flavours: the escaped
        # XML taken from a text() node and the plain element.
        inputEvent = fromstring("""<document><part name="partone">&lt;some&gt;t€xt&lt;/some&gt;\n\n\n\n</part><part name="parttwo"><second>t€xt</second>\n\n\n\n</part></document>""")
        interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
        v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}, {'partname': 'parttwo', 'xpath': '/document/part/second'}], [], interceptor)
        list(compose(v.all.add('identifier', 'document', inputEvent)))

        firstXml = interceptor.calledMethods[1].kwargs['lxmlNode']
        self.assertEqual('<some>t€xt</some>', lxmltostring(firstXml))
        self.assertEqual('t€xt', firstXml.getroot().text)
        secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
        self.assertEqual('<second>t€xt</second>', lxmltostring(secondXml))
        self.assertEqual('t€xt', secondXml.getroot().text)
Esempio n. 14
0
    def testPartsWithUnicodeChars(self):
        # Non-ASCII characters must survive both part flavours: the escaped
        # XML taken from a text() node and the plain element.
        inputEvent = fromstring("""<document><part name="partone">&lt;some&gt;t€xt&lt;/some&gt;\n\n\n\n</part><part name="parttwo"><second>t€xt</second>\n\n\n\n</part></document>""")
        interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
        partSpecs = [
            {'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'},
            {'partname': 'parttwo', 'xpath': '/document/part/second'},
        ]
        v = createVenturiHelix(partSpecs, [], interceptor)
        list(compose(v.all.add('identifier', 'document', inputEvent)))

        partOne = interceptor.calledMethods[1].kwargs['lxmlNode']
        self.assertEqual('<some>t€xt</some>', lxmltostring(partOne))
        self.assertEqual('t€xt', partOne.getroot().text)
        partTwo = interceptor.calledMethods[2].kwargs['lxmlNode']
        self.assertEqual('<second>t€xt</second>', lxmltostring(partTwo))
        self.assertEqual('t€xt', partTwo.getroot().text)
    def testSimpleXPath(self):
        # A single matching XPath forwards the call once, keeps positional
        # arguments intact and replaces the data by the selected lxmlNode.
        self.createXmlXPath(['/root/path'], {})

        xml = '<root><path><to>me</to></path>\n</root>'
        self.observable.do.test('een tekst', data=xml)

        self.assertEqual(1, len(self.observer.calledMethods))
        method = self.observer.calledMethods[0]
        self.assertEqual('test', method.name)
        self.assertEqual(1, len(method.args))
        self.assertEqual('een tekst', method.args[0])
        self.assertEqualsWS('<path><to>me</to></path>', lxmltostring(method.kwargs['lxmlNode']))
        # Exact comparison: the newline following </path> in the input must be gone.
        self.assertEqual('<path><to>me</to></path>', lxmltostring(method.kwargs['lxmlNode']))
    def testXPathWithMultipleResults(self):
        self.createXmlXPath(['/root/element/data'], {})

        self.observable.do.aMethod(data="""<root>
    <element>
        <data>one</data>
    </element>
    <element>
        <data>two</data>
    </element>
</root>""")
        self.assertEquals(2, len(self.observer.calledMethods))
        self.assertEqualsWS('<data>one</data>', lxmltostring(self.observer.calledMethods[0].kwargs['lxmlNode']))
        self.assertEqualsWS('<data>two</data>', lxmltostring(self.observer.calledMethods[1].kwargs['lxmlNode']))
Esempio n. 17
0
 def testHandleWithTwoRecords(self):
     # A ListRecords response holding two records must result in one batch
     # with two 'add' calls, each carrying the record's own identifier,
     # datestamp and lxmlNode.
     observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
     oaiDownloadProcessor.addObserver(observer)
     secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
     list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN))))))
     self.assertEqual(['startOaiBatch', 'add', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
     addMethod0, addMethod1 = observer.calledMethods[1:3]
     self.assertEqual(0, len(addMethod0.args))
     self.assertEqualsWS(ONE_RECORD, lxmltostring(addMethod0.kwargs['lxmlNode']))
     self.assertEqual('2011-08-22T07:34:00Z', addMethod0.kwargs['datestamp'])
     self.assertEqual('oai:identifier:1', addMethod0.kwargs['identifier'])
     self.assertEqualsWS(secondRecord, lxmltostring(addMethod1.kwargs['lxmlNode']))
     self.assertEqual('2011-08-22T07:41:00Z', addMethod1.kwargs['datestamp'])
     self.assertEqual('oai:identifier:2', addMethod1.kwargs['identifier'])
Esempio n. 18
0
    def testNearRealtimeOaiSavesState(self):
        # After a stop/start cycle the harvester must only receive the record
        # added in between ("id1") and not the one it already got ("id0").
        observer = CallTrace("observer", ignoredAttributes=["observer_init"], methods={'add': lambda **kwargs: (x for x in [])})
        oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
        suspendRegister = SuspendRegister()
        oaiJazz.addObserver(suspendRegister)
        storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
        self._addOaiRecords(storageComponent, oaiJazz, 1)

        # Thread handles are kept in a dict shared with the nested helpers.
        # The previous `global` declarations did not bind the enclosing locals
        # and leaked the threads into the module namespace instead.
        threads = {}

        def start():
            # NOTE(review): self.run is presumably polled by startOaiPmh /
            # startOaiHarvester as their keep-running flag - confirm.
            self.run = True
            portNumber = randint(50000, 60000)
            threads['oaiPmh'] = Thread(None, lambda: self.startOaiPmh(portNumber, oaiJazz, storageComponent, suspendRegister))
            threads['harvest'] = Thread(None, lambda: self.startOaiHarvester(portNumber, observer))
            threads['oaiPmh'].start()
            threads['harvest'].start()

        def stop():
            self.run = False
            threads.pop('oaiPmh').join()
            threads.pop('harvest').join()

        start()
        try:
            requests = 1
            sleepWheel(1.0 + 1.0 * requests)
            self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
            kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
            self.assertTrue("id0" in kwarg, kwarg)
        finally:
            # Always stop the threads, also when an assertion above fails.
            stop()
        observer.calledMethods.reset()

        storageComponent.addData(identifier="id1", name="prefix", data="<a>a1</a>")
        oaiJazz.addOaiRecord(identifier="id1", sets=[], metadataFormats=[("prefix", "", "")])

        start()
        try:
            requests = 1
            sleepWheel(1.0 + 1.0 * requests)
            self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
            kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
            self.assertFalse("id0" in kwarg, kwarg)
            self.assertTrue("id1" in kwarg, kwarg)
        finally:
            stop()
Esempio n. 19
0
 def testOutline(self):
     # Overall flow: one 'begin' followed by one 'add' per configured part.
     # A part flagged asString=True arrives as kwarg 'data' (a string); the
     # other part arrives as kwarg 'lxmlNode'.
     inputEvent = fromstring(
         """<document><part name="partone">&lt;some&gt;message&lt;/some&gt;</part><part name="parttwo"><second>message</second></part></document>"""
     )
     interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
     v = createVenturiHelix([
         dict(partname='partone',
              xpath='/document/part[@name="partone"]/text()',
              asString=True),
         dict(partname='parttwo', xpath='/document/part/second')
     ], [], interceptor)
     list(compose(v.all.add('identifier', 'document', inputEvent)))
     self.assertEqual(['begin', 'add', 'add'],
                      [m.name for m in interceptor.calledMethods])
     self.assertEqual('identifier',
                      interceptor.calledMethods[1].kwargs['identifier'])
     self.assertEqual('partone',
                      interceptor.calledMethods[1].kwargs['partname'])
     self.assertEqual('<some>message</some>',
                      interceptor.calledMethods[1].kwargs['data'])
     self.assertEqual('identifier',
                      interceptor.calledMethods[2].kwargs['identifier'])
     self.assertEqual('parttwo',
                      interceptor.calledMethods[2].kwargs['partname'])
     secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
     self.assertEqual('<second>message</second>', lxmltostring(secondXml))
     self.assertEqual('second', secondXml.getroot().tag)
 def _processRecords(self, lxmlNode):
     """Pass every item of the current verb's response to self._add.

     Generator. For each child of the verb node whose tag is
     VERB_TAGNAME[self._verb] it yields the result of self._add(identifier=...,
     lxmlNode=..., datestamp=...) followed by a bare yield. After the last
     item the resumption token text found under the verb node is stored in
     self._resumptionToken.

     Raises IndexError("Invalid oai header") when an item neither contains a
     header child nor is a header itself. Exceptions raised by the self._add
     call are logged, recorded in self._errorState and re-raised.
     """
     verbNode = xpathFirst(lxmlNode, "/oai:OAI-PMH/oai:%s" % self._verb)
     for item in verbNode.iterchildren(tag=VERB_TAGNAME[self._verb]):
         header = None
         for h in item.iterchildren():
             if h.tag == HEADER_TAG:
                 header = h
                 break
         else:
             # No header child found: the item must be the header itself.
             if item.tag != HEADER_TAG:
                 raise IndexError("Invalid oai header")
             header = item
         # Reset for every item so that a header lacking one of these
         # children cannot silently inherit the previous item's value.
         identifier = datestamp = None
         for child in header.iterchildren():
             if child.tag == IDENTIFIER_TAG:
                 identifier = child.text
             elif child.tag == DATESTAMP_TAG:
                 datestamp = child.text
         try:
             yield self._add(identifier=identifier,
                             lxmlNode=ElementTree(item),
                             datestamp=datestamp)
         except Exception as e:
             self._logError(format_exc())
             self._logError("While processing:")
             self._logError(lxmltostring(item))
             self._errorState = "ERROR while processing '%s': %s" % (
                 identifier, str(e))
             raise
         yield  # some room for others
     self._resumptionToken = xpathFirst(verbNode,
                                        "oai:resumptionToken/text()")
Esempio n. 21
0
 def testListRecords(self):
     # ListRecords for 'prefix2': checks the number of records, the second
     # record's identifier, subject and set specs, and that the two deleted
     # records carry no metadata element.
     header, body = self._request(
         verb=['ListRecords'], metadataPrefix=['prefix2'])
     records = xpath(body, '/oai:OAI-PMH/oai:ListRecords/oai:record')
     self.assertEqual(10, len(records))
     self.assertEqual([self.prefix + 'record:id:11'],
                      xpath(records[1],
                            'oai:header/oai:identifier/text()'))
     self.assertEqual(['record:id:11'],
                      xpath(records[1],
                            'oai:metadata/oai_dc:dc/dc:subject/text()'),
                      lxmltostring(records[1]))
     self.assertEqual(['hierarchical', 'setSpec10'],
                      sorted(
                          xpath(records[1],
                                'oai:header/oai:setSpec/text()')))
     deletedRecords = xpath(
         body,
         '/oai:OAI-PMH/oai:ListRecords/oai:record[oai:header/@status="deleted"]'
     )
     self.assertEqual(2, len(deletedRecords))
     self.assertEqual(
         [0, 0], [len(xpath(r, 'oai:metadata')) for r in deletedRecords])
     self.assertEqual(['hierarchical', 'setSpec10'],
                      sorted(
                          xpath(deletedRecords[0],
                                'oai:header/oai:setSpec/text()')))
Esempio n. 22
0
 def testReadFromStorage(self):
     # The input document holds no 'partone'; the test expects the part to be
     # fetched through storage.getData(identifier=..., name=...) and the
     # returned XML to be forwarded as lxmlNode.
     inputEvent = fromstring('<document/>')
     interceptor = CallTrace('Interceptor',
                             ignoredAttributes=[
                                 'getData', 'all_unknown', 'any_unknown',
                                 'call_unknown'
                             ])
     interceptor.methods['add'] = yieldNothing
     storage = CallTrace('Storage',
                         ignoredAttributes=['add', 'all_unknown'])
     storage.returnValues['getData'] = '<some>this is partone</some>'
     v = createVenturiHelix(
         [{
             'partname': 'partone',
             'xpath': '/document/part[@name="partone"]/text()'
         }], [], interceptor, storage)
     list(compose(v.all.add('identifier', 'document', inputEvent)))
     self.assertEqual(['begin', 'add'],
                      [m.name for m in interceptor.calledMethods])
     self.assertEqual(
         '<some>this is partone</some>',
         lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
     self.assertEqual(['begin', 'getData'], storage.calledMethodNames())
     self.assertEqual(dict(identifier='identifier', name='partone'),
                      storage.calledMethods[1].kwargs)
Esempio n. 23
0
 def _processRecords(self, lxmlNode):
     """Pass every item of the current verb's response to self._add.

     Generator. For each child of the verb node whose tag is
     VERB_TAGNAME[self._verb] it yields the result of self._add(identifier=...,
     lxmlNode=..., datestamp=...) followed by a bare yield.

     Raises IndexError("Invalid oai header") when an item neither contains a
     header child nor is a header itself. Exceptions raised by the self._add
     call are logged, recorded in self._errorState and re-raised.
     """
     verbNode = xpathFirst(lxmlNode, "/oai:OAI-PMH/oai:%s" % self._verb)
     for item in verbNode.iterchildren(tag=VERB_TAGNAME[self._verb]):
         header = None
         for h in item.iterchildren():
             if h.tag == HEADER_TAG:
                 header = h
                 break
         else:
             # No header child found: the item must be the header itself.
             if item.tag != HEADER_TAG:
                 raise IndexError("Invalid oai header")
             header = item
         # Reset for every item so that a header lacking one of these
         # children cannot silently inherit the previous item's value.
         identifier = datestamp = None
         for child in header.iterchildren():
             if child.tag == IDENTIFIER_TAG:
                 identifier = child.text
             elif child.tag == DATESTAMP_TAG:
                 datestamp = child.text
         try:
             yield self._add(identifier=identifier, lxmlNode=ElementTree(item), datestamp=datestamp)
         except Exception as e:  # 'as' form is valid on Python 2.6+ and 3
             self._logError(format_exc())
             self._logError("While processing:")
             self._logError(lxmltostring(item))
             self._errorState = "ERROR while processing '%s': %s" % (identifier, str(e))
             raise
         yield # some room for others
Esempio n. 24
0
 def testOnlyPassPartsSpecified(self):
     # Only the configured part ('partone') may be forwarded; 'parttwo' is
     # present in the input but must not lead to an add.
     inputEvent = fromstring("""<document><part name="partone">&lt;some&gt;message&lt;/some&gt;</part><part name="parttwo"><second/></part></document>""")
     interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
     partSpecs = [{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}]
     v = createVenturiHelix(partSpecs, [], interceptor)
     list(compose(v.all.add('identifier', 'document', inputEvent)))
     calledNames = [m.name for m in interceptor.calledMethods]
     self.assertEqual(['begin', 'add'], calledNames)
     forwarded = interceptor.calledMethods[1].kwargs['lxmlNode']
     self.assertEqual('<some>message</some>', lxmltostring(forwarded))
Esempio n. 25
0
 def testTwoTags(self):
     # Feeding the parser an element in three chunks must leave the complete
     # element in target.root.
     target = Target('aap')
     p = XMLParser(target = target)
     p.feed("<aap>")
     p.feed("noot")
     p.feed("</aap>")
     self.assertEqual("<aap>noot</aap>", lxmltostring(target.root))
    def testTestWithCondition(self):
        # An XPath predicate excluding <b> and <c> leaves only <d>.
        self.createXmlXPath(['/a/*[not(self::b) and not(self::c)]'], {})

        self.observable.do.test(data='<a><b>zero</b><c>one</c><d>two</d></a>')

        self.assertEqual(1, len(self.observer.calledMethods))
        self.assertEqualsWS('<d>two</d>', lxmltostring(self.observer.calledMethods[0].kwargs['lxmlNode']))
Esempio n. 27
0
 def testOnlyPassPartsSpecified(self):
     # Only the configured part ('partone') may be forwarded; 'parttwo' is
     # present in the input but must not lead to an add.
     inputEvent = fromstring("""<document><part name="partone">&lt;some&gt;message&lt;/some&gt;</part><part name="parttwo"><second/></part></document>""")
     interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
     v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}], [], interceptor)
     list(compose(v.all.add('identifier', 'document', inputEvent)))
     self.assertEqual(['begin', 'add'], [m.name for m in interceptor.calledMethods])
     self.assertEqual('<some>message</some>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
Esempio n. 28
0
    def testListMetadataFormatsForWrongIdentifier(self):
        # An unknown identifier must yield the OAI error code 'idDoesNotExist'.
        header, body = self._request(
            verb=['ListMetadataFormats'], identifier=['does:not:exist'])

        # The pretty-printed response body doubles as the failure message.
        self.assertEqual(['idDoesNotExist'],
                         xpath(body, '/oai:OAI-PMH/oai:error/@code'),
                         lxmltostring(body, pretty_print=True))
Esempio n. 29
0
 def testTwoTags(self):
     # Feeding the parser an element in three chunks must leave the complete
     # element in target.root.
     target = Target('aap')
     parser = XMLParser(target=target)
     for chunk in ("<aap>", "noot", "</aap>"):
         parser.feed(chunk)
     serialized = lxmltostring(target.root)
     self.assertEqual("<aap>noot</aap>", serialized)
Esempio n. 30
0
 def testCouldHave(self):
     # A part listed only in the second (could-have) list is forwarded when
     # it is present in the document.
     inputEvent = fromstring('<document><one/></document>')
     ignored = ['getData', 'all_unknown', 'any_unknown', 'call_unknown']
     interceptor = CallTrace('Interceptor', ignoredAttributes=ignored, methods={'add': yieldNothing})
     couldHave = [{'partname': 'one', 'xpath': '/document/one'}]
     v = createVenturiHelix([], couldHave, interceptor)
     list(compose(v.all.add('identifier', 'document', inputEvent)))
     calledNames = [m.name for m in interceptor.calledMethods]
     self.assertEqual(['begin', 'add'], calledNames)
     forwarded = interceptor.calledMethods[1].kwargs['lxmlNode']
     self.assertEqual('<one/>', lxmltostring(forwarded))
Esempio n. 31
0
 def testCouldHave(self):
     # A part listed only in the second (could-have) list is forwarded when
     # it is present in the document.
     inputEvent = fromstring('<document><one/></document>')
     interceptor = CallTrace('Interceptor', ignoredAttributes=['getData', 'all_unknown', 'any_unknown', 'call_unknown'], methods={'add': yieldNothing})
     v = createVenturiHelix([], [{'partname': 'one', 'xpath': '/document/one'}], interceptor)
     list(compose(v.all.add('identifier', 'document', inputEvent)))
     self.assertEqual(['begin', 'add'], [m.name for m in interceptor.calledMethods])
     self.assertEqual('<one/>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
 def _combine(self, erfgeoEnrichment, summary):
     """Yield the pieces of one rdf:RDF document merging both inputs.

     Generator. Emits an opening rdf:RDF tag, then the serialised top-level
     children of `summary` followed by those of `erfgeoEnrichment` (each
     input is parsed with XML and queried with '/rdf:RDF/*'), then the
     closing tag.
     """
     yield '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
     # summary first, enrichment second: the output order follows this tuple.
     for document in (summary, erfgeoEnrichment):
         for node in xpath(XML(document), '/rdf:RDF/*'):
             yield lxmltostring(node)
     yield '</rdf:RDF>'
Esempio n. 33
0
def dumpOai(port,
            path,
            oaiDumpDir,
            metadataPrefix,
            set_=None,
            host=None,
            limit=None,
            append=False):
    """Harvest an OAI-PMH endpoint and dump its records into `oaiDumpDir`.

    For every harvested item a line ``ADD|DEL <filename> |<sorted setSpecs>|``
    is appended to ``oai.ids`` in the dump directory; for items that are not
    deleted the pretty-printed metadata is written to a file named after the
    escaped ``<identifier>.<metadataPrefix>``.

    port, path     -- combined with `host` into ``http://host:port<path>``.
    oaiDumpDir     -- target directory; removed and recreated unless `append`.
    metadataPrefix -- passed to the harvester and used as filename suffix.
    set_           -- passed to the harvester as ``set``.
    host           -- defaults to ``127.0.0.1`` when falsy.
    limit          -- maximum number of items to process (``None`` = no limit).
    append         -- when true, keep the existing directory and add to it.
    """
    host = host or '127.0.0.1'
    baseurl = 'http://%s:%s%s' % (host, port, path)
    if not append:
        # Start from a clean directory.
        if isdir(oaiDumpDir):
            rmtree(oaiDumpDir)
        makedirs(oaiDumpDir)
    with open(join(oaiDumpDir, 'oai.ids'), 'a') as ids:
        for oaiItem in islice(
                iterateOaiPmh(baseurl=baseurl,
                              metadataPrefix=metadataPrefix,
                              set=set_), limit):
            filename = '%s.%s' % (oaiItem.identifier, metadataPrefix)
            ids.write('%s %s |%s|\n' %
                      ('DEL' if oaiItem.deleted else 'ADD', filename, '|'.join(
                          sorted(oaiItem.setSpecs))))
            if not oaiItem.deleted:
                # Close every record file deterministically instead of
                # relying on garbage collection to release the handle.
                with open(join(oaiDumpDir, escapeFilename(filename)),
                          'w') as recordFile:
                    recordFile.write(
                        lxmltostring(oaiItem.metadata, pretty_print=True))
    print("Oai dump created in %s" % oaiDumpDir)
Esempio n. 34
0
    def testAddInitialRecord(self):
        uri = "some:uri"

        rdfDescription = """<rdf:Description rdf:about="%s" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://www.openarchives.org/OAI/2.0/">
    <dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" xml:lang="en">title</dc:title>
    <prov:wasDerivedFrom xmlns:prov="http://www.w3.org/ns/prov#">
        <prov:Entity>
            <dcterms:source rdf:resource="http://first.example.org"/>
        </prov:Entity>
    </prov:wasDerivedFrom>
</rdf:Description>""" % uri

        lxmlNode = parse(StringIO("""<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        %s
</rdf:RDF>""" % rdfDescription))

        consume(self.dna.all.add(identifier="identifier", lxmlNode=lxmlNode))

        record = self.oaiJazz.getRecord(identifier=uri)
        expected = XML(lxmltostring(xpathFirst(lxmlNode, '//rdf:RDF')))
        cleanup_namespaces(expected)
        self.assertXmlEquals(expected, self.storage.getData(identifier=record.identifier, name='rdf'))

        self.assertEquals(set(['rdf']), record.prefixes)
        self.assertEquals(set(), record.sets)

        self.plein.close()
        plein2 = self._newPlein()
        self.assertEquals(['some:uri'], [fragment.uri for fragment in plein2._fragmentsForRecord('identifier')])
Esempio n. 35
0
 def handleRequest(self, Body='', **kwargs):
     """Handle one update request: dump it to disk and build the response XML.

     First yields a fixed HTTP 200 header. `Body` is then parsed as XML; the
     record identifier and action are read from it. The request is written,
     pretty-printed, to a numbered ``<nr>_<action>.updateRequest`` file in
     ``self._dumpdir`` and a "success" response is built.

     An InvalidDataException is raised (and turned into a "fail" response
     with a diagnostic) when ``self._allInvalid`` is set and the action is a
     replace. A plain Exception is raised for identifiers listed in
     ``self._raiseExceptionOnIds``; that one is not caught here.

     NOTE(review): within the visible code `answer` is built but never
     yielded -- presumably the remainder of this handler emits it; confirm.
     """
     yield '\r\n'.join(['HTTP/1.0 200 Ok', 'Content-Type: text/xml; charset=utf-8\r\n', ''])
     try:
         updateRequest = XML(Body)
         recordId = xpathFirst(updateRequest, 'ucp:recordIdentifier/text()')
         action = xpathFirst(updateRequest, 'ucp:action/text()')
         if self._allInvalid and action == "info:srw/action/1/replace":
             # One specific record is rejected without a message, all others
             # with 'Invalid data'.
             if 'oai:record:02' in recordId:
                 raise InvalidDataException()
             raise InvalidDataException('Invalid data')
         if recordId in self._raiseExceptionOnIds:
             raise Exception("ERROR")
         self._number += 1
         # The last path segment of the action URI becomes part of the filename.
         filename = '%05d_%s.updateRequest' % (self._number, action.rsplit('/')[-1])
         with open(join(self._dumpdir, filename), 'w') as f:
             stdout.flush()
             f.write(lxmltostring(updateRequest, pretty_print=True))
         answer = RESPONSE_XML % {
             "operationStatus": "success",
             "diagnostics": ""}
     # 'except X as e' is valid on Python 2.6+ and 3; the old 'except X, e'
     # form is a SyntaxError on Python 3.
     except InvalidDataException as e:
         answer = RESPONSE_XML % {
             "operationStatus": "fail",
             "diagnostics": DIAGNOSTIC_XML % {
                 'uri': 'info:srw/diagnostic/12/12',
                 'details': escapeXml(str(e)),
                 'message': 'Invalid data:  record rejected'}}
Esempio n. 36
0
 def _log(self, message, *args, **kwargs):
     printKwargs = dict(kwargs)
     for key, value in kwargs.items():
         if type(value) == ElementTreeType:
             printKwargs[key] = "%s(%s)" % (value.__class__.__name__, lxmltostring(value))
     sys.stdout.write("[%s] %s(*%s, **%s)\n" % (self.observable_name(), message, args, printKwargs))
     sys.stdout.flush()
Esempio n. 37
0
    def testNearRealtimeOai(self):
        # End-to-end check of an OAI-PMH server and a harvester running in
        # two threads: the harvester first receives the initial records, is
        # then registered in the SuspendRegister, and receives a later added
        # record without further action from the test.
        # self.run is set to False in the finally block below; presumably the
        # server/harvester loops poll it to know when to stop -- confirm in
        # startOaiPmh / startOaiHarvester.
        self.run = True
        portNumber = randint(50000, 60000)
        suspendRegister = SuspendRegister()
        oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
        oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
        oaiJazz.addObserver(suspendRegister)
        storageComponent = MultiSequentialStorage(join(self.tempdir,
                                                       'storage'))
        # Presumably creates 3 records (id0..id2, judging by the assertion on
        # `ids` below) -- see _addOaiRecords.
        self._addOaiRecords(storageComponent, oaiJazz, 3)
        oaiPmhThread = Thread(
            None, lambda: self.startOaiPmh(portNumber, oaiJazz,
                                           storageComponent, suspendRegister))

        # The observer records every call; its 'add' returns an empty generator.
        observer = CallTrace("observer",
                             ignoredAttributes=["observer_init"],
                             methods={'add': lambda **kwargs: (x for x in [])})
        harvestThread = Thread(
            None, lambda: self.startOaiHarvester(portNumber, observer))

        oaiPmhThread.start()
        harvestThread.start()

        try:
            requests = 3
            # Wait long enough for the expected number of requests to complete.
            sleepWheel(1.0 + 1.0 * requests)

            # Expected: one batch with two adds followed by one batch with one add.
            self.assertEqual([
                'startOaiBatch', 'add', 'add', 'stopOaiBatch', 'startOaiBatch',
                'add', 'stopOaiBatch'
            ], [m.name for m in observer.calledMethods])
            ids = [
                xpath(m.kwargs['lxmlNode'],
                      '//oai:header/oai:identifier/text()')
                for m in observer.calledMethods if m.name == 'add'
            ]
            self.assertEqual([['id0'], ['id1'], ['id2']], ids)

            # Exactly one entry is registered as suspended at this point.
            self.assertEqual(1, len(suspendRegister))
            observer.calledMethods.reset()

            # Add a fourth record while the harvester is suspended.
            requests += 1
            storageComponent.addData(identifier="id3",
                                     name="prefix",
                                     data=b"<a>a3</a>")
            oaiJazz.addOaiRecord(identifier="id3", metadataPrefixes=["prefix"])
            sleepWheel(1)

            # The register is empty now and the new record has been delivered.
            self.assertEqual(0, len(suspendRegister))
            self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                             [m.name for m in observer.calledMethods])
            kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
            self.assertTrue("id3" in kwarg, kwarg)
            # After another second one entry is registered as suspended again.
            sleepWheel(1.0)
            self.assertEqual(1, len(suspendRegister))
        finally:
            # Always stop both threads and release the OaiJazz resources.
            self.run = False
            oaiPmhThread.join()
            harvestThread.join()
            oaiJazz.close()
Esempio n. 38
0
 def assertWaterMarked(**oaiArgs):
     """Assert that the OAI-PMH response for `oaiArgs` carries the watermark comment.

     When no comment is found below the OAI-PMH root, the response body is
     printed before the error is re-raised.
     """
     _header, responseBody = self._request(**oaiArgs)
     try:
         comments = xpath(responseBody, "/oai:OAI-PMH/comment()")
         watermark = comments[0]
     except:
         # Show the full response to ease debugging, then propagate the error.
         print(lxmltostring(responseBody, pretty_print=True))
         raise
     self.assertEqual(" Watermarked by Seecr ", watermark.text)
0
 def testFilterTag(self):
     # Feed a <mies> element nested inside another <mies> element in four
     # chunks; the target's root must serialise to the complete nested
     # structure.
     target = Target('mies')
     parser = XMLParser(target=target)
     for chunk in ("<mies><mies>", "noot", "</mies>", "</mies>"):
         parser.feed(chunk)
     serialised = lxmltostring(target.root)
     self.assertEqual("<mies><mies>noot</mies></mies>", serialised)
Esempio n. 40
0
 def testLxmltostring(self):
     # lxmltostring keeps a non-ASCII character in an attribute value as a
     # real character (UTF-8 bytes after encoding), whereas lxml's own
     # tostring with encoding='UTF-8' emits it as the character reference
     # &#xE1; inside the attribute.
     from lxml.etree import tostring
     uri = "Baháma's"
     xml = """<root><sub><subsub attribute="%s">%s</subsub></sub></root>""" % (uri, uri)
     tree = parse(StringIO(xml))
     subNode = tree.xpath("sub")[0]
     subSubNode = tree.xpath("sub/subsub")[0]

     subNodeBytes = lxmltostring(subNode).encode('utf-8')
     self.assertEqual(
         b"""<sub><subsub attribute="Bah\xc3\xa1ma's">Bah\xc3\xa1ma's</subsub></sub>""",
         subNodeBytes)

     plainLxmlBytes = tostring(subSubNode, encoding='UTF-8')
     self.assertEqual(
         b"""<subsub attribute="Bah&#xE1;ma's">Bah\xc3\xa1ma's</subsub>""",
         plainLxmlBytes)

     subSubNodeBytes = lxmltostring(subSubNode).encode('utf-8')
     self.assertEqual(
         b"""<subsub attribute="Bah\xc3\xa1ma's">Bah\xc3\xa1ma's</subsub>""",
         subSubNodeBytes)
Esempio n. 41
0
    def testFileParseLxml(self):
        # FileParseLxml must turn the file-like object in the 'filedata'
        # keyword into a parsed tree passed on as 'lxmlNode'; checked for an
        # in-memory StringIO and for a real file on disk.
        observable = Observable()
        observer = CallTrace('observer')
        fileParser = FileParseLxml(fromKwarg='filedata', toKwarg='lxmlNode')
        observable.addObserver(fileParser)
        fileParser.addObserver(observer)

        inMemory = StringIO('<a>aaa</a>')
        observable.do.someMessage(filedata=inMemory)
        firstCall = observer.calledMethods[0]
        self.assertEqual('<a>aaa</a>', lxmltostring(firstCall.kwargs['lxmlNode']))

        with open(self.tempfile, 'w') as out:
            out.write('<b>bbb</b>')
        with open(self.tempfile) as onDisk:
            observable.do.someMessage(filedata=onDisk)
            secondCall = observer.calledMethods[1]
            self.assertEqual('<b>bbb</b>', lxmltostring(secondCall.kwargs['lxmlNode']))
Esempio n. 42
0
 def testFilterTag(self):
     # Feed a <mies> element nested inside another <mies> element in four
     # chunks; the target's root must serialise to the complete nested
     # structure.
     target = Target('mies')
     p = XMLParser(target=target)
     p.feed("<mies><mies>")
     p.feed("noot")
     p.feed("</mies>")
     p.feed("</mies>")
     # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual("<mies><mies>noot</mies></mies>", lxmltostring(target.root))
Esempio n. 43
0
 def testBugListRecordsReturnsDoubleValueOnNoRecordsMatch(self):
     # A ListRecords request with a 'from' date far in the future must yield
     # exactly one error code: noRecordsMatch. The pretty-printed body is the
     # failure message.
     header, body = self._request(
         verb=['ListRecords'],
         metadataPrefix=['oai_dc'],
         from_=['9999-01-01'])
     # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(['noRecordsMatch'],
                      xpath(body, '/oai:OAI-PMH/oai:error/@code'),
                      lxmltostring(body, pretty_print=True))
Esempio n. 44
0
    def testListAllMetadataFormats(self):
        # ListMetadataFormats without an identifier must return no error and
        # exactly two formats, each with the expected prefix, schema and
        # namespace.
        header, body = self._request(verb=['ListMetadataFormats'])

        errors = xpath(body, '/oai:OAI-PMH/oai:error')
        self.assertEqual(0, len(errors))
        formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
        self.assertEqual(2, len(formats), lxmltostring(body, pretty_print=True))

        def firstOfEach(path):
            # First match of `path` within every metadataFormat element.
            return [xpath(metadataFormat, path)[0] for metadataFormat in formats]

        self.assertEqual(['oai_dc', 'prefix2'], firstOfEach('oai:metadataPrefix/text()'))
        self.assertEqual(['http://www.openarchives.org/OAI/2.0/oai_dc.xsd', 'http://example.org/prefix2/?format=xsd&prefix=2'], firstOfEach('oai:schema/text()'))
        self.assertEqual(['http://www.openarchives.org/OAI/2.0/oai_dc/', 'http://example.org/prefix2/'], firstOfEach('oai:metadataNamespace/text()'))
Esempio n. 45
0
    def testPlainXml(self):
        # A text/xml response must come back as a parsed body that serialises
        # to the original XML, together with the raw header lines.
        client = HttpClient()
        self.response = """HTTP/1.0 200 OK\r\nContent-Type: text/xml\r\n\r\n<xml/>"""

        gen = client.httpGet(hostname='localhost', port=80, path='/', arguments={})
        headers, body = retval(gen)

        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual('<xml/>', lxmltostring(body))
        self.assertEqual(['HTTP/1.0 200 OK', 'Content-Type: text/xml'], headers.split(CRLF))
Esempio n. 46
0
 def _log(self, message, *args, **kwargs):
     printKwargs = dict(kwargs)
     for key, value in kwargs.items():
         if type(value) == ElementTreeType:
             printKwargs[key] = "%s(%s)" % (value.__class__.__name__,
                                            lxmltostring(value))
     sys.stdout.write("[%s] %s(*%s, **%s)\n" %
                      (self.observable_name(), message, args, printKwargs))
     sys.stdout.flush()
Esempio n. 47
0
    def testListAllMetadataFormats(self):
        # ListMetadataFormats without an identifier must return no error and
        # exactly two formats, each with the expected prefix, schema and
        # namespace.
        header, body = self._request(verb=['ListMetadataFormats'])

        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(0, len(xpath(body, '/oai:OAI-PMH/oai:error')))
        formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
        self.assertEqual(2, len(formats), lxmltostring(body, pretty_print=True))
        self.assertEqual(['oai_dc', 'prefix2'], [xpath(f, 'oai:metadataPrefix/text()')[0] for f in formats])
        self.assertEqual(['http://www.openarchives.org/OAI/2.0/oai_dc.xsd', 'http://example.org/prefix2/?format=xsd&prefix=2'], [xpath(f, 'oai:schema/text()')[0] for f in formats])
        self.assertEqual(['http://www.openarchives.org/OAI/2.0/oai_dc/', 'http://example.org/prefix2/'], [xpath(f, 'oai:metadataNamespace/text()')[0] for f in formats])
Esempio n. 48
0
    def testNamespaces(self):
        # An XPath using prefixes 'a' and 'b' must select the bNode, and the
        # resulting node must carry the same namespace map as the one obtained
        # by serialising and re-parsing it.
        xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces={'a': 'aNamespace', 'b': 'bNamespace'})
        lxmlNode = parse(StringIO('<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'))
        observer = CallTrace('Observer')
        observable = Observable()
        observable.addObserver(xmlXPath)
        xmlXPath.addObserver(observer)

        observable.do.message(lxmlNode=lxmlNode)

        message = observer.calledMethods[0]
        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual('message', message.name)
        newNode = message.kwargs['lxmlNode']
        self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(newNode))

        # No namespace declarations of the parent may linger on the new root.
        newNamespaces = newNode.getroot().nsmap
        nameSpacesAfterParsing = parse(StringIO(lxmltostring(newNode))).getroot().nsmap
        self.assertEqual(nameSpacesAfterParsing, newNamespaces)
Esempio n. 49
0
    def testFileParseLxml(self):
        # FileParseLxml must turn the file-like object in the 'filedata'
        # keyword into a parsed tree passed on as 'lxmlNode'; checked for an
        # in-memory StringIO and for a real file on disk.
        observable = Observable()
        observer = CallTrace('observer')
        p = FileParseLxml(fromKwarg='filedata', toKwarg='lxmlNode')
        observable.addObserver(p)
        p.addObserver(observer)
        a = StringIO('<a>aaa</a>')
        with open(self.tempfile, 'w') as f:
            f.write('<b>bbb</b>')

        observable.do.someMessage(filedata=a)
        lxmlA = observer.calledMethods[0].kwargs['lxmlNode']
        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual('<a>aaa</a>', lxmltostring(lxmlA))

        # Open the file in a with-block so the handle is closed even when an
        # assertion fails (the handle used to be leaked).
        with open(self.tempfile) as b:
            observable.do.someMessage(filedata=b)
            lxmlB = observer.calledMethods[1].kwargs['lxmlNode']
            self.assertEqual('<b>bbb</b>', lxmltostring(lxmlB))
    def testSendWithAbout(self):
        # Sending an upload that has an 'about' part must write a record file
        # whose oai:about element serialises to exactly that about XML.
        ABOUT = '<about xmlns="%(oai)s">abouttext</about>' % namespaces
        recordFile = self.tempdir + '/group/repo/id.record'
        # Force the uploader to write to a known location.
        self.uploader._filenameFor = lambda *args: recordFile

        upload = createUpload(about=ABOUT)
        self.uploader.send(upload)

        self.assertTrue(isfile(recordFile))
        # Parse inside a with-block so the file handle is closed again.
        with open(recordFile) as f:
            about = xpathFirst(parse(f), '//oai:about')
        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(ABOUT, lxmltostring(about))
    def testElementInKwargs(self):
        # The XML passed in the 'data' keyword must be replaced by an
        # 'lxmlNode' keyword holding the element matched by /root/path, while
        # positional and other keyword arguments are passed on.
        self.createXmlXPath(['/root/path'], {})

        self.observable.do.aMethod('otherArgument', data='<root><path><to>me</to></path></root>', otherKeyword='okay')

        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(1, len(self.observer.calledMethods))
        method = self.observer.calledMethods[0]
        self.assertEqual('aMethod', method.name)
        self.assertEqual(1, len(method.args))
        self.assertEqual({'otherKeyword', 'lxmlNode'}, set(method.kwargs))
        self.assertEqualsWS('<path><to>me</to></path>', lxmltostring(method.kwargs['lxmlNode']))
Esempio n. 52
0
def assertValidOai(lxmlTree=None, aXmlString=None):
    """Validate an OAI response against the schema from getSchema().

    Either `aXmlString` is validated as given, or -- when it is None --
    `lxmlTree` is serialised (pretty-printed) and validated instead.

    Returns the parsed tree when validation produced no errors. Otherwise the
    validated XML is printed with line numbers and an AssertionError carrying
    the last schema error is raised.
    """
    schema = getSchema()
    # Identity test: only a missing string triggers serialisation of the tree.
    if aXmlString is None:
        aXmlString = lxmltostring(lxmlTree, pretty_print=True)
    tree = parse(BytesIO(aXmlString.encode()))
    schema.validate(tree)
    if schema.error_log:
        # Numbered source lines make the schema error easy to locate.
        for nr, line in enumerate(aXmlString.split('\n'), 1):
            print(nr, line)
        raise AssertionError(schema.error_log.last_error)
    return tree
Esempio n. 53
0
 def delete(self, anUpload):
     """Process the deletion of `anUpload` on the file system and log it.

     With ``self._target.oaiEnvelope`` set, the output created for the upload
     is written to the upload's file. Otherwise the upload's file is removed
     when it exists and the escaped upload id is appended to the
     ``deleted_records`` file in the target path.
     """
     filename = self._filenameFor(anUpload)
     if self._target.oaiEnvelope:
         xmlResult = self._createOutput(anUpload)
         with open(filename, 'w') as fd:
             fd.write(lxmltostring(xmlResult))
     else:
         if os.path.isfile(filename):
             os.remove(filename)
         deletedRecords = os.path.join(self._target.path, 'deleted_records')
         with open(deletedRecords, 'a') as f:
             f.write('%s\n' % escapeFilename(anUpload.id))
     self._logDelete(anUpload.id)