def testListMetadataFormatsForIdentifier(self):
    # ListMetadataFormats for one known record identifier must produce no
    # OAI error and exactly one metadataFormat whose prefix is 'oai_dc'.
    header, body = self._request(verb=['ListMetadataFormats'], identifier=[self.prefix + 'record:id:01'])
    # The pretty-printed response doubles as the failure message.
    self.assertEqual(0, len(xpath(body, '/oai:OAI-PMH/oai:error')), lxmltostring(body, pretty_print=True))
    formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
    self.assertEqual(1, len(formats), lxmltostring(body, pretty_print=True))
    self.assertEqual(['oai_dc'], xpath(formats[0], 'oai:metadataPrefix/text()'))
def html_to_etree(in_str, remove_blank_text=True):
    """
    Parse a tree of possibly malformed HTML5, following WHATWG HTML5 rules.

    The result is one of:
    - the parsed input;
    - the parsed input wrapped in either a `div' or a `span' when multiple
      fragments (> 1 top-level tags) are given;
    - None for empty input.
    """
    if in_str is None:
        return None
    if not isinstance(in_str, basestring):
        raise ValueError('input must be a string')

    normalized = _nfc(in_str).strip()
    if not normalized:
        return None

    # Double-parse to remove (hopefully irrelevant) whitespace - some
    # not-so-irrelevant whitespace will most likely be removed too.
    tree = fromstring(normalized, parser=_html5Parser)
    # ATTENTION: tag/attributes namespace-info mangled here due to html5lib bugs.
    _etree_mutate_fix_localname(tree)

    if remove_blank_text:
        serialized = lxmltostring(tree)
        tree = parse(StringIO(serialized), parser=_xmlParser)
        tree = fromstring(lxmltostring(tree), parser=_html5Parser)
        # The mangled names spawn again after fromstring, so fix them again.
        _etree_mutate_fix_localname(tree)

    if hasattr(tree, 'getroot'):
        return tree.getroot()
    return tree
def testNamespaces(self):
    # A prefixed XPath must select the default-namespaced <bNode>, and the
    # node passed on must have the same nsmap as a re-parsed copy of its
    # own serialization.
    prefixes = {'a': 'aNamespace', 'b': 'bNamespace'}
    xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces=prefixes)
    source = '<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'
    lxmlNode = parse(StringIO(source))

    observer = CallTrace('Observer')
    observable = Observable()
    observable.addObserver(xmlXPath)
    xmlXPath.addObserver(observer)

    observable.do.message(lxmlNode=lxmlNode)

    received = observer.calledMethods[0]
    self.assertEqual('message', received.name)
    selected = received.kwargs['lxmlNode']
    self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(selected))

    actualNsmap = selected.getroot().nsmap
    reparsed = parse(StringIO(lxmltostring(selected)))
    self.assertEqual(reparsed.getroot().nsmap, actualNsmap)
def testHandleWithTwoRecords(self):
    # A ListRecords response with two records and a resumption token must
    # lead to one batch in which each record is handed to 'add' with its
    # own identifier, datestamp and lxmlNode.
    observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True)
    processor.addObserver(observer)
    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'

    responseText = LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN)
    responseTree = parse(StringIO(responseText))
    list(compose(processor.handle(responseTree)))

    calledNames = [m.name for m in observer.calledMethods]
    self.assertEqual(['startOaiBatch', 'add', 'add', 'stopOaiBatch'], calledNames)

    firstAdd, secondAdd = observer.calledMethods[1:3]
    self.assertEqual(0, len(firstAdd.args))
    self.assertEqualsWS(ONE_RECORD, lxmltostring(firstAdd.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', firstAdd.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', firstAdd.kwargs['identifier'])
    self.assertEqualsWS(secondRecord, lxmltostring(secondAdd.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:41:00Z', secondAdd.kwargs['datestamp'])
    self.assertEqual('oai:identifier:2', secondAdd.kwargs['identifier'])
def testQueryTimeInExtraResponse(self):
    # With includeQueryTimes=True the SRU response must carry a <querytimes>
    # block in extraResponseData, and the same timings must be reported to
    # the observer through 'handleQueryTimes'.
    handler = SruHandler(includeQueryTimes=True)
    observer = CallTrace('observer', emptyGeneratorMethods=['echoedExtraRequestData', 'extraResponseData'])
    # Canned clock: each call to _timeNow consumes the next value.
    times = [1, 2.5, 3.5]
    def timeNow():
        return times.pop(0)
    handler._timeNow = timeNow

    def executeQuery(**kwargs):
        response = Response(total=0, hits=[])
        response.queryTime = 5
        # The response is handed back through StopIteration; the unreachable
        # 'yield' below is what makes this function a generator.
        # NOTE(review): under PEP 479 (Python 3.7+) raising StopIteration
        # inside a generator becomes RuntimeError - confirm target version.
        raise StopIteration(response)
        yield
    observer.methods['executeQuery'] = executeQuery
    handler.addObserver(observer)

    arguments = dict(startRecord=11, maximumRecords=15, query='query', recordPacking='string', recordSchema='schema')
    result = "".join(compose(handler.searchRetrieve(sruArguments=arguments, **arguments)))
    sruResponse = parse(StringIO(result))
    extraResponseData = sruResponse.xpath('/srw:searchRetrieveResponse/srw:extraResponseData', namespaces={'srw':"http://www.loc.gov/zing/srw/"})[0]
    self.assertEqualsWS("""<srw:extraResponseData %(xmlns_srw)s %(xmlns_diag)s %(xmlns_xcql)s %(xmlns_dc)s %(xmlns_meresco_srw)s> <querytimes xmlns="http://meresco.org/namespace/timing"> <sruHandling>PT2.500S</sruHandling> <sruQueryTime>PT1.500S</sruQueryTime> <index>PT0.005S</index> </querytimes> </srw:extraResponseData>""" % namespaces, lxmltostring(extraResponseData))
    queryTimes = lxmltostring(extraResponseData.xpath('//ti:querytimes', namespaces={'ti':"http://meresco.org/namespace/timing"})[0])
    assertValid(queryTimes, join(schemasPath, 'timing-20120827.xsd'))
    self.assertEqual(['executeQuery', 'echoedExtraRequestData', 'extraResponseData', 'handleQueryTimes'], observer.calledMethodNames())
    self.assertEqual({'sru': Decimal("2.500"), 'queryTime': Decimal("1.500"), 'index': Decimal("0.005")}, observer.calledMethods[3].kwargs)
def testTailTakenCareOfWithoutAffectingOriginal(self):
    # The node selected by XmlXPath must be passed on without the tail that
    # follows it in the input, while the input tree itself must serialize
    # exactly as before.
    # NOTE(review): the exact-equality assertions below depend on the
    # whitespace inside the string literals; the literals are reproduced as
    # found here - verify their line layout against the original file.
    observer = CallTrace('observer', methods={'test': lambda *args, **kwargs: (x for x in [])})
    observable = be(
        (Observable(),
            (XmlXPath(['/myns:root/myns:path'], fromKwarg='lxmlNode', namespaces={'myns': 'http://myns.org/'}),
                (observer, ),
            )
        )
    )
    XML = """\ <root xmlns:myns="http://myns.org/" xmlns="http://myns.org/"> <myns:path> <to>me</to> </myns:path>\n </root>"""
    lxmlNode = parse(StringIO(XML))
    # Sanity check: parsing and serializing round-trips the input.
    self.assertEqual(XML, lxmltostring(lxmlNode))

    list(compose(observable.all.test('een tekst', lxmlNode=lxmlNode)))

    self.assertEqual(1, len(observer.calledMethods))
    method = observer.calledMethods[0]
    self.assertEqual('test', method.name)
    self.assertEqualsWS('<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/"><to>me</to></myns:path>', lxmltostring(method.kwargs['lxmlNode']))
    self.assertEqual("""\ <myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/"> <to>me</to> </myns:path>""", lxmltostring(method.kwargs['lxmlNode']))
    # The original tree must be untouched.
    self.assertEqual(XML, lxmltostring(lxmlNode))
def testListMetadataFormatsForIdentifier(self):
    # Requesting the metadata formats of a single record must give an
    # error-free response listing only the 'oai_dc' format.
    recordId = self.prefix + 'record:id:01'
    header, body = self._request(verb=['ListMetadataFormats'], identifier=[recordId])

    errors = xpath(body, '/oai:OAI-PMH/oai:error')
    self.assertEqual(0, len(errors), lxmltostring(body, pretty_print=True))

    formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
    self.assertEqual(1, len(formats), lxmltostring(body, pretty_print=True))

    prefixes = xpath(formats[0], 'oai:metadataPrefix/text()')
    self.assertEqual(['oai_dc'], prefixes)
def testNearRealtimeOaiSavesState(self):
    # Harvest once, stop both threads, add a new record and harvest again:
    # the second run must deliver only the new record ('id1'), not 'id0'.
    observer = CallTrace("observer", ignoredAttributes=["observer_init"], methods={'add': lambda **kwargs: (x for x in [])})
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    suspendRegister = SuspendRegister()
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 1)
    oaiPmhThread = None
    harvestThread = None

    def start():
        # 'global' makes start() and stop() share the thread objects through
        # module-level names; the two locals assigned above are not rebound.
        global oaiPmhThread, harvestThread
        self.run = True
        portNumber = randint(50000, 60000)
        oaiPmhThread = Thread(None, lambda: self.startOaiPmh(portNumber, oaiJazz, storageComponent, suspendRegister))
        harvestThread = Thread(None, lambda: self.startOaiHarvester(portNumber, observer))
        oaiPmhThread.start()
        harvestThread.start()

    def stop():
        global oaiPmhThread, harvestThread
        self.run = False
        oaiPmhThread.join()
        oaiPmhThread = None
        harvestThread.join()
        harvestThread = None

    start()
    # try/finally: a failing assertion must not leave the threads running.
    try:
        requests = 1
        sleepWheel(1.0 + 1.0 * requests)
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id0" in kwarg, kwarg)
    finally:
        stop()

    observer.calledMethods.reset()
    storageComponent.addData(identifier="id1", name="prefix", data=b"<a>a1</a>")
    oaiJazz.addOaiRecord(identifier="id1", metadataPrefixes=["prefix"])

    start()
    try:
        requests = 1
        sleepWheel(1.0 + 1.0 * requests)
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertFalse("id0" in kwarg, kwarg)
        self.assertTrue("id1" in kwarg, kwarg)
    finally:
        stop()
def assertWaterMarked(**oaiArgs):
    # Perform the OAI request described by oaiArgs and check that the first
    # comment directly under the OAI-PMH root is the Seecr watermark.
    # Uses 'self' from the enclosing scope.
    header, body = self._request(**oaiArgs)
    try:
        comment = xpath(body, "/oai:OAI-PMH/comment()")[0]
    except:
        # Debugging aid only: show the response, then re-raise unchanged.
        print(lxmltostring(body, pretty_print=True))
        raise
    self.assertEqual(" Watermarked by Seecr ", comment.text)
def testNoLxmlTailOnPart(self):
    # Parts passed on must serialize without the newlines that trail them
    # in the input document.
    inputEvent = fromstring("""<document><part name="partone"><some>message</some>\n\n\n\n</part><part name="parttwo"><second>message</second>\n\n\n\n</part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    partSpecs = [
        {'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'},
        {'partname': 'parttwo', 'xpath': '/document/part/second'},
    ]
    venturi = createVenturiHelix(partSpecs, [], interceptor)

    list(compose(venturi.all.add('identifier', 'document', inputEvent)))

    firstNode = interceptor.calledMethods[1].kwargs['lxmlNode']
    self.assertEqual('<some>message</some>', lxmltostring(firstNode))
    secondNode = interceptor.calledMethods[2].kwargs['lxmlNode']
    self.assertEqual('<second>message</second>', lxmltostring(secondNode))
def testNoLxmlTailOnPart(self):
    # Parts passed on must serialize without the newlines that trail them
    # in the input document.
    inputEvent = fromstring("""<document><part name="partone"><some>message</some>\n\n\n\n</part><part name="parttwo"><second>message</second>\n\n\n\n</part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}, {'partname': 'parttwo', 'xpath': '/document/part/second'}], [], interceptor)
    list(compose(v.all.add('identifier', 'document', inputEvent)))
    # calledMethods[0] is skipped; the two part deliveries are at [1] and [2].
    self.assertEqual('<some>message</some>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
    secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
    self.assertEqual('<second>message</second>', lxmltostring(secondXml))
def testFindUsingMultipleXPaths(self):
    # With several XPaths configured, every match of every path is passed
    # on; a path without matches contributes nothing.
    self.createXmlXPath(['/does/not/exist', '/a/b', '/a/b/c'], {})
    self.observable.do.test(data='<a><b><c>one</c></b><b><d>two</d></b></a>')
    self.assertEqual(3, len(self.observer.calledMethods))
    allResults = [method.kwargs['lxmlNode'] for method in self.observer.calledMethods]
    self.assertEqualsWS('<b><c>one</c></b>', lxmltostring(allResults[0]))
    self.assertEqualsWS('<b><d>two</d></b>', lxmltostring(allResults[1]))
    self.assertEqualsWS('<c>one</c>', lxmltostring(allResults[2]))
def testPartsWithUnicodeChars(self):
    # Non-ASCII text must survive part extraction, both in the serialized
    # form and in the root element's text.
    inputEvent = fromstring("""<document><part name="partone"><some>t€xt</some>\n\n\n\n</part><part name="parttwo"><second>t€xt</second>\n\n\n\n</part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}, {'partname': 'parttwo', 'xpath': '/document/part/second'}], [], interceptor)
    list(compose(v.all.add('identifier', 'document', inputEvent)))
    firstXml = interceptor.calledMethods[1].kwargs['lxmlNode']
    self.assertEqual('<some>t€xt</some>', lxmltostring(firstXml))
    self.assertEqual('t€xt', firstXml.getroot().text)
    secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
    self.assertEqual('<second>t€xt</second>', lxmltostring(secondXml))
    self.assertEqual('t€xt', secondXml.getroot().text)
def testPartsWithUnicodeChars(self):
    # The euro sign in both parts must come through unchanged, in the
    # serialization as well as in the root element's text.
    inputEvent = fromstring("""<document><part name="partone"><some>t€xt</some>\n\n\n\n</part><part name="parttwo"><second>t€xt</second>\n\n\n\n</part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    partSpecs = [
        {'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'},
        {'partname': 'parttwo', 'xpath': '/document/part/second'},
    ]
    venturi = createVenturiHelix(partSpecs, [], interceptor)

    list(compose(venturi.all.add('identifier', 'document', inputEvent)))

    someNode = interceptor.calledMethods[1].kwargs['lxmlNode']
    self.assertEqual('<some>t€xt</some>', lxmltostring(someNode))
    self.assertEqual('t€xt', someNode.getroot().text)

    secondNode = interceptor.calledMethods[2].kwargs['lxmlNode']
    self.assertEqual('<second>t€xt</second>', lxmltostring(secondNode))
    self.assertEqual('t€xt', secondNode.getroot().text)
def testSimpleXPath(self):
    # A single XPath match is passed on under 'lxmlNode', positional
    # arguments are kept, and the newline after </path> is not serialized.
    self.createXmlXPath(['/root/path'], {})
    xml = '<root><path><to>me</to></path>\n</root>'
    self.observable.do.test('een tekst', data=xml)
    self.assertEqual(1, len(self.observer.calledMethods))
    method = self.observer.calledMethods[0]
    self.assertEqual('test', method.name)
    self.assertEqual(1, len(method.args))
    self.assertEqual('een tekst', method.args[0])
    self.assertEqualsWS('<path><to>me</to></path>', lxmltostring(method.kwargs['lxmlNode']))
    # Exact comparison as well: no trailing whitespace may be present.
    self.assertEqual('<path><to>me</to></path>', lxmltostring(method.kwargs['lxmlNode']))
def testXPathWithMultipleResults(self):
    # One XPath matching two nodes must result in two calls, in document
    # order.
    self.createXmlXPath(['/root/element/data'], {})
    self.observable.do.aMethod(data="""<root> <element> <data>one</data> </element> <element> <data>two</data> </element> </root>""")
    self.assertEqual(2, len(self.observer.calledMethods))
    self.assertEqualsWS('<data>one</data>', lxmltostring(self.observer.calledMethods[0].kwargs['lxmlNode']))
    self.assertEqualsWS('<data>two</data>', lxmltostring(self.observer.calledMethods[1].kwargs['lxmlNode']))
def testHandleWithTwoRecords(self):
    # A ListRecords response with two records and a resumption token must
    # lead to one batch in which each record is handed to 'add' with its
    # own identifier, datestamp and lxmlNode.
    observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
    oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
    oaiDownloadProcessor.addObserver(observer)
    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
    list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN))))))
    self.assertEqual(['startOaiBatch', 'add', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
    # Slice [1:3] picks the two 'add' calls between start and stop.
    addMethod0, addMethod1 = observer.calledMethods[1:3]
    self.assertEqual(0, len(addMethod0.args))
    self.assertEqualsWS(ONE_RECORD, lxmltostring(addMethod0.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', addMethod0.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', addMethod0.kwargs['identifier'])
    self.assertEqualsWS(secondRecord, lxmltostring(addMethod1.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:41:00Z', addMethod1.kwargs['datestamp'])
    self.assertEqual('oai:identifier:2', addMethod1.kwargs['identifier'])
def testNearRealtimeOaiSavesState(self):
    # Harvest once, stop both threads, add a new record and harvest again:
    # the second run must deliver only the new record ('id1'), not 'id0'.
    observer = CallTrace("observer", ignoredAttributes=["observer_init"], methods={'add': lambda **kwargs: (x for x in [])})
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    suspendRegister = SuspendRegister()
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 1)
    oaiPmhThread = None
    harvestThread = None

    def start():
        # 'global' makes start() and stop() share the thread objects through
        # module-level names; the two locals assigned above are not rebound.
        global oaiPmhThread, harvestThread
        self.run = True
        portNumber = randint(50000, 60000)
        oaiPmhThread = Thread(None, lambda: self.startOaiPmh(portNumber, oaiJazz, storageComponent, suspendRegister))
        harvestThread = Thread(None, lambda: self.startOaiHarvester(portNumber, observer))
        oaiPmhThread.start()
        harvestThread.start()

    def stop():
        global oaiPmhThread, harvestThread
        self.run = False
        oaiPmhThread.join()
        oaiPmhThread = None
        harvestThread.join()
        harvestThread = None

    start()
    # try/finally: a failing assertion must not leave the threads running.
    try:
        requests = 1
        sleepWheel(1.0 + 1.0 * requests)
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id0" in kwarg, kwarg)
    finally:
        stop()

    observer.calledMethods.reset()
    storageComponent.addData(identifier="id1", name="prefix", data="<a>a1</a>")
    oaiJazz.addOaiRecord(identifier="id1", sets=[], metadataFormats=[("prefix", "", "")])

    start()
    try:
        requests = 1
        sleepWheel(1.0 + 1.0 * requests)
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertFalse("id0" in kwarg, kwarg)
        self.assertTrue("id1" in kwarg, kwarg)
    finally:
        stop()
def testOutline(self):
    # Overall flow: one 'begin' followed by one 'add' per configured part.
    # The part marked asString=True arrives as 'data' (a string), the other
    # one as 'lxmlNode'.
    inputEvent = fromstring("""<document><part name="partone"><some>message</some></part><part name="parttwo"><second>message</second></part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    v = createVenturiHelix([
        dict(partname='partone', xpath='/document/part[@name="partone"]/text()', asString=True),
        dict(partname='parttwo', xpath='/document/part/second')
    ], [], interceptor)
    list(compose(v.all.add('identifier', 'document', inputEvent)))
    self.assertEqual(['begin', 'add', 'add'], [m.name for m in interceptor.calledMethods])
    self.assertEqual('identifier', interceptor.calledMethods[1].kwargs['identifier'])
    self.assertEqual('partone', interceptor.calledMethods[1].kwargs['partname'])
    self.assertEqual('<some>message</some>', interceptor.calledMethods[1].kwargs['data'])
    self.assertEqual('identifier', interceptor.calledMethods[2].kwargs['identifier'])
    self.assertEqual('parttwo', interceptor.calledMethods[2].kwargs['partname'])
    secondXml = interceptor.calledMethods[2].kwargs['lxmlNode']
    self.assertEqual('<second>message</second>', lxmltostring(secondXml))
    self.assertEqual('second', secondXml.getroot().tag)
def _processRecords(self, lxmlNode):
    """Pass every item of an OAI-PMH response to self._add, one by one.

    Generator. For each child of the verb element with the tag configured
    in VERB_TAGNAME for self._verb, it yields the result of self._add(...)
    followed by a bare yield. After the last item self._resumptionToken is
    set from the response.

    Raises IndexError when an item has no header child and is not itself a
    header. Any exception from self._add is logged, recorded in
    self._errorState and re-raised.
    """
    verbNode = xpathFirst(lxmlNode, "/oai:OAI-PMH/oai:%s" % self._verb)
    for item in verbNode.iterchildren(tag=VERB_TAGNAME[self._verb]):
        header = None
        for h in item.iterchildren():
            if h.tag == HEADER_TAG:
                header = h
                break
        else:
            # No header child found: the item itself must be the header.
            if item.tag != HEADER_TAG:
                raise IndexError("Invalid oai header")
            header = item
        # Reset per item so a header lacking one of these fields cannot
        # silently reuse the value of the previous item.
        identifier = None
        datestamp = None
        for child in header.iterchildren():
            if child.tag == IDENTIFIER_TAG:
                identifier = child.text
            elif child.tag == DATESTAMP_TAG:
                datestamp = child.text
        try:
            yield self._add(identifier=identifier, lxmlNode=ElementTree(item), datestamp=datestamp)
        except Exception as e:
            self._logError(format_exc())
            self._logError("While processing:")
            self._logError(lxmltostring(item))
            self._errorState = "ERROR while processing '%s': %s" % (identifier, str(e))
            raise
        yield # some room for others
    self._resumptionToken = xpathFirst(verbNode, "oai:resumptionToken/text()")
def testListRecords(self):
    # ListRecords for 'prefix2': checks the number of records, the content
    # and sets of the second record, and that deleted records carry set
    # information but no metadata.
    header, body = self._request(verb=['ListRecords'], metadataPrefix=['prefix2'])
    records = xpath(body, '/oai:OAI-PMH/oai:ListRecords/oai:record')
    self.assertEqual(10, len(records))
    self.assertEqual([self.prefix + 'record:id:11'], xpath(records[1], 'oai:header/oai:identifier/text()'))
    self.assertEqual(['record:id:11'], xpath(records[1], 'oai:metadata/oai_dc:dc/dc:subject/text()'), lxmltostring(records[1]))
    # sorted(): the order of setSpec elements is not part of the check.
    self.assertEqual(['hierarchical', 'setSpec10'], sorted(xpath(records[1], 'oai:header/oai:setSpec/text()')))
    deletedRecords = xpath(body, '/oai:OAI-PMH/oai:ListRecords/oai:record[oai:header/@status="deleted"]')
    self.assertEqual(2, len(deletedRecords))
    self.assertEqual([0, 0], [len(xpath(r, 'oai:metadata')) for r in deletedRecords])
    self.assertEqual(['hierarchical', 'setSpec10'], sorted(xpath(deletedRecords[0], 'oai:header/oai:setSpec/text()')))
def testReadFromStorage(self):
    # The input document has no 'partone'; the part passed on must then be
    # the one returned by the storage's getData.
    inputEvent = fromstring('<document/>')
    interceptor = CallTrace('Interceptor', ignoredAttributes=['getData', 'all_unknown', 'any_unknown', 'call_unknown'])
    interceptor.methods['add'] = yieldNothing
    storage = CallTrace('Storage', ignoredAttributes=['add', 'all_unknown'])
    storage.returnValues['getData'] = '<some>this is partone</some>'
    v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}], [], interceptor, storage)
    list(compose(v.all.add('identifier', 'document', inputEvent)))
    self.assertEqual(['begin', 'add'], [m.name for m in interceptor.calledMethods])
    self.assertEqual('<some>this is partone</some>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
    self.assertEqual(['begin', 'getData'], storage.calledMethodNames())
    self.assertEqual(dict(identifier='identifier', name='partone'), storage.calledMethods[1].kwargs)
def _processRecords(self, lxmlNode):
    """Pass every item of an OAI-PMH response to self._add, one by one.

    Generator. For each child of the verb element with the tag configured
    in VERB_TAGNAME for self._verb, it yields the result of self._add(...)
    followed by a bare yield.

    Raises IndexError when an item has no header child and is not itself a
    header. Any exception from self._add is logged, recorded in
    self._errorState and re-raised.
    """
    verbNode = xpathFirst(lxmlNode, "/oai:OAI-PMH/oai:%s" % self._verb)
    for item in verbNode.iterchildren(tag=VERB_TAGNAME[self._verb]):
        header = None
        for h in item.iterchildren():
            if h.tag == HEADER_TAG:
                header = h
                break
        else:
            # No header child found: the item itself must be the header.
            if item.tag != HEADER_TAG:
                raise IndexError("Invalid oai header")
            header = item
        # Reset per item so a header lacking one of these fields cannot
        # silently reuse the value of the previous item.
        identifier = None
        datestamp = None
        for child in header.iterchildren():
            if child.tag == IDENTIFIER_TAG:
                identifier = child.text
            elif child.tag == DATESTAMP_TAG:
                datestamp = child.text
        try:
            yield self._add(identifier=identifier, lxmlNode=ElementTree(item), datestamp=datestamp)
        except Exception as e:
            self._logError(format_exc())
            self._logError("While processing:")
            self._logError(lxmltostring(item))
            self._errorState = "ERROR while processing '%s': %s" % (identifier, str(e))
            raise
        yield # some room for others
def testOnlyPassPartsSpecified(self):
    # Only 'partone' is configured, so 'parttwo' from the input must not be
    # passed on.
    inputEvent = fromstring("""<document><part name="partone"><some>message</some></part><part name="parttwo"><second/></part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    partSpecs = [{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}]
    venturi = createVenturiHelix(partSpecs, [], interceptor)

    list(compose(venturi.all.add('identifier', 'document', inputEvent)))

    calledNames = [m.name for m in interceptor.calledMethods]
    self.assertEqual(['begin', 'add'], calledNames)
    passedNode = interceptor.calledMethods[1].kwargs['lxmlNode']
    self.assertEqual('<some>message</some>', lxmltostring(passedNode))
def testTwoTags(self):
    # Feeding the parser in three chunks must still give one complete
    # <aap> element on the target.
    target = Target('aap')
    p = XMLParser(target=target)
    p.feed("<aap>")
    p.feed("noot")
    p.feed("</aap>")
    self.assertEqual("<aap>noot</aap>", lxmltostring(target.root))
def testTestWithCondition(self):
    # An XPath with a predicate excluding <b> and <c> must pass on only <d>.
    self.createXmlXPath(['/a/*[not(self::b) and not(self::c)]'], {})
    self.observable.do.test(data='<a><b>zero</b><c>one</c><d>two</d></a>')
    self.assertEqual(1, len(self.observer.calledMethods))
    self.assertEqualsWS('<d>two</d>', lxmltostring(self.observer.calledMethods[0].kwargs['lxmlNode']))
def testOnlyPassPartsSpecified(self):
    # Only 'partone' is configured, so 'parttwo' from the input must not be
    # passed on.
    inputEvent = fromstring("""<document><part name="partone"><some>message</some></part><part name="parttwo"><second/></part></document>""")
    interceptor = CallTrace('Interceptor', methods={'add': yieldNothing})
    v = createVenturiHelix([{'partname': 'partone', 'xpath': '/document/part[@name="partone"]/text()'}], [], interceptor)
    list(compose(v.all.add('identifier', 'document', inputEvent)))
    self.assertEqual(['begin', 'add'], [m.name for m in interceptor.calledMethods])
    self.assertEqual('<some>message</some>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
def testListMetadataFormatsForWrongIdentifier(self):
    # An unknown identifier must give the OAI error code 'idDoesNotExist'.
    header, body = self._request(verb=['ListMetadataFormats'], identifier=['does:not:exist'])
    # The pretty-printed response doubles as the failure message.
    self.assertEqual(['idDoesNotExist'], xpath(body, '/oai:OAI-PMH/oai:error/@code'), lxmltostring(body, pretty_print=True))
def testTwoTags(self):
    # The document arrives in three chunks; the target must end up with the
    # complete <aap> element.
    target = Target('aap')
    parser = XMLParser(target=target)
    for chunk in ("<aap>", "noot", "</aap>"):
        parser.feed(chunk)
    serialized = lxmltostring(target.root)
    self.assertEqual("<aap>noot</aap>", serialized)
def testCouldHave(self):
    # A part given in the second list of part specifications is passed on
    # when it is present in the input.
    inputEvent = fromstring('<document><one/></document>')
    ignored = ['getData', 'all_unknown', 'any_unknown', 'call_unknown']
    interceptor = CallTrace('Interceptor', ignoredAttributes=ignored, methods={'add': yieldNothing})
    optionalSpecs = [{'partname': 'one', 'xpath': '/document/one'}]
    venturi = createVenturiHelix([], optionalSpecs, interceptor)

    list(compose(venturi.all.add('identifier', 'document', inputEvent)))

    calledNames = [m.name for m in interceptor.calledMethods]
    self.assertEqual(['begin', 'add'], calledNames)
    passedNode = interceptor.calledMethods[1].kwargs['lxmlNode']
    self.assertEqual('<one/>', lxmltostring(passedNode))
def testCouldHave(self):
    # A part given in the second list of part specifications is passed on
    # when it is present in the input.
    inputEvent = fromstring('<document><one/></document>')
    interceptor = CallTrace('Interceptor', ignoredAttributes=['getData', 'all_unknown', 'any_unknown', 'call_unknown'], methods={'add': yieldNothing})
    v = createVenturiHelix([], [{'partname': 'one', 'xpath': '/document/one'}], interceptor)
    list(compose(v.all.add('identifier', 'document', inputEvent)))
    self.assertEqual(['begin', 'add'], [m.name for m in interceptor.calledMethods])
    self.assertEqual('<one/>', lxmltostring(interceptor.calledMethods[1].kwargs['lxmlNode']))
def _combine(self, erfgeoEnrichment, summary):
    """Yield one rdf:RDF document holding the children of both inputs.

    Generator of string fragments: an opening rdf:RDF tag, then every child
    of /rdf:RDF from `summary` followed by those from `erfgeoEnrichment`
    (each serialized with lxmltostring), then the closing tag.
    """
    yield '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
    for source in (summary, erfgeoEnrichment):
        parsed = XML(source)
        for node in xpath(parsed, '/rdf:RDF/*'):
            yield lxmltostring(node)
    yield '</rdf:RDF>'
def dumpOai(port, path, oaiDumpDir, metadataPrefix, set_=None, host=None, limit=None, append=False):
    """Harvest an OAI-PMH endpoint and dump the result into a directory.

    port, path, host: where to harvest; host defaults to '127.0.0.1'.
    oaiDumpDir: target directory. Unless `append` is true it is removed (if
        present) and created anew.
    metadataPrefix, set_: passed on to iterateOaiPmh.
    limit: maximum number of items to process (None for all).

    For each item a line 'ADD|DEL <filename> |<sorted setSpecs>|' is
    appended to 'oai.ids'; for non-deleted items the pretty-printed
    metadata is written to its own file.
    """
    host = host or '127.0.0.1'
    baseurl = 'http://%s:%s%s' % (host, port, path)
    if not append:
        if isdir(oaiDumpDir):
            rmtree(oaiDumpDir)
        makedirs(oaiDumpDir)
    with open(join(oaiDumpDir, 'oai.ids'), 'a') as ids:
        items = iterateOaiPmh(baseurl=baseurl, metadataPrefix=metadataPrefix, set=set_)
        for oaiItem in islice(items, limit):
            filename = '%s.%s' % (oaiItem.identifier, metadataPrefix)
            ids.write('%s %s |%s|\n' % ('DEL' if oaiItem.deleted else 'ADD', filename, '|'.join(sorted(oaiItem.setSpecs))))
            if not oaiItem.deleted:
                # 'with' closes the file right away instead of leaving that
                # to garbage collection.
                with open(join(oaiDumpDir, escapeFilename(filename)), 'w') as metadataFile:
                    metadataFile.write(lxmltostring(oaiItem.metadata, pretty_print=True))
    print("Oai dump created in %s" % oaiDumpDir)
def testAddInitialRecord(self):
    # Adding one rdf:RDF document must store it under the uri of its
    # rdf:Description, register it with prefix 'rdf' and no sets, and keep
    # the identifier-to-uri relation after the store is reopened.
    uri = "some:uri"
    rdfDescription = """<rdf:Description rdf:about="%s" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://www.openarchives.org/OAI/2.0/"> <dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" xml:lang="en">title</dc:title> <prov:wasDerivedFrom xmlns:prov="http://www.w3.org/ns/prov#"> <prov:Entity> <dcterms:source rdf:resource="http://first.example.org"/> </prov:Entity> </prov:wasDerivedFrom> </rdf:Description>""" % uri
    lxmlNode = parse(StringIO("""<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> %s </rdf:RDF>""" % rdfDescription))
    consume(self.dna.all.add(identifier="identifier", lxmlNode=lxmlNode))

    record = self.oaiJazz.getRecord(identifier=uri)
    # The expectation is built from the input itself, with unused namespace
    # declarations cleaned up.
    expected = XML(lxmltostring(xpathFirst(lxmlNode, '//rdf:RDF')))
    cleanup_namespaces(expected)
    self.assertXmlEquals(expected, self.storage.getData(identifier=record.identifier, name='rdf'))
    self.assertEqual(set(['rdf']), record.prefixes)
    self.assertEqual(set(), record.sets)

    # Close and create a fresh instance to check what was persisted.
    self.plein.close()
    plein2 = self._newPlein()
    self.assertEqual(['some:uri'], [fragment.uri for fragment in plein2._fragmentsForRecord('identifier')])
def handleRequest(self, Body='', **kwargs):
    """Handle one update request and dump it to a numbered file.

    Generator; first yields the HTTP status line and headers. The request
    body is parsed, its record identifier and action are read, and the
    request is written pretty-printed to
    '<number>_<last action segment>.updateRequest' in self._dumpdir.

    A 'fail' answer with a diagnostic is prepared when InvalidDataException
    is raised, which happens for every replace action while
    self._allInvalid is set. A plain Exception is raised for record ids
    listed in self._raiseExceptionOnIds.
    """
    yield '\r\n'.join(['HTTP/1.0 200 Ok', 'Content-Type: text/xml; charset=utf-8\r\n', ''])
    try:
        updateRequest = XML(Body)
        recordId = xpathFirst(updateRequest, 'ucp:recordIdentifier/text()')
        action = xpathFirst(updateRequest, 'ucp:action/text()')
        if self._allInvalid and action == "info:srw/action/1/replace":
            # One specific record is rejected without a message.
            if 'oai:record:02' in recordId:
                raise InvalidDataException()
            raise InvalidDataException('Invalid data')
        if recordId in self._raiseExceptionOnIds:
            raise Exception("ERROR")
        self._number += 1
        filename = '%05d_%s.updateRequest' % (self._number, action.rsplit('/')[-1])
        with open(join(self._dumpdir, filename), 'w') as f:
            stdout.flush()
            f.write(lxmltostring(updateRequest, pretty_print=True))
        answer = RESPONSE_XML % {
            "operationStatus": "success",
            "diagnostics": ""}
    except InvalidDataException as e:
        answer = RESPONSE_XML % {
            "operationStatus": "fail",
            "diagnostics": DIAGNOSTIC_XML % {
                'uri': 'info:srw/diagnostic/12/12',
                'details': escapeXml(str(e)),
                'message': 'Invalid data: record rejected'}}
    # NOTE(review): 'answer' is not emitted in the part of this handler
    # visible here - confirm the remainder of the method in the full file.
def _log(self, message, *args, **kwargs):
    """Write one line describing a message call to stdout and flush.

    The line has the form '[<observable name>] <message>(*args, **kwargs)'.
    Keyword values that are ElementTreeType instances are shown as
    '<ClassName>(<serialized xml>)' rather than by their default repr.
    The caller's kwargs are not modified.
    """
    printKwargs = dict(kwargs)
    for key, value in kwargs.items():
        # isinstance also covers subclasses of ElementTreeType.
        if isinstance(value, ElementTreeType):
            printKwargs[key] = "%s(%s)" % (value.__class__.__name__, lxmltostring(value))
    sys.stdout.write("[%s] %s(*%s, **%s)\n" % (self.observable_name(), message, args, printKwargs))
    sys.stdout.flush()
def testNearRealtimeOai(self):
    # Runs an OAI-PMH server and a harvester in separate threads: the three
    # initial records are harvested, the harvester then shows up in the
    # suspend register, and a record added afterwards is delivered as well.
    self.run = True
    portNumber = randint(50000, 60000)
    suspendRegister = SuspendRegister()
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 3)

    def serveOaiPmh():
        return self.startOaiPmh(portNumber, oaiJazz, storageComponent, suspendRegister)
    oaiPmhThread = Thread(None, serveOaiPmh)

    observer = CallTrace(
        "observer",
        ignoredAttributes=["observer_init"],
        methods={'add': lambda **kwargs: (x for x in [])})

    def harvest():
        return self.startOaiHarvester(portNumber, observer)
    harvestThread = Thread(None, harvest)

    oaiPmhThread.start()
    harvestThread.start()
    try:
        requests = 3
        sleepWheel(1.0 + 1.0 * requests)
        expectedCalls = [
            'startOaiBatch', 'add', 'add', 'stopOaiBatch',
            'startOaiBatch', 'add', 'stopOaiBatch',
        ]
        self.assertEqual(expectedCalls, [m.name for m in observer.calledMethods])
        ids = []
        for m in observer.calledMethods:
            if m.name == 'add':
                ids.append(xpath(m.kwargs['lxmlNode'], '//oai:header/oai:identifier/text()'))
        self.assertEqual([['id0'], ['id1'], ['id2']], ids)
        self.assertEqual(1, len(suspendRegister))

        observer.calledMethods.reset()
        requests += 1
        storageComponent.addData(identifier="id3", name="prefix", data=b"<a>a3</a>")
        oaiJazz.addOaiRecord(identifier="id3", metadataPrefixes=["prefix"])
        sleepWheel(1)
        self.assertEqual(0, len(suspendRegister))
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id3" in kwarg, kwarg)

        sleepWheel(1.0)
        self.assertEqual(1, len(suspendRegister))
    finally:
        # Always stop both threads and close the store, also on failure.
        self.run = False
        oaiPmhThread.join()
        harvestThread.join()
        oaiJazz.close()
def assertWaterMarked(**oaiArgs):
    # Issue the request given by oaiArgs and verify that the first comment
    # under the OAI-PMH root is the Seecr watermark. Uses 'self' from the
    # enclosing scope.
    header, body = self._request(**oaiArgs)
    try:
        comments = xpath(body, "/oai:OAI-PMH/comment()")
        comment = comments[0]
    except:
        # Show the whole response for diagnosis before re-raising.
        print(lxmltostring(body, pretty_print=True))
        raise
    self.assertEqual(" Watermarked by Seecr ", comment.text)
def testFilterTag(self):
    # A <mies> nested inside another <mies> must be kept as part of the
    # outer element on the target.
    target = Target('mies')
    parser = XMLParser(target=target)
    for chunk in ("<mies><mies>", "noot", "</mies>", "</mies>"):
        parser.feed(chunk)
    serialized = lxmltostring(target.root)
    self.assertEqual("<mies><mies>noot</mies></mies>", serialized)
def testLxmltostring(self):
    # Compares lxmltostring with lxml's own tostring on nodes that hold a
    # non-ASCII character in both an attribute value and element text.
    from lxml.etree import tostring
    uri = "Baháma's"
    xml = """<root><sub><subsub attribute="%s">%s</subsub></sub></root>""" % (uri, uri)
    lxmlNode = parse(StringIO(xml))
    subnode = lxmlNode.xpath("sub")[0]
    # lxmltostring returns text here; it is encoded to compare against the
    # expected UTF-8 bytes.
    self.assertEqual(b"""<sub><subsub attribute="Bah\xc3\xa1ma's">Bah\xc3\xa1ma's</subsub></sub>""", lxmltostring(subnode).encode('utf-8'))
    subsubnode = lxmlNode.xpath("sub/subsub")[0]
    # NOTE(review): the bytes literal below contains a raw non-ASCII
    # character in the attribute value, which Python 3 rejects in bytes
    # literals. Presumably this was an escaped form (e.g. a character
    # reference) that got decoded somewhere along the way - restore it from
    # the original file before relying on this assertion.
    self.assertEqual(b"""<subsub attribute="Baháma's">Bah\xc3\xa1ma's</subsub>""", tostring(subsubnode, encoding='UTF-8'))
    self.assertEqual(b"""<subsub attribute="Bah\xc3\xa1ma's">Bah\xc3\xa1ma's</subsub>""", lxmltostring(subsubnode).encode('utf-8'))
def testFileParseLxml(self):
    # FileParseLxml must turn the file-like object given as 'filedata' into
    # a parsed tree under 'lxmlNode', for an in-memory stream as well as
    # for a file on disk.
    observable = Observable()
    observer = CallTrace('observer')
    fileParser = FileParseLxml(fromKwarg='filedata', toKwarg='lxmlNode')
    observable.addObserver(fileParser)
    fileParser.addObserver(observer)

    inMemory = StringIO('<a>aaa</a>')
    observable.do.someMessage(filedata=inMemory)
    firstTree = observer.calledMethods[0].kwargs['lxmlNode']
    self.assertEqual('<a>aaa</a>', lxmltostring(firstTree))

    with open(self.tempfile, 'w') as out:
        out.write('<b>bbb</b>')
    with open(self.tempfile) as onDisk:
        observable.do.someMessage(filedata=onDisk)
        secondTree = observer.calledMethods[1].kwargs['lxmlNode']
        self.assertEqual('<b>bbb</b>', lxmltostring(secondTree))
def testFilterTag(self):
    # A <mies> nested inside another <mies> must be kept as part of the
    # outer element on the target.
    target = Target('mies')
    p = XMLParser(target=target)
    p.feed("<mies><mies>")
    p.feed("noot")
    p.feed("</mies>")
    p.feed("</mies>")
    self.assertEqual("<mies><mies>noot</mies></mies>", lxmltostring(target.root))
def testBugListRecordsReturnsDoubleValueOnNoRecordsMatch(self):
    # Regression check: a ListRecords request that matches nothing must
    # report 'noRecordsMatch' exactly once.
    header, body = self._request(verb=['ListRecords'], metadataPrefix=['oai_dc'], from_=['9999-01-01'])
    # The pretty-printed response doubles as the failure message.
    self.assertEqual(['noRecordsMatch'], xpath(body, '/oai:OAI-PMH/oai:error/@code'), lxmltostring(body, pretty_print=True))
def testListAllMetadataFormats(self):
    """ListMetadataFormats without an identifier reports both formats with prefix, schema and namespace."""
    _header, body = self._request(verb=['ListMetadataFormats'])
    errors = xpath(body, '/oai:OAI-PMH/oai:error')
    self.assertEqual(0, len(errors))

    formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
    self.assertEqual(2, len(formats), lxmltostring(body, pretty_print=True))

    # One list per property, in the order the formats appear in the response.
    prefixes = [xpath(f, 'oai:metadataPrefix/text()')[0] for f in formats]
    schemas = [xpath(f, 'oai:schema/text()')[0] for f in formats]
    metadataNamespaces = [xpath(f, 'oai:metadataNamespace/text()')[0] for f in formats]

    self.assertEqual(['oai_dc', 'prefix2'], prefixes)
    self.assertEqual(
        ['http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
         'http://example.org/prefix2/?format=xsd&prefix=2'],
        schemas)
    self.assertEqual(
        ['http://www.openarchives.org/OAI/2.0/oai_dc/',
         'http://example.org/prefix2/'],
        metadataNamespaces)
def testPlainXml(self):
    """httpGet returns the raw header block and the response body for a text/xml response."""
    client = HttpClient()
    # presumably served by a test fixture that replays self.response -- verify against setUp
    self.response = """HTTP/1.0 200 OK\r\nContent-Type: text/xml\r\n\r\n<xml/>"""
    gen = client.httpGet(hostname='localhost', port=80, path='/', arguments={})
    headers, body = retval(gen)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual('<xml/>', lxmltostring(body))
    self.assertEqual(['HTTP/1.0 200 OK', 'Content-Type: text/xml'], headers.split(CRLF))
def testListAllMetadataFormats(self):
    """ListMetadataFormats without an identifier reports both formats with prefix, schema and namespace."""
    header, body = self._request(verb=['ListMetadataFormats'])
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(0, len(xpath(body, '/oai:OAI-PMH/oai:error')))
    formats = xpath(body, '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
    # The pretty-printed body is the failure message, to ease debugging.
    self.assertEqual(2, len(formats), lxmltostring(body, pretty_print=True))
    self.assertEqual(
        ['oai_dc', 'prefix2'],
        [xpath(f, 'oai:metadataPrefix/text()')[0] for f in formats])
    self.assertEqual(
        ['http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
         'http://example.org/prefix2/?format=xsd&prefix=2'],
        [xpath(f, 'oai:schema/text()')[0] for f in formats])
    self.assertEqual(
        ['http://www.openarchives.org/OAI/2.0/oai_dc/',
         'http://example.org/prefix2/'],
        [xpath(f, 'oai:metadataNamespace/text()')[0] for f in formats])
def testNamespaces(self):
    """The node selected by a namespaced XPath carries the same nsmap as its re-parsed serialization."""
    xmlXPath = XmlXPath(
        ['/a:aNode/b:bNode'],
        fromKwarg='lxmlNode',
        namespaces={'a': 'aNamespace', 'b': 'bNamespace'})
    lxmlNode = parse(StringIO(
        '<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'))
    observer = CallTrace('Observer')
    observable = Observable()
    observable.addObserver(xmlXPath)
    xmlXPath.addObserver(observer)

    observable.do.message(lxmlNode=lxmlNode)

    message = observer.calledMethods[0]
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual('message', message.name)
    newNode = message.kwargs['lxmlNode']
    self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(newNode))

    # The extracted node must not keep namespace declarations that would
    # disappear after a serialize/parse round trip.
    newNamespaces = newNode.getroot().nsmap
    nameSpacesAfterParsing = parse(StringIO(lxmltostring(newNode))).getroot().nsmap
    self.assertEqual(nameSpacesAfterParsing, newNamespaces)
def testFileParseLxml(self):
    """Send an in-memory and an on-disk file through FileParseLxml.

    Each message carries a file-like object as ``filedata``; the observer is
    expected to receive the parsed result as ``lxmlNode``.
    """
    observable = Observable()
    observer = CallTrace('observer')
    p = FileParseLxml(fromKwarg='filedata', toKwarg='lxmlNode')
    observable.addObserver(p)
    p.addObserver(observer)

    a = StringIO('<a>aaa</a>')
    # Context managers guarantee both handles are closed even when an
    # assertion fails; the read handle used to be left open.
    with open(self.tempfile, 'w') as f:
        f.write('<b>bbb</b>')

    observable.do.someMessage(filedata=a)
    lxmlA = observer.calledMethods[0].kwargs['lxmlNode']
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual('<a>aaa</a>', lxmltostring(lxmlA))

    with open(self.tempfile) as b:
        observable.do.someMessage(filedata=b)
        lxmlB = observer.calledMethods[1].kwargs['lxmlNode']
        self.assertEqual('<b>bbb</b>', lxmltostring(lxmlB))
def testSendWithAbout(self):
    """Sending an upload with an ``about`` part writes a record file containing that element."""
    ABOUT = '<about xmlns="%(oai)s">abouttext</about>' % namespaces
    recordFile = self.tempdir + '/group/repo/id.record'
    # Pin the target filename so the test knows where the record ends up.
    self.uploader._filenameFor = lambda *args: recordFile
    upload = createUpload(about=ABOUT)

    self.uploader.send(upload)

    self.assertTrue(isfile(recordFile))
    # Parse inside a context manager so the file handle is closed
    # deterministically; it was previously leaked.
    with open(recordFile) as f:
        aboutNode = xpathFirst(parse(f), '//oai:about')
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(ABOUT, lxmltostring(aboutNode))
def testElementInKwargs(self):
    """The XPath result is forwarded as ``lxmlNode``; other args and kwargs pass through."""
    self.createXmlXPath(['/root/path'], {})

    self.observable.do.aMethod(
        'otherArgument',
        data='<root><path><to>me</to></path></root>',
        otherKeyword='okay')

    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(1, len(self.observer.calledMethods))
    method = self.observer.calledMethods[0]
    self.assertEqual('aMethod', method.name)
    self.assertEqual(1, len(method.args))
    # 'data' is expected to be gone from the kwargs, with 'lxmlNode' in its place.
    self.assertEqual({'otherKeyword', 'lxmlNode'}, set(method.kwargs))
    self.assertEqualsWS('<path><to>me</to></path>', lxmltostring(method.kwargs['lxmlNode']))
def assertValidOai(lxmlTree=None, aXmlString=None):
    """Validate XML against the schema returned by ``getSchema()``.

    Either an lxml tree or an XML string may be supplied; when
    ``aXmlString`` is given it takes precedence, otherwise ``lxmlTree`` is
    serialized with ``lxmltostring``.

    Returns the tree parsed from the (possibly serialized) string.
    Raises AssertionError carrying the schema's last error when validation
    fails; the input is printed with line numbers first.
    """
    schema = getSchema()
    if aXmlString is None:
        aXmlString = lxmltostring(lxmlTree, pretty_print=True)
    tree = parse(BytesIO(aXmlString.encode()))
    schema.validate(tree)
    if schema.error_log:
        # Print the numbered source so the reported error can be located.
        for nr, line in enumerate(aXmlString.split('\n'), 1):
            print(nr, line)
        raise AssertionError(schema.error_log.last_error)
    return tree
def delete(self, anUpload):
    """Process a delete for ``anUpload`` on the filesystem target.

    With ``oaiEnvelope`` set on the target, the output built by
    ``_createOutput`` is written to the record's file. Otherwise the record's
    file is removed when present and the escaped id is appended to the
    target's ``deleted_records`` file. The delete is logged afterwards.
    """
    filename = self._filenameFor(anUpload)
    if self._target.oaiEnvelope:
        xmlResult = self._createOutput(anUpload)
        with open(filename, 'w') as fd:
            fd.write(lxmltostring(xmlResult))
    else:
        if os.path.isfile(filename):
            os.remove(filename)
        deletedRecordsPath = os.path.join(self._target.path, 'deleted_records')
        with open(deletedRecordsPath, 'a') as f:
            f.write('%s\n' % escapeFilename(anUpload.id))
    self._logDelete(anUpload.id)