def testIncrementalHarvestReScheduleIfNoRecordsMatch(self):
    # Handling NO_RECORDS_MATCH_RESPONSE after a regular ListRecords response
    # must leave _errorState at None and still move _from forward to
    # '2012-06-01T19:20:30Z'.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        incrementalHarvestSchedule=Schedule(period=0),
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    # First a regular response: _from becomes '2002-06-01T19:20:30Z'.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)

    # Then the no-records-match response: no error state, _from updated.
    consume(oaiDownloadProcessor.handle(parse(StringIO(NO_RECORDS_MATCH_RESPONSE))))
    self.assertEqual(None, oaiDownloadProcessor._errorState)
    self.assertEqual('2012-06-01T19:20:30Z', oaiDownloadProcessor._from)
def testHandleYieldsAtLeastOnceAfterEachRecord(self):
    # The number of values yielded by handle() must follow the number of
    # records: 1 for the template filled with '', 2 once an extra record is
    # inserted.
    def add(**kwargs):
        # The unreachable 'yield' makes this a generator function that
        # finishes without yielding anything itself.
        return
        yield
    observer = CallTrace(methods={'add': add})
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False)
    oaiDownloadProcessor.addObserver(observer)

    yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % '')))))
    self.assertEqual(1, len(yields))

    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
    yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % secondRecord)))))
    self.assertEqual(2, len(yields))
def testSetIncrementalHarvestSchedule(self):
    # A schedule installed through setIncrementalHarvestSchedule() is used
    # after the next handled response: with the clock stubbed at 10 and
    # period=3 the next request time becomes 13.
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None)
    oaiDownloadProcessor._time = lambda: 10  # fixed clock

    oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
    # Setting the schedule alone leaves the next request time at 0.
    self.assertEqual(0, oaiDownloadProcessor._earliestNextRequestTime)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(13, oaiDownloadProcessor._earliestNextRequestTime)
def testIncrementalHarvestScheduleNoneOverruledWithSetIncrementalHarvestSchedule(self):
    # Starts without a schedule, finishes a harvest, and only then installs a
    # schedule; the new schedule takes effect after scheduleNextRequest() and
    # the next handled response.
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None)
    oaiDownloadProcessor._time = lambda: 10  # fixed clock

    # Without a schedule nothing is planned once the response is handled.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)

    # Installing a schedule afterwards does not by itself plan a request.
    oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)

    # An explicit scheduleNextRequest() makes a request available again.
    oaiDownloadProcessor.scheduleNextRequest()
    self.assertNotEqual(None, oaiDownloadProcessor.buildRequest())
    self.assertEqual(0, oaiDownloadProcessor._earliestNextRequestTime)

    # The next handled response is planned with the new schedule: 10 + 3.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(13, oaiDownloadProcessor._earliestNextRequestTime)
def testIncrementalHarvestScheduleSetToNone(self):
    # Removing the schedule with setIncrementalHarvestSchedule(schedule=None)
    # means no next request time is planned after the following response,
    # while the observer still sees the same sequence of calls.
    expectedCalls = ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone']
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        incrementalHarvestSchedule=Schedule(period=0),
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    # "% ''" fills the template's placeholder, as the other tests in this
    # file do, so no literal '%s' ends up in the parsed XML.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertNotEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(expectedCalls, observer.calledMethodNames())

    observer.calledMethods.reset()
    oaiDownloadProcessor.setFrom(from_=None)
    oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=None)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(expectedCalls, observer.calledMethodNames())
def testBuildRequestNoneWhenNoResumptionToken(self):
    # After a response without resumption token, buildRequest() returns None
    # for a processor created with the default schedule.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    # "% ''" fills the template's placeholder, as the other tests in this
    # file do, so no literal '%s' ends up in the parsed XML.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))

    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
def testSignalHarvestingDone(self):
    # A response without resumption token ends with a 'signalHarvestingDone'
    # call on the observer, after the batch calls.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path='/p',
        metadataPrefix='p',
        workingDirectory=self.tempdir,
        incrementalHarvestSchedule=None)
    oaiDownloadProcessor.addObserver(observer)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))

    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        observer.calledMethodNames())
def testRestartAfterFinish(self):
    # With restartAfterFinish=True a new request is available after a response
    # without resumption token, and it carries neither token nor 'from'.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        restartAfterFinish=True)
    oaiDownloadProcessor.addObserver(observer)

    # "% ''" fills the template's placeholder, as the other tests in this
    # file do, so no literal '%s' ends up in the parsed XML.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)

    request = oaiDownloadProcessor.buildRequest()
    expectedStart = 'GET /oai?verb=ListRecords&metadataPrefix=oai_dc HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: '
    self.assertTrue(request.startswith(expectedStart), request)
def testIncrementalHarvestScheduleNone(self):
    # With incrementalHarvestSchedule=None the response date is still stored
    # in _from, but no next request time is planned.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None)
    oaiDownloadProcessor.addObserver(observer)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))

    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
def testYieldSuspendFromAdd(self):
    # A Suspend object yielded by the observer's 'add' must be passed on by
    # handle(); the expected output is that Suspend followed by None.
    observer = CallTrace()
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False)
    oaiDownloadProcessor.addObserver(observer)

    suspend = Suspend()
    # 'add' returns a generator whose only item is the Suspend object.
    observer.returnValues['add'] = (x for x in [suspend])

    yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % '')))))
    self.assertEqual([suspend, None], yields)
def testRaiseErrorOnBadResponse(self):
    # A <record> without a header must make handle() raise IndexError.
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True)
    badRecord = '<record>No Header</record>'

    # assertRaises fails the test when nothing is raised and lets any other
    # exception type propagate, like the try/fail/except it replaces.
    self.assertRaises(
        IndexError,
        lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % badRecord))))))
def testUseResumptionToken(self):
    # The resumption token from a response is stored, url-encoded into the
    # next request, and picked up again by a new processor created on the
    # same working directory.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    self.assertEqual('x?y&z', oaiDownloadProcessor._resumptionToken)

    # '%%' is an escaped percent sign: the request holds 'x%3Fy%26z'.
    expectedRequest = 'GET /oai?verb=ListRecords&resumptionToken=x%%3Fy%%26z&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % oaiDownloadProcessor._identifier
    self.assertEqual(expectedRequest, oaiDownloadProcessor.buildRequest())

    # A second processor on the same working directory starts with the token.
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    self.assertEqual('x?y&z', oaiDownloadProcessor._resumptionToken)
def testResponseDateAsFrom(self):
    # After a handled response _from is '2002-06-01T19:20:30Z', and a new
    # processor created on the same working directory starts with that value.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)

    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
def testIncrementalHarvestWithFromWithDefaultScheduleMidnight(self):
    # No incrementalHarvestSchedule is passed, so the processor's default is
    # used. With every clock stubbed to 01:00, the next request is expected at
    # 24 * 60 * 60.0 seconds.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())

    # One hour in seconds. Written as a plain decimal literal: a leading-zero
    # literal such as '01' is a SyntaxError on Python 3.
    oaiDownloadProcessor._time = oaiDownloadProcessor._incrementalHarvestSchedule._time = lambda: 1 * 60 * 60
    oaiDownloadProcessor._incrementalHarvestSchedule._utcnow = lambda: datetime.strptime("01:00", "%H:%M")
    oaiDownloadProcessor.addObserver(observer)

    # "% ''" fills the template's placeholder, as the other tests in this
    # file do, so no literal '%s' ends up in the parsed XML.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))

    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual(24 * 60 * 60.0, oaiDownloadProcessor._earliestNextRequestTime)
def testShutdownPersistsStateOnAutocommit(self):
    # With autoCommit=False no 'harvester.state' file exists after handling a
    # response; handleShutdown() must write it.
    stateFile = join(self.tempdir, 'harvester.state')
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        autoCommit=False)
    oaiDownloadProcessor.addObserver(observer)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    state = oaiDownloadProcessor.getState()
    self.assertFalse(isfile(stateFile))

    oaiDownloadProcessor.handleShutdown()

    # Read through a context manager so the file handle is closed again.
    with open(stateFile) as f:
        persisted = load(f)
    self.assertEqual(
        {"errorState": None, 'from': '2002-06-01T19:20:30Z', "resumptionToken": state.resumptionToken},
        persisted)
def testListIdentifiersHandle(self):
    # With verb='ListIdentifiers' the observer's 'add' is called with keyword
    # arguments only: the header as lxmlNode, plus its datestamp and
    # identifier.
    observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        verb='ListIdentifiers')
    oaiDownloadProcessor.addObserver(observer)

    list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTIDENTIFIERS_RESPONSE)))))

    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        [m.name for m in observer.calledMethods])
    addMethod = observer.calledMethods[1]
    self.assertEqual(0, len(addMethod.args))
    self.assertEqualsWS(ONE_HEADER, lxmltostring(addMethod.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', addMethod.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', addMethod.kwargs['identifier'])
def testListRecordsRequestError(self):
    # An error response is written to the err stream without any observer
    # call, and the resumption token read from the state file is dropped: the
    # next request uses metadataPrefix instead of resumptionToken.
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    # Context manager guarantees the state file is flushed and closed before
    # the processor reads it.
    with open(join(self.tempdir, 'harvester.state'), 'w') as f:
        f.write("Resumptiontoken: %s\n" % resumptionToken)

    observer = CallTrace()
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    def expectedRequest(arguments):
        # Full request text for the given list of query arguments.
        return 'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (urlencode(arguments), oaiDownloadProcessor._identifier)

    self.assertEqual(
        expectedRequest([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]),
        oaiDownloadProcessor.buildRequest())

    consume(oaiDownloadProcessor.handle(parse(StringIO(ERROR_RESPONSE))))

    self.assertEqual(0, len(observer.calledMethods))
    self.assertEqual("someError: Some error occurred.\n", oaiDownloadProcessor._err.getvalue())
    self.assertEqual(
        expectedRequest([('verb', 'ListRecords'), ('metadataPrefix', 'oai_dc'), ('x-wait', 'True')]),
        oaiDownloadProcessor.buildRequest())
def testKeepResumptionTokenOnFailingAddCall(self):
    # When the observer's 'add' raises, the exception propagates, a traceback
    # naming the record is written to err, and the resumption token read from
    # the state file is kept for the next request.
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    # Context manager guarantees the state file is flushed and closed before
    # the processor reads it.
    with open(join(self.tempdir, 'harvester.state'), 'w') as f:
        f.write("Resumptiontoken: %s\n" % resumptionToken)

    observer = CallTrace()
    observer.exceptions = {'add': Exception("Could be anything")}
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    # The same request is expected before and after the failing response.
    expectedRequest = 'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (urlencode([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]), oaiDownloadProcessor._identifier)
    self.assertEqual(expectedRequest, oaiDownloadProcessor.buildRequest())

    self.assertRaises(
        Exception,
        lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))))

    # The batch is still closed with stopOaiBatch after the failing add.
    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch'],
        [m.name for m in observer.calledMethods])
    errorOutput = oaiDownloadProcessor._err.getvalue()
    self.assertTrue(errorOutput.startswith('Traceback'), errorOutput)
    self.assertTrue('Exception: Could be anything\nWhile processing:\n<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:1' in errorOutput, errorOutput)
    self.assertEqual(expectedRequest, oaiDownloadProcessor.buildRequest())
def testHandleWithTwoRecords(self):
    # A response with two records leads to two 'add' calls inside one batch,
    # each with its own lxmlNode, datestamp and identifier keyword arguments.
    observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True)
    oaiDownloadProcessor.addObserver(observer)
    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'

    list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN))))))

    self.assertEqual(
        ['startOaiBatch', 'add', 'add', 'stopOaiBatch'],
        [m.name for m in observer.calledMethods])
    addMethod0, addMethod1 = observer.calledMethods[1:3]

    # First record.
    self.assertEqual(0, len(addMethod0.args))
    self.assertEqualsWS(ONE_RECORD, lxmltostring(addMethod0.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', addMethod0.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', addMethod0.kwargs['identifier'])

    # Second record.
    self.assertEqualsWS(secondRecord, lxmltostring(addMethod1.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:41:00Z', addMethod1.kwargs['datestamp'])
    self.assertEqual('oai:identifier:2', addMethod1.kwargs['identifier'])
def testIncrementalHarvestWithFromAfterSomePeriod(self):
    # With Schedule(period=10) and the response handled at clock 1.0,
    # buildRequest() returns None at 1.0, 6.0 and 10.0 and gives a request
    # with a 'from' argument at 11.1.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=Schedule(period=10))
    oaiDownloadProcessor._time = lambda: 1.0
    oaiDownloadProcessor.addObserver(observer)

    # "% ''" fills the template's placeholder, as the other tests in this
    # file do, so no literal '%s' ends up in the parsed XML.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)

    # Too early: still at 1.0, then 6.0, then 10.0.
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
    oaiDownloadProcessor._time = lambda: 6.0
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
    oaiDownloadProcessor._time = lambda: 10.0
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())

    # At 11.1 the request is built.
    oaiDownloadProcessor._time = lambda: 11.1
    request = oaiDownloadProcessor.buildRequest()
    self.assertTrue(request.startswith('GET /oai?verb=ListRecords&from=2002-06-01T19%3A20%3A30Z&metadataPrefix=oai_dc'), request)
def testHarvesterStateWithError(self):
    # A failing 'add' is reflected in getState().errorState, the resumption
    # token from the state file is kept, and both are visible to a new
    # processor created on the same working directory.
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    expectedError = "ERROR while processing 'oai:identifier:1': Could be anything"
    # Context manager guarantees the state file is flushed and closed before
    # the processor reads it.
    with open(join(self.tempdir, 'harvester.state'), 'w') as f:
        f.write("Resumptiontoken: %s\n" % resumptionToken)

    observer = CallTrace()
    observer.exceptions = {'add': Exception("Could be anything")}
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
        name="Name")
    oaiDownloadProcessor.addObserver(observer)

    self.assertRaises(
        Exception,
        lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))))

    state = oaiDownloadProcessor.getState()
    self.assertEqual(resumptionToken, state.resumptionToken)
    self.assertEqual(None, state.from_)
    self.assertEqual(expectedError, state.errorState)
    self.assertEqual("Name", state.name)

    # A fresh processor on the same directory sees token and error state.
    oaiDownloadProcessor2 = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    state2 = oaiDownloadProcessor2.getState()
    self.assertEqual(resumptionToken, state2.resumptionToken)
    self.assertEqual(expectedError, state2.errorState)
def testScheduleNextRequest(self):
    # scheduleNextRequest() without argument resets the next request time to
    # 0; with a Schedule it is planned relative to the stubbed clock (17).
    oaiDownloadProcessor = OaiDownloadProcessor(
        path='/p',
        metadataPrefix='p',
        workingDirectory=self.tempdir)
    oaiDownloadProcessor._time = lambda: 17  # fixed clock

    # After a handled response the next request lies in the future.
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertTrue(oaiDownloadProcessor._earliestNextRequestTime > 17)

    # No argument: request immediately.
    oaiDownloadProcessor.scheduleNextRequest()
    self.assertEqual(0, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(True, oaiDownloadProcessor._timeForNextRequest())
    self.assertNotEqual(None, oaiDownloadProcessor.buildRequest())

    # period=0: planned at the current time, so still due.
    oaiDownloadProcessor.scheduleNextRequest(Schedule(period=0))
    self.assertEqual(17, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(True, oaiDownloadProcessor._timeForNextRequest())
    self.assertNotEqual(None, oaiDownloadProcessor.buildRequest())

    # period=120: planned at 17 + 120, so not due yet.
    oaiDownloadProcessor.scheduleNextRequest(Schedule(period=120))
    self.assertEqual(137, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(False, oaiDownloadProcessor._timeForNextRequest())
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
def testHarvesterState(self):
    # getState() exposes the processor's settings and harvest progress, keeps
    # following later changes on the processor, and a new processor on the
    # same working directory gets resumptionToken and from_ back.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    # State of a freshly created processor.
    state = oaiDownloadProcessor.getState()
    self.assertEqual(None, state.resumptionToken)
    self.assertEqual(None, state.from_)
    self.assertEqual(None, state.errorState)
    self.assertEqual(None, state.name)
    self.assertEqual("/oai", state.path)
    self.assertEqual("oai_dc", state.metadataPrefix)
    self.assertEqual(None, state.set)
    self.assertEqual(0, state.nextRequestTime)

    # State after changing the settings and handling one response.
    oaiDownloadProcessor.setSet('s')
    oaiDownloadProcessor.setPath('/p')
    oaiDownloadProcessor.setMetadataPrefix('pref')
    oaiDownloadProcessor.observable_setName('aName')
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    state = oaiDownloadProcessor.getState()
    self.assertEqual("x?y&z", state.resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', state.from_)
    self.assertEqual(None, state.errorState)
    self.assertEqual('aName', state.name)
    self.assertEqual("/p", state.path)
    self.assertEqual("pref", state.metadataPrefix)
    self.assertEqual('s', state.set)
    self.assertEqual(0, state.nextRequestTime)

    # Change state of oaiDownloadProcessor -> changes stateView.
    oaiDownloadProcessor.setSet('x')
    self.assertEqual('x', state.set)

    # A second processor on the same directory: name and metadataPrefix come
    # from its own constructor arguments, token and from_ match the first.
    oaiDownloadProcessor2 = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    state2 = oaiDownloadProcessor2.getState()
    self.assertEqual(None, state2.name)
    self.assertEqual("oai_dc", state2.metadataPrefix)
    self.assertEqual("x?y&z", state2.resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', state2.from_)
    self.assertEqual(None, state2.errorState)
    # NOTE(review): this checks 'state', not 'state2' - possibly meant to be
    # state2.nextRequestTime; confirm before changing.
    self.assertEqual(0, state.nextRequestTime)