def testIncrementalHarvestWithFromAfterSomePeriod(self):
    # With Schedule(period=10) and a final batch handled at t=1.0,
    # buildRequest() stays None at t=6.0 and t=10.0 and only produces an
    # incremental request (carrying the stored 'from') at t=11.1.
    # NOTE(review): LISTRECORDS_RESPONSE is fed in without the `% ''`
    # formatting other tests apply -- confirm this is intended.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=Schedule(period=10))
    processor._time = lambda: 1.0
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE))
    consume(processor.handle(response))
    self.assertEqual(None, processor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)
    self.assertEqual(None, processor.buildRequest())

    # Still inside the waiting period: no request may be built yet.
    for tooEarly in (6.0, 10.0):
        processor._time = lambda now=tooEarly: now
        self.assertEqual(None, processor.buildRequest())

    processor._time = lambda: 11.1
    request = processor.buildRequest()
    expectedStart = 'GET /oai?verb=ListRecords&from=2002-06-01T19%3A20%3A30Z&metadataPrefix=oai_dc'
    self.assertTrue(request.startswith(expectedStart), request)
def testIncrementalHarvestScheduleSetToNone(self):
    # First harvest with Schedule(period=0): a next request time is set.
    # After setFrom(None) and setIncrementalHarvestSchedule(None) a second
    # harvest leaves _earliestNextRequestTime at None.
    # NOTE(review): LISTRECORDS_RESPONSE is used without `% ''` here,
    # unlike several other tests -- confirm this is intended.
    expectedCalls = ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone']
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        incrementalHarvestSchedule=Schedule(period=0),
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    processor.addObserver(observer)

    def harvestOnce():
        consume(processor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))

    harvestOnce()
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)
    self.assertNotEqual(None, processor._earliestNextRequestTime)
    self.assertEqual(expectedCalls, observer.calledMethodNames())

    # Start from a clean call log before the second round.
    observer.calledMethods.reset()
    processor.setFrom(from_=None)
    processor.setIncrementalHarvestSchedule(schedule=None)

    harvestOnce()
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)
    self.assertEqual(None, processor._earliestNextRequestTime)
    self.assertEqual(expectedCalls, observer.calledMethodNames())
def testListRecordsRequestError(self):
    # A state file with a resumption token makes the first request resume
    # from that token. After handling ERROR_RESPONSE nothing is passed to
    # the observer, the error text lands on `err`, and the next request
    # starts over with only the metadataPrefix.
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    stateFile = join(self.tempdir, 'harvester.state')
    with open(stateFile, 'w') as f:
        f.write("Resumptiontoken: %s\n" % resumptionToken)

    observer = CallTrace()
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    processor.addObserver(observer)

    def expectedRequest(arguments):
        # Reads the identifier at call time, just before each comparison.
        return 'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (
            urlencode(arguments), processor._identifier)

    self.assertEqual(
        expectedRequest([
            ('verb', 'ListRecords'),
            ('resumptionToken', resumptionToken),
            ('x-wait', 'True'),
        ]),
        processor.buildRequest())

    consume(processor.handle(parse(StringIO(ERROR_RESPONSE))))
    self.assertEqual(0, len(observer.calledMethods))
    self.assertEqual("someError: Some error occurred.\n", processor._err.getvalue())

    self.assertEqual(
        expectedRequest([
            ('verb', 'ListRecords'),
            ('metadataPrefix', 'oai_dc'),
            ('x-wait', 'True'),
        ]),
        processor.buildRequest())
def testHandleWithTwoRecords(self):
    # A response holding two records plus a resumption token results in
    # two 'add' calls inside one batch, each receiving lxmlNode,
    # datestamp and identifier as keyword arguments.
    def add(**kwargs):
        return (item for item in [])

    observer = CallTrace(methods={'add': add})
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True)
    processor.addObserver(observer)

    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
    responseText = LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN)
    list(compose(processor.handle(parse(StringIO(responseText)))))

    calledNames = [call.name for call in observer.calledMethods]
    self.assertEqual(['startOaiBatch', 'add', 'add', 'stopOaiBatch'], calledNames)

    firstAdd, secondAdd = observer.calledMethods[1:3]
    self.assertEqual(0, len(firstAdd.args))
    self.assertEqualsWS(ONE_RECORD, lxmltostring(firstAdd.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', firstAdd.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', firstAdd.kwargs['identifier'])

    self.assertEqualsWS(secondRecord, lxmltostring(secondAdd.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:41:00Z', secondAdd.kwargs['datestamp'])
    self.assertEqual('oai:identifier:2', secondAdd.kwargs['identifier'])
def testHarvesterState(self):
    # Checks the state view returned by getState():
    #  1. initial values of a fresh processor,
    #  2. values after changing settings and handling a response that
    #     carries a resumption token,
    #  3. that the view follows later changes on the processor,
    #  4. what a second processor on the same working directory reports.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    oaiDownloadProcessor.addObserver(observer)

    state = oaiDownloadProcessor.getState()
    self.assertEqual(None, state.resumptionToken)
    self.assertEqual(None, state.from_)
    self.assertEqual(None, state.errorState)
    self.assertEqual(None, state.name)
    self.assertEqual("/oai", state.path)
    self.assertEqual("oai_dc", state.metadataPrefix)
    self.assertEqual(None, state.set)
    self.assertEqual(0, state.nextRequestTime)

    oaiDownloadProcessor.setSet('s')
    oaiDownloadProcessor.setPath('/p')
    oaiDownloadProcessor.setMetadataPrefix('pref')
    oaiDownloadProcessor.observable_setName('aName')
    consume(
        oaiDownloadProcessor.handle(
            parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))

    state = oaiDownloadProcessor.getState()
    self.assertEqual("x?y&z", state.resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', state.from_)
    self.assertEqual(None, state.errorState)
    self.assertEqual('aName', state.name)
    self.assertEqual("/p", state.path)
    self.assertEqual("pref", state.metadataPrefix)
    self.assertEqual('s', state.set)
    self.assertEqual(0, state.nextRequestTime)

    # Change state of oaiDownloadProcessor -> changes stateView.
    oaiDownloadProcessor.setSet('x')
    self.assertEqual('x', state.set)

    oaiDownloadProcessor2 = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    state2 = oaiDownloadProcessor2.getState()
    self.assertEqual(None, state2.name)
    self.assertEqual("oai_dc", state2.metadataPrefix)
    self.assertEqual("x?y&z", state2.resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', state2.from_)
    self.assertEqual(None, state2.errorState)
    # Fixed: this used to re-check `state` (the first processor, already
    # asserted above), leaving the second processor's value unverified.
    self.assertEqual(0, state2.nextRequestTime)
def testBuildRequestNoneWhenNoResumptionToken(self):
    # After a response without resumption token the processor holds no
    # token and buildRequest() yields None.
    # NOTE(review): LISTRECORDS_RESPONSE is used without `% ''` here,
    # unlike several other tests -- confirm this is intended.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE))
    consume(processor.handle(response))

    self.assertEqual(None, processor._resumptionToken)
    self.assertEqual(None, processor.buildRequest())
def testYieldSuspendFromAdd(self):
    # A Suspend yielded by the observer's 'add' generator is passed
    # through by handle(), followed by one None.
    observer = CallTrace()
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False)
    processor.addObserver(observer)

    suspend = Suspend()
    # Must stay a real generator object, as in the original setup.
    observer.returnValues['add'] = (item for item in [suspend])

    response = parse(StringIO(LISTRECORDS_RESPONSE % ''))
    yields = list(compose(processor.handle(response)))

    self.assertEqual([suspend, None], yields)
def testSignalHarvestingDone(self):
    # Handling a response without resumption token ends the batch with a
    # 'signalHarvestingDone' call on the observer.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path='/p',
        metadataPrefix='p',
        workingDirectory=self.tempdir,
        incrementalHarvestSchedule=None)
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE % ''))
    consume(processor.handle(response))

    expectedCalls = ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone']
    self.assertEqual(expectedCalls, observer.calledMethodNames())
def testIncrementalHarvestScheduleNone(self):
    # With incrementalHarvestSchedule=None the 'from' value is still
    # recorded, but no next request time is set.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None)
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE % ''))
    consume(processor.handle(response))

    self.assertEqual(None, processor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)
    self.assertEqual(None, processor._earliestNextRequestTime)
def testIncrementalHarvestWithFromWithDefaultScheduleMidnight(self):
    # No schedule is passed, so the processor's default is used. With the
    # clocks patched to 01:00 (3600 seconds), the next request time after
    # a finished harvest is expected to be 24 * 60 * 60.0.
    # NOTE(review): LISTRECORDS_RESPONSE is used without `% ''` here,
    # unlike several other tests -- confirm this is intended.
    def oneHourInSeconds():
        return 1 * 60 * 60

    def oneOClock():
        return datetime.strptime("01:00", "%H:%M")

    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    # Processor and schedule share the very same fake clock.
    processor._time = oneHourInSeconds
    processor._incrementalHarvestSchedule._time = oneHourInSeconds
    processor._incrementalHarvestSchedule._utcnow = oneOClock
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE))
    consume(processor.handle(response))

    self.assertEqual(None, processor._resumptionToken)
    self.assertEqual(24 * 60 * 60.0, processor._earliestNextRequestTime)
def testRestartAfterFinish(self):
    # With restartAfterFinish=True a finished harvest (no resumption
    # token) is followed by a fresh ListRecords request without 'from'.
    # NOTE(review): LISTRECORDS_RESPONSE is used without `% ''` here,
    # unlike several other tests -- confirm this is intended.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        restartAfterFinish=True)
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE))
    consume(processor.handle(response))
    self.assertEqual(None, processor._resumptionToken)

    request = processor.buildRequest()
    expectedStart = 'GET /oai?verb=ListRecords&metadataPrefix=oai_dc HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: '
    self.assertTrue(request.startswith(expectedStart), request)
def testIncrementalHarvestReScheduleIfNoRecordsMatch(self):
    # After a regular harvest, handling NO_RECORDS_MATCH_RESPONSE leaves
    # no error state and moves 'from' to '2012-06-01T19:20:30Z'.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        incrementalHarvestSchedule=Schedule(period=0),
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO())
    processor.addObserver(observer)

    firstResponse = parse(StringIO(LISTRECORDS_RESPONSE % ''))
    consume(processor.handle(firstResponse))
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)

    noMatchResponse = parse(StringIO(NO_RECORDS_MATCH_RESPONSE))
    consume(processor.handle(noMatchResponse))
    self.assertEqual(None, processor._errorState)
    self.assertEqual('2012-06-01T19:20:30Z', processor._from)
def testHandle(self):
    # A single-record response leads to one 'add' call wrapped in a
    # batch, with lxmlNode, datestamp and identifier as keyword
    # arguments, followed by 'signalHarvestingDone'.
    def add(**kwargs):
        return (item for item in [])

    observer = CallTrace(methods={'add': add})
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False)
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE % ''))
    list(compose(processor.handle(response)))

    calledNames = [call.name for call in observer.calledMethods]
    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        calledNames)

    addCall = observer.calledMethods[1]
    self.assertEqual(0, len(addCall.args))
    self.assertEqualsWS(ONE_RECORD, lxmltostring(addCall.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', addCall.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', addCall.kwargs['identifier'])
def testShutdownPersistsStateOnAutocommit(self):
    # With autoCommit=False no state file exists after handling a
    # response; handleShutdown() writes it.
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        autoCommit=False)
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))
    consume(processor.handle(response))
    state = processor.getState()
    stateFile = join(self.tempdir, 'harvester.state')
    self.assertFalse(isfile(stateFile))

    processor.handleShutdown()

    expectedState = {
        "errorState": None,
        'from': '2002-06-01T19:20:30Z',
        "resumptionToken": state.resumptionToken,
    }
    self.assertEqual(expectedState, JsonDict.load(stateFile))
def testResponseDateAsFrom(self):
    # The 'from' value recorded while handling a response is also what a
    # new processor on the same working directory starts with.
    def newProcessor():
        return OaiDownloadProcessor(
            path="/oai",
            metadataPrefix="oai_dc",
            workingDirectory=self.tempdir,
            xWait=False,
            err=StringIO())

    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = newProcessor()
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))
    consume(processor.handle(response))
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)

    reloaded = newProcessor()
    self.assertEqual('2002-06-01T19:20:30Z', reloaded._from)
def testKeepResumptionTokenOnFailingAddCall(self):
    # When the observer's 'add' raises, handle() propagates the
    # exception, a traceback naming the record is written to `err`, and
    # the next request still uses the resumption token from the state
    # file.
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    with open(join(self.tempdir, 'harvester.state'), 'w') as f:
        f.write("Resumptiontoken: %s\n" % resumptionToken)

    observer = CallTrace()
    observer.exceptions = {'add': Exception("Could be anything")}
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    processor.addObserver(observer)

    def resumeRequest():
        # Built freshly before every comparison, like the inline original.
        query = urlencode([
            ('verb', 'ListRecords'),
            ('resumptionToken', resumptionToken),
            ('x-wait', 'True'),
        ])
        return 'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (
            query, processor._identifier)

    self.assertEqual(resumeRequest(), processor.buildRequest())

    with self.assertRaises(Exception):
        list(
            compose(
                processor.handle(
                    parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN)))))

    calledNames = [call.name for call in observer.calledMethods]
    self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], calledNames)

    errorOutput = processor._err.getvalue()
    self.assertTrue(errorOutput.startswith('Traceback'), errorOutput)
    expectedFragment = 'Exception: Could be anything\nWhile processing:\n<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:1'
    self.assertTrue(expectedFragment in errorOutput, errorOutput)

    self.assertEqual(resumeRequest(), processor.buildRequest())
def testHarvesterStateWithError(self):
    # After a failing 'add', the state view keeps the resumption token
    # from the state file and reports an error message; a second
    # processor on the same working directory reports the same token and
    # error.
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    expectedError = "ERROR while processing 'oai:identifier:1': Could be anything"
    with open(join(self.tempdir, 'harvester.state'), 'w') as f:
        f.write("Resumptiontoken: %s\n" % resumptionToken)

    observer = CallTrace()
    observer.exceptions = {'add': Exception("Could be anything")}
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
        name="Name")
    processor.addObserver(observer)

    with self.assertRaises(Exception):
        list(
            compose(
                processor.handle(
                    parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN)))))

    state = processor.getState()
    self.assertEqual(resumptionToken, state.resumptionToken)
    self.assertEqual(None, state.from_)
    self.assertEqual(expectedError, state.errorState)
    self.assertEqual("Name", state.name)

    reloaded = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO())
    reloadedState = reloaded.getState()
    self.assertEqual(resumptionToken, reloadedState.resumptionToken)
    self.assertEqual(expectedError, reloadedState.errorState)
def testUseResumptionToken(self):
    # The resumption token from a response is used (url-encoded) in the
    # next request and is also available to a new processor on the same
    # working directory.
    def newProcessor():
        return OaiDownloadProcessor(
            path="/oai",
            metadataPrefix="oai_dc",
            workingDirectory=self.tempdir,
            xWait=True,
            err=StringIO())

    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = newProcessor()
    processor.addObserver(observer)

    response = parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))
    consume(processor.handle(response))
    self.assertEqual('x?y&z', processor._resumptionToken)

    expectedRequest = 'GET /oai?verb=ListRecords&resumptionToken=x%%3Fy%%26z&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % processor._identifier
    self.assertEqual(expectedRequest, processor.buildRequest())

    reloaded = newProcessor()
    self.assertEqual('x?y&z', reloaded._resumptionToken)
def testHandleYieldsAtLeastOnceAfterEachRecord(self):
    # Even when 'add' itself yields nothing, handle() yields once per
    # record: one yield for a one-record response, two for two records.
    def add(**kwargs):
        # The unreachable `yield` makes this a generator function that
        # finishes immediately.
        return
        yield

    observer = CallTrace(methods={'add': add})
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False)
    processor.addObserver(observer)

    oneRecordResponse = parse(StringIO(LISTRECORDS_RESPONSE % ''))
    yields = list(compose(processor.handle(oneRecordResponse)))
    self.assertEqual(1, len(yields))

    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
    twoRecordsResponse = parse(StringIO(LISTRECORDS_RESPONSE % secondRecord))
    yields = list(compose(processor.handle(twoRecordsResponse)))
    self.assertEqual(2, len(yields))