class HarvesterDataActionsTest(SeecrTestCase):
    """Unit tests for HarvesterDataActions: form-encoded POST requests that
    update repository settings and administer 'shop closed' timeslots."""

    def setUp(self):
        SeecrTestCase.setUp(self)
        # Minimal domain -> group -> repository hierarchy to act upon.
        self.hd = HarvesterData(self.tempdir)
        self.hd.addDomain('domain')
        self.hd.addRepositoryGroup('group', domainId='domain')
        self.hd.addRepository('repository', repositoryGroupId='group', domainId='domain')
        self.hda = HarvesterDataActions()
        self.hda.addObserver(self.hd)

    def _post(self, data, path='/somewhere/updateRepository'):
        # Simulate a browser form POST to the given action path.
        consume(self.hda.handleRequest(
            Method='POST',
            path=path,
            Body=urlencode(data, doseq=True)))

    def testUpdateRepository(self):
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "baseurl": "http://example.org/oai",
            "set": "ASET",
            "metadataPrefix": "oai_dc",
            "mappingId": "mapping_identifier",
            "targetId": "",
            "collection": "the collection",
            "maximumIgnore": "23",
            "complete": "1",
            "continuous": "60",
            "repositoryAction": "clear",
            "numberOfTimeslots": "0",
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        # The posted repositoryGroupId is ignored; the stored group wins.
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual("http://example.org/oai", repository["baseurl"])
        self.assertEqual("ASET", repository["set"])
        self.assertEqual("oai_dc", repository["metadataPrefix"])
        self.assertEqual("mapping_identifier", repository["mappingId"])
        self.assertEqual(None, repository["targetId"])  # empty field -> None
        self.assertEqual("the collection", repository["collection"])
        self.assertEqual(23, repository["maximumIgnore"])  # string parsed to int
        self.assertEqual(True, repository["complete"])
        self.assertEqual(60, repository["continuous"])
        self.assertEqual(False, repository["use"])  # checkbox absent -> False
        self.assertEqual("clear", repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testMinimalInfo(self):
        # Only the mandatory fields; every optional value falls back to its default.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual(None, repository["baseurl"])
        self.assertEqual(None, repository["set"])
        self.assertEqual(None, repository["metadataPrefix"])
        self.assertEqual(None, repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual(None, repository["collection"])
        self.assertEqual(0, repository["maximumIgnore"])
        self.assertEqual(None, repository["continuous"])
        self.assertEqual(False, repository["complete"])
        self.assertEqual(False, repository["use"])
        self.assertEqual(None, repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedButNotAdded(self):
        # Timeslot _0 fields are only stored when the add button was pressed.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedAdded(self):
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            "addTimeslot": "button pressed",
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['*:*:7:0-*:*:9:0'], repository['shopclosed'])

    def testModifyShopClosed(self):
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        # Slot _0 is the (unsubmitted) new-entry row; slots _1.._numberOfTimeslots
        # replace the previously stored timeslots.
        self.assertEqual([
            '3:*:17:0-3:*:19:0',
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testDeleteShopClosed(self):
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
            # Image-button click coordinates for the delete button of slot _1.
            'deleteTimeslot_1.x': '10',
            'deleteTimeslot_1.y': '20',
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testSetRepositoryDone(self):
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', repository['action'])
        data = dict(domainId='domain', identifier='repository')
        self._post(data, path='/somewhere/repositoryDone')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def updateTheRepository(self, baseurl='', set='', metadataPrefix='', mappingId='', targetId='', collection='', maximumIgnore=0, use=False, continuous=False, complete=True, action='', shopclosed=None):
        # Write repository state directly, bypassing the HTTP layer.
        self.hd.updateRepository('repository', domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            shopclosed=shopclosed or [])
class HarvesterTest(IntegrationTestCase):
    """Integration tests driving the harvester binary against a helper OAI
    server, checking the dump directory, state files and request logs."""

    def setUp(self):
        IntegrationTestCase.setUp(self)
        # Start from a clean slate: logs, state, filesystem target and dump dir.
        system("rm -rf %s" % self.harvesterLogDir)
        system("rm -rf %s" % self.harvesterStateDir)
        self.filesystemDir = join(self.integrationTempdir, 'filesystem')
        system("rm -rf %s" % self.filesystemDir)
        self.emptyDumpDir()
        system("mkdir -p %s" % join(self.harvesterStateDir, DOMAIN))
        self.harvesterData = HarvesterData(join(self.integrationTempdir, 'data'))
        try:
            self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
        except ValueError:
            pass  # group may already exist from a previous test run
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)

    def tearDown(self):
        self.removeRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)
        IntegrationTestCase.tearDown(self)

    def saveRepository(self, domain, repositoryId, repositoryGroupId, metadataPrefix="oai_dc", action=None, mappingId='MAPPING', targetId='SRUUPDATE', maximumIgnore=5, complete=False, continuous=None):
        """Create (if needed) and (re)configure a repository pointing at the helper OAI server."""
        try:
            self.harvesterData.addRepository(identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId)
        except ValueError:
            pass  # repository already exists; just update it below
        self.harvesterData.updateRepository(
            identifier=repositoryId,
            domainId=domain,
            baseurl='http://localhost:%s/oai' % self.helperServerPortNumber,
            set=None,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=None,
            maximumIgnore=maximumIgnore,
            use=True,
            complete=complete,
            continuous=continuous,
            action=action,
            shopclosed=[]
        )

    def removeRepository(self, domain, repositoryId, repositoryGroupId):
        self.harvesterData.deleteRepository(identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId)

    def testHarvestReturnsErrorWillNotSaveState(self):
        logs = self.getLogs()
        self.saveRepository(DOMAIN, "repo_invalid_metadataPrefix", REPOSITORYGROUP, metadataPrefix="not_existing")
        try:
            # Two runs; since the first fails, no state is saved and the second
            # repeats the exact same request.
            self.startHarvester(repository="repo_invalid_metadataPrefix")
            self.startHarvester(repository="repo_invalid_metadataPrefix")
            logs = self.getLogs()[len(logs):]
            self.assertEqual(2, len(logs))
            self.assertEqual('/oai', logs[-2]['path'])
            self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['not_existing']}, logs[0]['arguments'])
            self.assertEqual('/oai', logs[-1]['path'])
            self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['not_existing']}, logs[1]['arguments'])
        finally:
            self.removeRepository(DOMAIN, 'repo_invalid_metadataPrefix', REPOSITORYGROUP)

    def testHarvestToSruUpdate(self):
        # initial harvest
        oldlogs = self.getLogs()
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        self.assertEqual(2, len([f for f in listdir(self.dumpDir) if "info:srw/action/1/delete" in open(join(self.dumpDir, f)).read()]))
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEqual(8, len(ids))
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEqual(0, len(invalidIds))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(1, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}, logs[-1]['arguments'])
        statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
        token = getResumptionToken(open(statsFile).readlines()[-1])

        # resumptionToken
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(15, self.sizeDumpDir())
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEqual(13, len(ids))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(2, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({'verb': ['ListRecords'], 'resumptionToken': [token]}, logs[-1]['arguments'])

        # Nothing left to harvest
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(2, len(logs))
        self.assertEqual(None, getResumptionToken(open(statsFile).readlines()[-1]))

    def testContinuousHarvest(self):
        oldlogs = self.getLogs()
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1)
        t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False, repository=REPOSITORY))
        t.start()
        try:
            sleepWheel(5)
            logs = self.getLogs()[len(oldlogs):]
            self.assertTrue(len(logs) > 2, logs)
            self.assertEqual({'path': '/oai', 'arguments': {'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}}, logs[0])
            self.assertTrue('resumptionToken' in logs[1]['arguments'], logs[1])
            # After the token is exhausted, continuous mode re-harvests with 'from'.
            self.assertTrue('from' in logs[2]['arguments'], logs[2])
        finally:
            t.join()

    def testIncrementalHarvesting(self):
        oldlogs = self.getLogs()
        statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
        # Fake a completed previous harvest; the 'from' date must come from the
        # oldest fully harvested run (2011-03-31).
        with open(statsFile, 'w') as f:
            f.write('Started: 2011-03-31 13:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-03-31 13:12:36, ResumptionToken: xyz\n')
            f.write('Started: 2011-04-01 14:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-04-01 14:12:36, ResumptionToken:\n')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(1, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc'], 'from': ['2011-03-31']}, logs[-1]['arguments'])

    def testClear(self):
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        header, result = getRequest(self.harvesterInternalServerPortNumber, '/get', {'verb': 'GetStatus', 'domainId': DOMAIN, 'repositoryId': REPOSITORY}, parse=False)
        data = JsonDict.loads(result)
        self.assertEqual(8, data['response']['GetStatus'][0]['total'])

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(18, self.sizeDumpDir())
        # The clear run appends one delete request per previously uploaded record.
        for filename in sorted(listdir(self.dumpDir))[-8:]:
            self.assertTrue('_delete.updateRequest' in filename, filename)
        header, result = getRequest(self.harvesterInternalServerPortNumber, '/get', {'verb': 'GetStatus', 'domainId': DOMAIN, 'repositoryId': REPOSITORY}, parse=False)
        self.assertEqual(0, JsonDict.loads(result)['response']['GetStatus'][0]['total'])

    def testRefresh(self):
        oldlogs = self.getLogs()
        # Seed state with previously uploaded and deleted identifiers.
        log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
        log.startRepository()
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 7, 120, 121]]:
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123]]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

        self.startHarvester(repository=REPOSITORY)  # refresh init; no requests yet
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(0, len(logs))

        self.startHarvester(repository=REPOSITORY)  # full re-harvest starts
        logs = self.getLogs()
        self.assertEqual('/oai', logs[-1]["path"])
        self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}, logs[-1]["arguments"])
        statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
        token = getResumptionToken(open(statsFile).readlines()[-1])

        self.startHarvester(repository=REPOSITORY)  # resumption
        logs = self.getLogs()
        self.assertEqual('/oai', logs[-1]["path"])
        self.assertEqual({'verb': ['ListRecords'], 'resumptionToken': [token]}, logs[-1]["arguments"])
        self.assertEqual(15, self.sizeDumpDir())

        self.startHarvester(repository=REPOSITORY)  # finish: delete stale records
        self.assertEqual(17, self.sizeDumpDir())
        deleteFiles = [join(self.dumpDir, f) for f in listdir(self.dumpDir) if '_delete' in f]
        deletedIds = set([xpathFirst(parse(open(x)), '//ucp:recordIdentifier/text()') for x in deleteFiles])
        self.assertEqual(set(['%s:oai:record:03' % REPOSITORY, '%s:oai:record:06' % REPOSITORY, '%s:oai:record:120' % REPOSITORY, '%s:oai:record:121' % REPOSITORY]), deletedIds)

        logs = self.getLogs()[len(oldlogs):]
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(len(logs), len(self.getLogs()[len(oldlogs):]), 'Action is over, expect nothing more.')

    def testInvalidIgnoredUptoMaxIgnore(self):
        maxIgnore = 5
        self.controlHelper(action='allInvalid')
        nrOfDeleted = 2
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(nrOfDeleted, self.sizeDumpDir())
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEqual(0, len(ids))
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEqual(maxIgnore + 1, len(invalidIds), invalidIds)
        invalidDataMessagesDir = join(self.harvesterLogDir, DOMAIN, "invalid", REPOSITORY)
        self.assertEqual(maxIgnore + 1, len(listdir(invalidDataMessagesDir)))
        invalidDataMessage01 = open(join(invalidDataMessagesDir, "oai:record:01")).read()
        # BUG FIX: the original called assertTrue with the literal string as the
        # condition, which always passes. Now actually check the message mentions
        # the uploadId of the invalid record (built from REPOSITORY; the original
        # hard-coded 'integrationtest'). NOTE(review): exact message format not
        # visible from this file — confirm against the invalid-data logger.
        self.assertTrue('%s:oai:record:01' % REPOSITORY in invalidDataMessage01, invalidDataMessage01)

        self.controlHelper(action='noneInvalid')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(nrOfDeleted + BATCHSIZE, self.sizeDumpDir())
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEqual(BATCHSIZE - nrOfDeleted, len(ids))
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEqual(0, len(invalidIds), invalidIds)
        self.assertEqual(0, len(listdir(invalidDataMessagesDir)))

    def testHarvestToFilesystemTarget(self):
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(8, len(listdir(join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY))))
        self.assertEqual(
            ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [3, 6]],
            [line.strip() for line in open(join(self.filesystemDir, 'deleted_records'))])

    def testClearOnFilesystemTarget(self):
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM', action='clear')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(0, len(listdir(join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY))))
        self.assertEqual(set([
                'harvestertestrepository:oai:record:10',
                'harvestertestrepository:oai:record:09',
                'harvestertestrepository:oai:record:08',
                'harvestertestrepository:oai:record:07',
                'harvestertestrepository:oai:record:06',
                'harvestertestrepository:oai:record:05',
                'harvestertestrepository:oai:record:04',
                'harvestertestrepository:oai:record:03',
                'harvestertestrepository:oai:record:02%2F&gkn',
                'harvestertestrepository:oai:record:01',
            ]),
            set([line.strip() for line in open(join(self.filesystemDir, 'deleted_records'))])
        )

    def testHarvestWithError(self):
        self.startHarvester(repository=REPOSITORY)
        self.emptyDumpDir()
        # Uploading record 12 fails; only the record before it gets through.
        self.controlHelper(action='raiseExceptionOnIds', id=['%s:oai:record:12' % REPOSITORY])
        self.startHarvester(repository=REPOSITORY)
        successFullRecords = ['oai:record:11']
        self.assertEqual(len(successFullRecords), self.sizeDumpDir())

        self.emptyDumpDir()
        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.startHarvester(repository=REPOSITORY)
        secondBatchSize = 5
        self.assertEqual(secondBatchSize, self.sizeDumpDir())

    def testClearWithError(self):
        self.startHarvester(repository=REPOSITORY)
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
        # Deleting record 05 fails; clearing stops there and resumes next run.
        self.controlHelper(action='raiseExceptionOnIds', id=['%s:oai:record:05' % REPOSITORY])
        self.emptyDumpDir()
        self.startHarvester(repository=REPOSITORY)
        successFullDeletes = [1, 2, 4]
        deletesTodo = [5, 7, 8, 9, 10]
        self.assertEqual(len(successFullDeletes), self.sizeDumpDir())

        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.emptyDumpDir()
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(len(deletesTodo), self.sizeDumpDir())

    def testRefreshWithIgnoredRecords(self):
        log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
        log.startRepository()
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]]:
            if uploadId == '%s:oai:record:02' % (REPOSITORY):
                # Exercise an uploadId containing characters needing escaping.
                uploadId = '%s:oai:record:02/&gkn' % (REPOSITORY)
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123, 124]]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [7, 8, 125, 126, 127, 128]]:
            log.notifyHarvestedRecord(uploadId)
            log.logInvalidData(uploadId, 'ignored message')
            log.logIgnoredIdentifierWarning(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        totalRecords = 15
        oldUploads = 2
        oldDeletes = 3
        oldIgnoreds = 4
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')
        self.startHarvester(repository=REPOSITORY)  # Smooth init
        self.assertEqual(0, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY)  # Smooth harvest
        self.startHarvester(repository=REPOSITORY)  # Smooth harvest
        self.assertEqual(totalRecords, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY)  # Smooth finish
        self.assertEqual(totalRecords + oldUploads + oldIgnoreds, self.sizeDumpDir())
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEqual(0, len(invalidIds), invalidIds)
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEqual(13, len(ids), ids)

    def testClearWithInvalidRecords(self):
        log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
        log.startRepository()
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]]:
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123, 124]]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [7, 8, 125, 126, 127, 128]]:
            log.notifyHarvestedRecord(uploadId)
            log.logInvalidData(uploadId, 'ignored message')
            log.logIgnoredIdentifierWarning(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        oldUploads = 4
        oldDeletes = 5
        oldInvalids = 6
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
        self.startHarvester(repository=REPOSITORY)
        # Clearing deletes both previously uploaded and previously invalid records.
        self.assertEqual(oldUploads + oldInvalids, self.sizeDumpDir())
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEqual(0, len(invalidIds), invalidIds)
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEqual(0, len(ids), ids)

    def testConcurrentHarvestToSruUpdate(self):
        self.startHarvester(concurrency=3)
        requestsLogged = sorted(listdir(self.dumpDir))
        repositoryIds = []
        for f in requestsLogged:
            lxml = parse(open(join(self.dumpDir, f)))
            repositoryIds.append(xpath(lxml, '//ucp:recordIdentifier/text()')[0].split(':', 1)[0])
        repositoryIdsSet = set(repositoryIds)
        self.assertEqual(set(['repository2', 'integrationtest', 'harvestertestrepository']), repositoryIdsSet)
        # With concurrency, records of different repositories must interleave:
        # the sequence of repository ids must switch back to one seen before,
        # which triggers a second set.remove on the same id (KeyError).
        lastSeenRepoId = None
        try:
            for repo in repositoryIds:
                if repo != lastSeenRepoId:
                    repositoryIdsSet.remove(repo)
                    lastSeenRepoId = repo
                    continue
        except KeyError:
            pass
        else:
            self.fail('Records should have been inserted out-of-order.')

    def testConcurrentHarvestToSruUpdateBUG(self):
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)
        self.startHarvester(concurrency=1)
        requestsLogged = sorted(listdir(self.dumpDir))
        repositoryIds = []
        for f in requestsLogged:
            lxml = parse(open(join(self.dumpDir, f)))
            repositoryIds.append(xpath(lxml, '//ucp:recordIdentifier/text()')[0].split(':', 1)[0])
        self.assertEqual(15, repositoryIds.count(REPOSITORY))
        self.assertEqual(10, repositoryIds.count('repository2'))
        self.assertEqual(10, repositoryIds.count('integrationtest'))

    def testStartHarvestingAddedRepository(self):
        t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False))
        t.start()
        # Wait until the running harvester has produced some output.
        while not listdir(self.dumpDir):
            sleep(0.1)
        self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        stdoutfile = join(self.integrationTempdir, "stdouterr-meresco-harvester-harvester.log")
        sleepWheel(5)
        log = open(stdoutfile).read()
        try:
            self.assertTrue('xyz' in log, log)
        finally:
            self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
            t.join()

    def testDontHarvestDeletedRepository(self):
        stdoutfile = join(self.integrationTempdir, "stdouterr-meresco-harvester-harvester.log")
        self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False))
        t.start()
        while not listdir(self.dumpDir):
            sleep(0.1)
        sleepWheel(1)
        log = open(stdoutfile).read()
        xyzOccurrences = log.count('[xyz]')
        self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        sleepWheel(5)
        log = open(stdoutfile).read()
        try:
            self.assertFalse('Traceback' in log, log)
            # No further [xyz] activity after the repository was removed.
            newXyzOccurrences = log.count('[xyz]')
            self.assertEqual(xyzOccurrences, newXyzOccurrences, "%s!=%s\n%s" % (xyzOccurrences, newXyzOccurrences, log))
        finally:
            t.join()

    def testConcurrencyAtLeastOne(self):
        stdouterrlog = self.startHarvester(concurrency=0, expectedReturnCode=2)
        self.assertTrue("Concurrency must be at least 1" in stdouterrlog, stdouterrlog)
        stdouterrlog = self.startHarvester(concurrency=-1, expectedReturnCode=2)
        self.assertTrue("Concurrency must be at least 1" in stdouterrlog, stdouterrlog)

    def testCompleteInOnAttempt(self):
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)
        stdouterrlog = self.startHarvester(repository=REPOSITORY, runOnce=True, timeoutInSeconds=5)
        self.assertEqual(15, self.sizeDumpDir())
        self.assertTrue("Repository will be completed in one attempt" in stdouterrlog, stdouterrlog)

    def testHarvestingContinues4Ever(self):
        try:
            self.startHarvester(repository=REPOSITORY, runOnce=False, timeoutInSeconds=5)
        # BUG FIX: 'except SystemExit, e:' is Python-2-only syntax;
        # 'as e' works on Python 2.6+ and Python 3.
        except SystemExit as e:
            self.assertTrue('took more than 5 seconds' in str(e), str(e))
        self.assertEqual(15, self.sizeDumpDir())
# NOTE(review): this class is a near-verbatim duplicate of the
# HarvesterDataActionsTest defined earlier in this file. At import time this
# later definition shadows the earlier one, so only this copy's tests run —
# almost certainly a copy/paste or merge artifact; consider removing one copy.
class HarvesterDataActionsTest(SeecrTestCase):
    """Unit tests for HarvesterDataActions: form-encoded POST requests that
    update repository settings and administer 'shop closed' timeslots."""

    def setUp(self):
        SeecrTestCase.setUp(self)
        # Minimal domain -> group -> repository hierarchy to act upon.
        self.hd = HarvesterData(self.tempdir)
        self.hd.addDomain('domain')
        self.hd.addRepositoryGroup('group', domainId='domain')
        self.hd.addRepository('repository', repositoryGroupId='group', domainId='domain')
        self.hda = HarvesterDataActions()
        self.hda.addObserver(self.hd)

    def _post(self, data, path='/somewhere/updateRepository'):
        # Simulate a browser form POST to the given action path.
        consume(self.hda.handleRequest(Method='POST', path=path, Body=urlencode(data, doseq=True)))

    def testUpdateRepository(self):
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "baseurl": "http://example.org/oai",
            "set": "ASET",
            "metadataPrefix": "oai_dc",
            "mappingId": "mapping_identifier",
            "targetId": "",
            "collection": "the collection",
            "maximumIgnore": "23",
            "complete": "1",
            "continuous": "60",
            "repositoryAction": "clear",
            "numberOfTimeslots": "0",
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        # The posted repositoryGroupId is ignored; the stored group wins.
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual("http://example.org/oai", repository["baseurl"])
        self.assertEqual("ASET", repository["set"])
        self.assertEqual("oai_dc", repository["metadataPrefix"])
        self.assertEqual("mapping_identifier", repository["mappingId"])
        self.assertEqual(None, repository["targetId"])  # empty field -> None
        self.assertEqual("the collection", repository["collection"])
        self.assertEqual(23, repository["maximumIgnore"])  # string parsed to int
        self.assertEqual(True, repository["complete"])
        self.assertEqual(60, repository["continuous"])
        self.assertEqual(False, repository["use"])  # checkbox absent -> False
        self.assertEqual("clear", repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testMinimalInfo(self):
        # Only the mandatory fields; every optional value falls back to its default.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual(None, repository["baseurl"])
        self.assertEqual(None, repository["set"])
        self.assertEqual(None, repository["metadataPrefix"])
        self.assertEqual(None, repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual(None, repository["collection"])
        self.assertEqual(0, repository["maximumIgnore"])
        self.assertEqual(None, repository["continuous"])
        self.assertEqual(False, repository["complete"])
        self.assertEqual(False, repository["use"])
        self.assertEqual(None, repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedButNotAdded(self):
        # Timeslot _0 fields are only stored when the add button was pressed.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedAdded(self):
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            "addTimeslot": "button pressed",
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['*:*:7:0-*:*:9:0'], repository['shopclosed'])

    def testModifyShopClosed(self):
        self.updateTheRepository(shopclosed=['1:2:7:0-1:2:9:0', '2:*:7:0-2:*:9:0'])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        # Slot _0 is the (unsubmitted) new-entry row; slots _1.._numberOfTimeslots
        # replace the previously stored timeslots.
        self.assertEqual(['3:*:17:0-3:*:19:0', '4:5:9:0-4:5:10:0'], repository['shopclosed'])

    def testDeleteShopClosed(self):
        self.updateTheRepository(shopclosed=['1:2:7:0-1:2:9:0', '2:*:7:0-2:*:9:0'])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
            # Image-button click coordinates for the delete button of slot _1.
            'deleteTimeslot_1.x': '10',
            'deleteTimeslot_1.y': '20',
        }
        self._post(data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['4:5:9:0-4:5:10:0'], repository['shopclosed'])

    def testSetRepositoryDone(self):
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', repository['action'])
        data = dict(domainId='domain', identifier='repository')
        self._post(data, path='/somewhere/repositoryDone')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def updateTheRepository(self, baseurl='', set='', metadataPrefix='', mappingId='', targetId='', collection='', maximumIgnore=0, use=False, continuous=False, complete=True, action='', shopclosed=None):
        # Write repository state directly, bypassing the HTTP layer.
        self.hd.updateRepository('repository', domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            shopclosed=shopclosed or []
        )