def setUp(self):
    """Build the fixture: a HarvesterData store in the test tempdir that
    contains one domain, one repository group and one repository, observed
    by a fresh HarvesterDataActions instance."""
    SeecrTestCase.setUp(self)
    self.hd = store = HarvesterData(self.tempdir)
    store.addDomain('domain')
    store.addRepositoryGroup('group', domainId='domain')
    store.addRepository('repository', domainId='domain', repositoryGroupId='group')
    self.hda = actions = HarvesterDataActions()
    # The actions component delegates to the store through the observer link.
    actions.addObserver(store)
def setUp(self):
    """Bring the integration environment back to a clean state.

    Removes the harvester log, state and filesystem-target directories,
    asks the control helper for a 'reset', empties the dump directory,
    recreates the per-domain state and log directories and (re)registers
    the default repository group and repository.
    """
    # Function-scope import: keeps this block self-contained.
    from shutil import rmtree

    IntegrationTestCase.setUp(self)
    self.filesystemDir = join(self.integrationTempdir, 'filesystem')
    # rmtree instead of a shell "rm -rf %s": no shell is spawned and paths
    # containing spaces or shell metacharacters are handled correctly.
    # ignore_errors=True keeps the "missing directory is fine" behaviour.
    for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
        rmtree(staleDir, ignore_errors=True)
    self.controlHelper(action='reset')
    self.emptyDumpDir()
    self.domainStatePath = pathlib.Path(self.harvesterStateDir) / DOMAIN
    self.domainLogPath = pathlib.Path(self.harvesterLogDir) / DOMAIN
    self.domainStatePath.mkdir(parents=True)
    self.domainLogPath.mkdir(parents=True)
    self.harvesterData = HarvesterData(join(self.integrationTempdir, 'data'))
    try:
        self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
    except ValueError:
        # Deliberately best-effort; presumably raised when the group already
        # exists from an earlier test -- confirm against HarvesterData.
        pass
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)
def setUp(self):
    """Bring the integration environment back to a clean state.

    Removes the harvester log, state and filesystem-target directories,
    empties the dump directory, recreates the per-domain state directory
    and (re)registers the default repository group and repository.
    """
    # Function-scope imports: keeps this block self-contained.
    from os import makedirs
    from os.path import isdir
    from shutil import rmtree

    IntegrationTestCase.setUp(self)
    self.filesystemDir = join(self.integrationTempdir, 'filesystem')
    # rmtree instead of a shell "rm -rf %s": no shell is spawned and paths
    # containing spaces or shell metacharacters are handled correctly.
    # ignore_errors=True keeps the "missing directory is fine" behaviour.
    for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
        rmtree(staleDir, ignore_errors=True)
    self.emptyDumpDir()
    # Equivalent of "mkdir -p": create parents, tolerate an existing directory.
    domainStateDir = join(self.harvesterStateDir, DOMAIN)
    if not isdir(domainStateDir):
        makedirs(domainStateDir)
    self.harvesterData = HarvesterData(join(self.integrationTempdir, 'data'))
    try:
        self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
    except ValueError:
        # Deliberately best-effort; presumably raised when the group already
        # exists from an earlier test -- confirm against HarvesterData.
        pass
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)
def setUp(self):
    """Build the fixture used by the action tests.

    Creates a HarvesterData store (domain / group / repository) with three
    extra repository field definitions, an actions component observed by
    that store (self.hda), and a second actions component wired to a
    CallTrace (self.dna / self.observable) so calls can be inspected.
    """
    SeecrTestCase.setUp(self)
    self.hd = store = HarvesterData(self.tempdir)
    store.addDomain('domain')
    store.addRepositoryGroup('group', domainId='domain')
    store.addRepository('repository', repositoryGroupId='group', domainId='domain')

    # (name, label, type) of every extra repository field; none is exported.
    fieldSpecs = (
        ('name', 'Label', 'text'),
        ('choice_1', 'Keuze', 'bool'),
        ('choice_2', 'Keuze', 'bool'),
    )
    repositoryFields = [
        {'name': fieldName, 'label': label, 'type': fieldType, 'export': False}
        for fieldName, label, fieldType in fieldSpecs
    ]
    store.updateFieldDefinition('domain', {'repository_fields': repositoryFields})

    self.hda = actions = HarvesterDataActions()
    actions.addObserver(store)

    self.observable = CallTrace()
    tree = (Observable(), (HarvesterDataActions(), (self.observable,)))
    self.dna = be(tree)
def setUp(self):
    """Write a small on-disk data set and open it with HarvesterData.

    The data set consists of one domain ('adomain'), two repository groups
    with two repositories each, and one repository ('remi') whose
    repositoryGroupId ('NoGroup') has no group file.
    """
    SeecrTestCase.setUp(self)
    # (filename, JSON content) pairs, written in this order.
    fixtures = [
        ('adomain.domain',
            """{ "identifier": "adomain", "mappingIds": ["ignored MAPPING"], "targetIds": ["ignored TARGET"], "repositoryGroupIds": ["Group1", "Group2"] }"""),
        ('adomain.Group1.repositoryGroup',
            """{ "identifier": "Group1", "name": {"nl": "Groep1", "en": "Group1"}, "repositoryIds": ["repository1", "repository2"] }"""),
        ('adomain.Group2.repositoryGroup',
            """{ "identifier": "Group2", "name": {"nl": "Groep2", "en": "Group2"}, "repositoryIds": ["repository2_1", "repository2_2"] } """),
        ('adomain.repository1.repository',
            """{ "identifier": "repository1", "repositoryGroupId": "Group1" }"""),
        ('adomain.repository2.repository',
            """{ "identifier": "repository2", "repositoryGroupId": "Group1" }"""),
        ('adomain.repository2_1.repository',
            """{ "identifier": "repository2_1", "repositoryGroupId": "Group2" }"""),
        ('adomain.repository2_2.repository',
            """{ "identifier": "repository2_2", "repositoryGroupId": "Group2" }"""),
        ('adomain.remi.repository',
            """{ "identifier": "remi", "repositoryGroupId": "NoGroup" }"""),
    ]
    for filename, content in fixtures:
        # 'with' guarantees the file is flushed and closed before
        # HarvesterData reads the directory; the previous
        # open(...).write(...) relied on garbage collection for that.
        with open(join(self.tempdir, filename), 'w') as f:
            f.write(content)
    self.hd = HarvesterData(self.tempdir)
class HarvesterTest(IntegrationTestCase):
    """Integration tests that start the harvester and inspect the requests
    logged by the helper server, the dump directory and the harvester state
    files kept per domain/repository."""

    def setUp(self):
        """Clean log/state/filesystem directories and register REPOSITORY."""
        # Function-scope imports: keeps this block self-contained.
        from os import makedirs
        from os.path import isdir
        from shutil import rmtree

        IntegrationTestCase.setUp(self)
        self.filesystemDir = join(self.integrationTempdir, 'filesystem')
        # rmtree instead of a shell "rm -rf %s": no shell is spawned and paths
        # with spaces or shell metacharacters are handled correctly.
        for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
            rmtree(staleDir, ignore_errors=True)
        self.emptyDumpDir()
        # Equivalent of "mkdir -p".
        domainStateDir = join(self.harvesterStateDir, DOMAIN)
        if not isdir(domainStateDir):
            makedirs(domainStateDir)
        self.harvesterData = HarvesterData(join(self.integrationTempdir, 'data'))
        try:
            self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
        except ValueError:
            # Deliberately best-effort; presumably the group already exists.
            pass
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)

    def tearDown(self):
        """Remove the default repository again, then run the base tearDown."""
        self.removeRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)
        IntegrationTestCase.tearDown(self)

    def saveRepository(self, domain, repositoryId, repositoryGroupId, metadataPrefix="oai_dc", action=None, mappingId='MAPPING', targetId='SRUUPDATE', maximumIgnore=5, complete=False, continuous=None):
        """Add the repository if needed and (over)write its settings.

        The repository always points at the '/oai' path on the helper server
        port and is always marked use=True with no closing hours.
        """
        try:
            self.harvesterData.addRepository(identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId)
        except ValueError:
            # Deliberately best-effort; presumably the repository already exists.
            pass
        self.harvesterData.updateRepository(
            identifier=repositoryId,
            domainId=domain,
            baseurl='http://localhost:%s/oai' % self.helperServerPortNumber,
            set=None,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=None,
            maximumIgnore=maximumIgnore,
            use=True,
            complete=complete,
            continuous=continuous,
            action=action,
            shopclosed=[]
        )

    def removeRepository(self, domain, repositoryId, repositoryGroupId):
        """Delete the given repository from the harvester data."""
        self.harvesterData.deleteRepository(identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId)

    # -- private helpers (names start with '_' so they are not collected as tests)

    @staticmethod
    def _readText(path):
        """Return the full content of path; the file is closed afterwards."""
        with open(path) as f:
            return f.read()

    @staticmethod
    def _readLines(path):
        """Return the lines of path; the file is closed afterwards."""
        with open(path) as f:
            return f.readlines()

    @staticmethod
    def _parseXml(path):
        """Parse path with the file's XML `parse`; the file is closed afterwards."""
        with open(path) as f:
            return parse(f)

    def _readStateLines(self, filename):
        """Return the lines of a file in the state directory of DOMAIN."""
        return self._readLines(join(self.harvesterStateDir, DOMAIN, filename))

    def _lastResumptionToken(self):
        """Return the resumption token found on the last line of the stats file."""
        return getResumptionToken(self._readStateLines('%s.stats' % REPOSITORY)[-1])

    def _getStatusTotal(self):
        """Return the 'total' of the first GetStatus entry for REPOSITORY."""
        header, result = getRequest(self.harvesterInternalServerPortNumber, '/get', {'verb': 'GetStatus', 'domainId': DOMAIN, 'repositoryId': REPOSITORY}, parse=False)
        return JsonDict.loads(result)['response']['GetStatus'][0]['total']

    @staticmethod
    def _recordIds(numbers):
        """Return upload ids '<REPOSITORY>:oai:record:NN' for the given numbers."""
        return ['%s:oai:record:%02d' % (REPOSITORY, i) for i in numbers]

    def _seedHarvesterLog(self, uploaded, deleted, invalid=()):
        """Record one harvest run in a HarvesterLog for REPOSITORY.

        Every id is notified as harvested; `uploaded` ids are then uploaded,
        `deleted` ids deleted and `invalid` ids logged as invalid and ignored.
        The run ends with resumption token 'token' and is closed.
        """
        log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
        log.startRepository()
        for uploadId in uploaded:
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in deleted:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        for uploadId in invalid:
            log.notifyHarvestedRecord(uploadId)
            log.logInvalidData(uploadId, 'ignored message')
            log.logIgnoredIdentifierWarning(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()

    def _dumpedRepositoryIds(self):
        """Return, per dumped request (sorted by filename), the part of its
        first ucp:recordIdentifier before the first ':'."""
        repositoryIds = []
        for filename in sorted(listdir(self.dumpDir)):
            lxml = self._parseXml(join(self.dumpDir, filename))
            repositoryIds.append(xpath(lxml, '//ucp:recordIdentifier/text()')[0].split(':', 1)[0])
        return repositoryIds

    # -- tests

    def testHarvestReturnsErrorWillNotSaveState(self):
        """Two runs with an unknown metadataPrefix both issue the same initial request."""
        logs = self.getLogs()
        self.saveRepository(DOMAIN, "repo_invalid_metadataPrefix", REPOSITORYGROUP, metadataPrefix="not_existing")
        try:
            self.startHarvester(repository="repo_invalid_metadataPrefix")
            self.startHarvester(repository="repo_invalid_metadataPrefix")
            logs = self.getLogs()[len(logs):]
            self.assertEqual(2, len(logs))
            self.assertEqual('/oai', logs[-2]['path'])
            self.assertEqual({'verb':['ListRecords'], 'metadataPrefix':['not_existing']}, logs[0]['arguments'])
            self.assertEqual('/oai', logs[-1]['path'])
            self.assertEqual({'verb':['ListRecords'], 'metadataPrefix':['not_existing']}, logs[1]['arguments'])
        finally:
            self.removeRepository(DOMAIN, 'repo_invalid_metadataPrefix', REPOSITORYGROUP)

    def testHarvestToSruUpdate(self):
        """Harvest in three runs: first batch, resumption-token batch, nothing left."""
        # initial harvest
        oldlogs = self.getLogs()
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        deleteRequests = [
            f for f in listdir(self.dumpDir)
            if "info:srw/action/1/delete" in self._readText(join(self.dumpDir, f))
        ]
        self.assertEqual(2, len(deleteRequests))
        ids = self._readStateLines("%s.ids" % REPOSITORY)
        self.assertEqual(8, len(ids))
        invalidIds = self._readStateLines("%s_invalid.ids" % REPOSITORY)
        self.assertEqual(0, len(invalidIds))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(1, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({'verb':['ListRecords'], 'metadataPrefix':['oai_dc']}, logs[-1]['arguments'])
        token = self._lastResumptionToken()

        # resumptionToken
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(15, self.sizeDumpDir())
        ids = self._readStateLines("%s.ids" % REPOSITORY)
        self.assertEqual(13, len(ids))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(2, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({'verb':['ListRecords'], 'resumptionToken':[token]}, logs[-1]['arguments'])

        # Nothing
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(2, len(logs))
        self.assertEqual(None, self._lastResumptionToken())

    def testContinuousHarvest(self):
        """With continuous=1 the harvester keeps requesting: initial, token, then 'from'."""
        oldlogs = self.getLogs()
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1)
        t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False, repository=REPOSITORY))
        t.start()
        try:
            sleepWheel(5)
            logs = self.getLogs()[len(oldlogs):]
            self.assertTrue(len(logs) > 2, logs)
            self.assertEqual({'path': '/oai', 'arguments': {'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}}, logs[0])
            self.assertTrue('resumptionToken' in logs[1]['arguments'], logs[1])
            self.assertTrue('from' in logs[2]['arguments'], logs[2])
        finally:
            t.join()

    def testIncrementalHarvesting(self):
        """A stats file with finished runs makes the next request carry a 'from' date."""
        oldlogs = self.getLogs()
        statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
        with open(statsFile, 'w') as f:
            f.write('Started: 2011-03-31 13:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-03-31 13:12:36, ResumptionToken: xyz\n')
            f.write('Started: 2011-04-01 14:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-04-01 14:12:36, ResumptionToken:\n')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(1, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({'verb':['ListRecords'], 'metadataPrefix':['oai_dc'], 'from':['2011-03-31']}, logs[-1]['arguments'])

    def testClear(self):
        """Action 'clear' sends delete requests and brings the status total to 0."""
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        self.assertEqual(8, self._getStatusTotal())

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(18, self.sizeDumpDir())
        for filename in sorted(listdir(self.dumpDir))[-8:]:
            self.assertTrue('_delete.updateRequest' in filename, filename)
        self.assertEqual(0, self._getStatusTotal())

    def testRefresh(self):
        """Action 'refresh' re-harvests and finally deletes records no longer seen."""
        oldlogs = self.getLogs()
        self._seedHarvesterLog(
            uploaded=self._recordIds([1,7,120,121]),
            deleted=self._recordIds([4,5,122,123]))

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(0, len(logs))
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()
        self.assertEqual('/oai', logs[-1]["path"])
        self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}, logs[-1]["arguments"])
        token = self._lastResumptionToken()

        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()
        self.assertEqual('/oai', logs[-1]["path"])
        self.assertEqual({'verb': ['ListRecords'], 'resumptionToken': [token]}, logs[-1]["arguments"])
        self.assertEqual(15, self.sizeDumpDir())

        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(17, self.sizeDumpDir())
        deleteFiles = [join(self.dumpDir, f) for f in listdir(self.dumpDir) if '_delete' in f]
        deletedIds = set([xpathFirst(self._parseXml(x), '//ucp:recordIdentifier/text()') for x in deleteFiles])
        self.assertEqual(set(['%s:oai:record:03' % REPOSITORY, '%s:oai:record:06' % REPOSITORY, '%s:oai:record:120' % REPOSITORY, '%s:oai:record:121' % REPOSITORY]), deletedIds)

        logs = self.getLogs()[len(oldlogs):]
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(len(logs), len(self.getLogs()[len(oldlogs):]), 'Action is over, expect nothing more.')

    def testInvalidIgnoredUptoMaxIgnore(self):
        """Invalid records are ignored up to maximumIgnore; a valid rerun clears them."""
        maxIgnore = 5
        self.controlHelper(action='allInvalid')
        nrOfDeleted = 2
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(nrOfDeleted, self.sizeDumpDir())
        ids = self._readStateLines("%s.ids" % REPOSITORY)
        self.assertEqual(0, len(ids))
        invalidIds = self._readStateLines("%s_invalid.ids" % REPOSITORY)
        self.assertEqual(maxIgnore + 1, len(invalidIds), invalidIds)
        invalidDataMessagesDir = join(self.harvesterLogDir, DOMAIN, "invalid", REPOSITORY)
        self.assertEqual(maxIgnore + 1, len(listdir(invalidDataMessagesDir)))
        invalidDataMessage01 = self._readText(join(invalidDataMessagesDir, "oai:record:01"))
        # NOTE(review): assertTrue with a non-empty literal as first argument
        # always passes (the second argument is only the failure message).
        # Probably meant as a containment check; left unchanged because the
        # expected text cannot be verified from here.
        self.assertTrue('uploadId: "integrationtest:oai:record:01"', invalidDataMessage01)
        self.controlHelper(action='noneInvalid')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(nrOfDeleted + BATCHSIZE, self.sizeDumpDir())
        ids = self._readStateLines("%s.ids" % REPOSITORY)
        self.assertEqual(BATCHSIZE - nrOfDeleted, len(ids))
        invalidIds = self._readStateLines("%s_invalid.ids" % REPOSITORY)
        self.assertEqual(0, len(invalidIds), invalidIds)
        self.assertEqual(0, len(listdir(invalidDataMessagesDir)))

    def testHarvestToFilesystemTarget(self):
        """Target FILESYSTEM writes records to disk and lists deletes in 'deleted_records'."""
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(8, len(listdir(join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY))))
        deletedRecords = [line.strip() for line in self._readLines(join(self.filesystemDir, 'deleted_records'))]
        self.assertEqual(self._recordIds([3,6]), deletedRecords)

    def testClearOnFilesystemTarget(self):
        """Action 'clear' on target FILESYSTEM removes all files and lists all ids as deleted."""
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM', action='clear')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(0, len(listdir(join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY))))
        deletedRecords = set([line.strip() for line in self._readLines(join(self.filesystemDir, 'deleted_records'))])
        self.assertEqual(set([
                'harvestertestrepository:oai:record:10',
                'harvestertestrepository:oai:record:09',
                'harvestertestrepository:oai:record:08',
                'harvestertestrepository:oai:record:07',
                'harvestertestrepository:oai:record:06',
                'harvestertestrepository:oai:record:05',
                'harvestertestrepository:oai:record:04',
                'harvestertestrepository:oai:record:03',
                'harvestertestrepository:oai:record:02%2F&gkn',
                'harvestertestrepository:oai:record:01'
            ]), deletedRecords)

    def testHarvestWithError(self):
        """A failing record stops the batch; the next run without failures delivers the rest."""
        self.startHarvester(repository=REPOSITORY)
        self.emptyDumpDir()

        self.controlHelper(action='raiseExceptionOnIds', id=['%s:oai:record:12' % REPOSITORY])
        self.startHarvester(repository=REPOSITORY)
        successFullRecords = ['oai:record:11']
        self.assertEqual(len(successFullRecords), self.sizeDumpDir())
        self.emptyDumpDir()

        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.startHarvester(repository=REPOSITORY)
        secondBatchSize = 5
        self.assertEqual(secondBatchSize, self.sizeDumpDir())

    def testClearWithError(self):
        """A failing delete stops 'clear'; the next run deletes the remaining records."""
        self.startHarvester(repository=REPOSITORY)
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')

        self.controlHelper(action='raiseExceptionOnIds', id=['%s:oai:record:05' % REPOSITORY])
        self.emptyDumpDir()
        self.startHarvester(repository=REPOSITORY)
        successFullDeletes = [1,2,4]
        deletesTodo = [5,7,8,9,10]
        self.assertEqual(len(successFullDeletes), self.sizeDumpDir())

        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.emptyDumpDir()
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(len(deletesTodo), self.sizeDumpDir())

    def testRefreshWithIgnoredRecords(self):
        """'refresh' after a run with uploaded, deleted and ignored records."""
        uploaded = self._recordIds([1,2,120,121])
        # Record 02 is stored under an id containing '/' and '&'.
        uploaded[1] = '%s:oai:record:02/&gkn' % (REPOSITORY)
        self._seedHarvesterLog(
            uploaded=uploaded,
            deleted=self._recordIds([4,5,122,123,124]),
            invalid=self._recordIds([7,8,125,126,127,128]))
        totalRecords = 15
        oldUploads = 2
        oldDeletes = 3
        oldIgnoreds = 4

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

        self.startHarvester(repository=REPOSITORY) # Smooth init
        self.assertEqual(0, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY) # Smooth harvest
        self.startHarvester(repository=REPOSITORY) # Smooth harvest
        self.assertEqual(totalRecords, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY) # Smooth finish
        self.assertEqual(totalRecords + oldUploads + oldIgnoreds, self.sizeDumpDir())
        invalidIds = self._readStateLines("%s_invalid.ids" % REPOSITORY)
        self.assertEqual(0, len(invalidIds), invalidIds)
        ids = self._readStateLines("%s.ids" % REPOSITORY)
        self.assertEqual(13, len(ids), ids)

    def testClearWithInvalidRecords(self):
        """'clear' after a run with uploaded, deleted and invalid records empties both id files."""
        self._seedHarvesterLog(
            uploaded=self._recordIds([1,2,120,121]),
            deleted=self._recordIds([4,5,122,123,124]),
            invalid=self._recordIds([7,8,125,126,127,128]))
        oldUploads = 4
        oldDeletes = 5
        oldInvalids = 6

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')

        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(oldUploads+oldInvalids, self.sizeDumpDir())
        invalidIds = self._readStateLines("%s_invalid.ids" % REPOSITORY)
        self.assertEqual(0, len(invalidIds), invalidIds)
        ids = self._readStateLines("%s.ids" % REPOSITORY)
        self.assertEqual(0, len(ids), ids)

    def testConcurrentHarvestToSruUpdate(self):
        """With concurrency=3 the records of the repositories arrive interleaved."""
        self.startHarvester(concurrency=3)

        repositoryIds = self._dumpedRepositoryIds()
        repositoryIdsSet = set(repositoryIds)
        self.assertEqual(set(['repository2', 'integrationtest', 'harvestertestrepository']), repositoryIdsSet)

        # Every switch to another repository removes that repository from the
        # set; a KeyError therefore means a repository showed up a second
        # time after another one, i.e. the records are interleaved.
        lastSeenRepoId = None
        try:
            for repo in repositoryIds:
                if repo != lastSeenRepoId:
                    repositoryIdsSet.remove(repo)
                    lastSeenRepoId = repo
        except KeyError:
            pass
        else:
            self.fail('Records should have been inserted out-of-order.')

    def testConcurrentHarvestToSruUpdateBUG(self):
        """With complete=True and concurrency=1 every repository is harvested fully."""
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)

        self.startHarvester(concurrency=1)

        repositoryIds = self._dumpedRepositoryIds()
        self.assertEqual(15, repositoryIds.count(REPOSITORY))
        self.assertEqual(10, repositoryIds.count('repository2'))
        self.assertEqual(10, repositoryIds.count('integrationtest'))

    def testStartHarvestingAddedRepository(self):
        """A repository added while the harvester runs shows up in its output."""
        t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False))
        t.start()

        # Wait until the running harvester has produced its first output.
        while not listdir(self.dumpDir):
            sleep(0.1)

        self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        stdoutfile = join(self.integrationTempdir, "stdouterr-meresco-harvester-harvester.log")
        sleepWheel(5)
        log = self._readText(stdoutfile)
        try:
            self.assertTrue('xyz' in log, log)
        finally:
            self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
            t.join()

    def testDontHarvestDeletedRepository(self):
        """A repository removed while the harvester runs is no longer mentioned, without errors."""
        stdoutfile = join(self.integrationTempdir, "stdouterr-meresco-harvester-harvester.log")
        self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False))
        t.start()

        # Wait until the running harvester has produced its first output.
        while not listdir(self.dumpDir):
            sleep(0.1)
        sleepWheel(1)
        log = self._readText(stdoutfile)
        xyzOccurrences = log.count('[xyz]')

        self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        sleepWheel(5)
        log = self._readText(stdoutfile)
        try:
            self.assertFalse('Traceback' in log, log)
            newXyzOccurrences = log.count('[xyz]')
            self.assertEqual(xyzOccurrences, newXyzOccurrences, "%s!=%s\n%s" % (xyzOccurrences, newXyzOccurrences, log))
        finally:
            t.join()

    def testConcurrencyAtLeastOne(self):
        """Concurrency 0 or negative is rejected with return code 2."""
        stdouterrlog = self.startHarvester(concurrency=0, expectedReturnCode=2)
        self.assertTrue("Concurrency must be at least 1" in stdouterrlog, stdouterrlog)

        stdouterrlog = self.startHarvester(concurrency=-1, expectedReturnCode=2)
        self.assertTrue("Concurrency must be at least 1" in stdouterrlog, stdouterrlog)

    def testCompleteInOnAttempt(self):
        """With complete=True a single run delivers all 15 requests."""
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)
        stdouterrlog = self.startHarvester(repository=REPOSITORY, runOnce=True, timeoutInSeconds=5)
        self.assertEqual(15, self.sizeDumpDir())
        self.assertTrue("Repository will be completed in one attempt" in stdouterrlog, stdouterrlog)

    def testHarvestingContinues4Ever(self):
        """With runOnce=False the harvester runs until the timeout stops it."""
        try:
            self.startHarvester(repository=REPOSITORY, runOnce=False, timeoutInSeconds=5)
        except SystemExit as e:
            # 'as' form: valid on Python 2.6+ and Python 3 alike.
            self.assertTrue('took more than 5 seconds' in str(e), str(e))
        self.assertEqual(15, self.sizeDumpDir())
class HarvesterDataActionsTest(SeecrTestCase):
    """Tests for HarvesterDataActions.

    Two setups are used: self.hda is observed by a real HarvesterData store
    (self.hd), while self.dna wires a HarvesterDataActions to a CallTrace
    (self.observable) so the forwarded call and its kwargs can be inspected.
    """

    def setUp(self):
        """Create the store (domain/group/repository plus three field
        definitions) and both action component trees."""
        SeecrTestCase.setUp(self)
        self.hd = HarvesterData(self.tempdir)
        self.hd.addDomain('domain')
        self.hd.addRepositoryGroup('group', domainId='domain')
        self.hd.addRepository('repository', repositoryGroupId='group', domainId='domain')
        self.hd.updateFieldDefinition(
            'domain',
            {
                'repository_fields': [
                    {'name': 'name', 'label': 'Label', 'type': 'text', 'export': False},
                    {'name': 'choice_1', 'label': 'Keuze', 'type': 'bool', 'export': False},
                    {'name': 'choice_2', 'label': 'Keuze', 'type': 'bool', 'export': False},
                ]
            })
        self.hda = HarvesterDataActions()
        self.hda.addObserver(self.hd)
        self.observable = CallTrace()
        self.dna = be(
            (Observable(),
                (HarvesterDataActions(),
                    (self.observable, ))))

    # -- private helpers (names start with '_' so they are not collected as tests)

    def _postAsAdmin(self, path, fields):
        """POST the url-encoded fields to path as an admin user via self.dna.

        Returns the (header, body) pair produced by parseResponse.
        """
        return parseResponse(
            asBytes(
                self.dna.all.handleRequest(
                    user=CallTrace(returnValues=dict(isAdmin=True)),
                    Method='POST',
                    path=path,
                    Body=bUrlencode(fields, doseq=True))))

    def _assertSingleSuccessfulCall(self, header, body, methodName, kwargs, callIndex=0):
        """Assert a 200 / success=True response and that call number
        callIndex (the last one so far) on the observable is methodName
        with exactly the given kwargs."""
        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        self.assertEqual(callIndex + 1, len(self.observable.calledMethods))
        self.assertEqual(methodName, self.observable.calledMethods[callIndex].name)
        self.assertEqual(kwargs, self.observable.calledMethods[callIndex].kwargs)

    # -- tests

    def testAddDomain(self):
        """addDomain is refused for non-admins and forwarded for admins."""
        header, body = parseResponse(
            asBytes(
                self.dna.all.handleRequest(
                    user=CallTrace(returnValues=dict(isAdmin=False)),
                    path="/actions/addDomain",
                    Body=bytes(urlencode(dict(identifier="aap")), encoding="utf-8"),
                    Method='Post')))
        self.assertEqual(0, len(self.observable.calledMethods))
        self.assertEqual("200", header['StatusCode'])
        self.assertEqual("application/json", header['Headers']['Content-Type'])
        response = JsonDict.loads(body)
        self.assertFalse(response['success'])
        self.assertEqual("Not allowed", response['message'])

        header, body = parseResponse(
            asBytes(
                self.dna.all.handleRequest(
                    user=CallTrace(returnValues=dict(isAdmin=True)),
                    path="/actions/addDomain",
                    Body=bytes(urlencode(dict(identifier="aap")), encoding="utf-8"),
                    Method='Post')))
        self.assertEqual("200", header['StatusCode'])
        self.assertEqual("application/json", header['Headers']['Content-Type'])
        response = JsonDict.loads(body)
        self.assertTrue(response['success'])
        self.assertEqual(1, len(self.observable.calledMethods))
        self.assertEqual("addDomain", self.observable.calledMethods[0].name)
        self.assertEqual(dict(identifier='aap'), self.observable.calledMethods[0].kwargs)

    def testSetRepositoryDone(self):
        """repositoryDone resets the repository's action to None (real store)."""
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', repository['action'])

        data = dict(domainId='domain', identifier='repository')
        consume(
            self.hda.handleRequest(
                Method='POST',
                path='/somewhere/repositoryDone',
                Body=bUrlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def testUpdateRepositoryGroup(self):
        """nl_name / en_name form fields are combined into a 'name' dict."""
        header, body = self._postAsAdmin(
            '/somewhere/updateRepositoryGroup',
            dict(
                identifier='group',
                domainId='domain',
                nl_name="De nieuwe naam",
                en_name="The old name",
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryGroup',
            {
                'identifier': 'group',
                'domainId': 'domain',
                'name': {'nl': 'De nieuwe naam', 'en': 'The old name'}
            })

    def testCreateRepository(self):
        """addRepository forwards identifier, domainId and repositoryGroupId."""
        header, body = self._postAsAdmin(
            '/actions/addRepository',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                repositoryGroupId='repositoryGroupId',
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'addRepository',
            {
                'domainId': 'domain-id',
                'identifier': 'repo-id',
                'repositoryGroupId': 'repositoryGroupId'
            })

    def testDeleteRepository(self):
        """deleteRepository forwards identifier, domainId and repositoryGroupId."""
        header, body = self._postAsAdmin(
            '/actions/deleteRepository',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                repositoryGroupId='repositoryGroupId',
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'deleteRepository',
            {
                'domainId': 'domain-id',
                'identifier': 'repo-id',
                'repositoryGroupId': 'repositoryGroupId'
            })

    def testUpdateRepositoryAttributes(self):
        """Missing attributes are forwarded as None; an empty userAgent too."""
        header, body = self._postAsAdmin(
            '/actions/updateRepositoryAttributes',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                userAgent="Herman in de zon op een terras",
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryAttributes',
            {
                'identifier': 'repo-id',
                'domainId': 'domain-id',
                'baseurl': None,
                'set': None,
                'metadataPrefix': None,
                'userAgent': 'Herman in de zon op een terras',
                'collection': None,
                'authorizationKey': None,
                'mappingId': None,
                'targetId': None
            })

        header, body = self._postAsAdmin(
            '/actions/updateRepositoryAttributes',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                userAgent="",
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryAttributes',
            {
                'identifier': 'repo-id',
                'domainId': 'domain-id',
                'baseurl': None,
                'set': None,
                'metadataPrefix': None,
                'userAgent': None,
                'collection': None,
                'authorizationKey': None,
                'mappingId': None,
                'targetId': None
            },
            callIndex=1)

    def testUpdateRepositoryActionForm(self):
        """maximumIgnore is forwarded as posted; other action fields get defaults."""
        header, body = self._postAsAdmin(
            '/actions/updateRepositoryActionAttributes',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                maximumIgnore="42",
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryAttributes',
            {
                'complete': False,
                'continuous': None,
                'domainId': 'domain-id',
                'identifier': 'repo-id',
                'maximumIgnore': '42',
                'action': None,
                'use': False
            })

    def testUpdateRepositoryActionForm_booleanFields(self):
        """A posted complete="on" is forwarded as complete=True."""
        header, body = self._postAsAdmin(
            '/actions/updateRepositoryActionAttributes',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                complete="on",
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryAttributes',
            {
                'complete': True,
                'continuous': None,
                'domainId': 'domain-id',
                'identifier': 'repo-id',
                'maximumIgnore': 0,
                'action': None,
                'use': False
            })

    def testUpdateRepositoryActionForm_Action(self):
        """A posted action="-" is forwarded as action=None."""
        header, body = self._postAsAdmin(
            '/actions/updateRepositoryActionAttributes',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                action="-",
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryAttributes',
            {
                'complete': False,
                'continuous': None,
                'domainId': 'domain-id',
                'identifier': 'repo-id',
                'maximumIgnore': 0,
                'action': None,
                'use': False
            })

    def testAddClosingHours(self):
        """addRepositoryClosingHours maps repositoryId to identifier."""
        header, body = self._postAsAdmin(
            '/actions/addRepositoryClosingHours',
            dict(repositoryId='repo-id', domainId='domain-id', week="*", day="1", startHour="10", endHour="14"))
        self._assertSingleSuccessfulCall(
            header, body, 'addClosingHours',
            {
                'day': '1',
                'domainId': 'domain-id',
                'endHour': '14',
                'identifier': 'repo-id',
                'startHour': '10',
                'week': '*'
            })

    def testDeleteClosingHours(self):
        """deleteRepositoryClosingHours maps closingHour to closingHoursIndex."""
        header, body = self._postAsAdmin(
            '/actions/deleteRepositoryClosingHours',
            dict(repositoryId='repo-id', domainId='domain-id', closingHour="0"))
        self._assertSingleSuccessfulCall(
            header, body, 'deleteClosingHours',
            {
                'domainId': 'domain-id',
                'identifier': 'repo-id',
                'closingHoursIndex': '0'
            })

    def testUpdateFieldDefinition(self):
        """The posted fieldDefinition JSON is forwarded parsed, as 'data'."""
        header, body = self._postAsAdmin(
            '/actions/updateFieldDefinition',
            dict(
                domainId='domain-id',
                fieldDefinition='{"is":"json"}',
            ))
        self._assertSingleSuccessfulCall(
            header, body, 'updateFieldDefinition',
            {
                'domainId': 'domain-id',
                'data': {'is': 'json'},
            })

    def testUpdateFieldDefinition_error(self):
        """Invalid JSON yields success=False and nothing is forwarded."""
        header, body = self._postAsAdmin(
            '/actions/updateFieldDefinition',
            dict(
                domainId='domain-id',
                fieldDefinition='{"is no json"}',
            ))
        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=False, message='Ongeldige JSON'), JsonDict.loads(body))
        self.assertEqual(0, len(self.observable.calledMethods))

    def testUpdateRepositoryFieldDefinition(self):
        """extra_* form fields are forwarded unchanged, known or not."""
        header, body = self._postAsAdmin(
            '/actions/updateRepositoryFieldDefinitions',
            dict(
                identifier='repo-id',
                domainId='domain-id',
                extra_name="Herman in de zon op een terras",
                extra_no_such_field="Bestaat niet"))
        self._assertSingleSuccessfulCall(
            header, body, 'updateRepositoryFieldDefinitions',
            {
                'identifier': 'repo-id',
                'domainId': 'domain-id',
                'extra_no_such_field': 'Bestaat niet',
                'extra_name': "Herman in de zon op een terras"
            })

    def updateTheRepository(self, baseurl='', set='', metadataPrefix='', mappingId='', targetId='', collection='', maximumIgnore=0, use=False, continuous=False, complete=True, action='', shopclosed=None):
        """Update 'repository' in 'domain' directly on the store.

        userAgent and authorizationKey are always set to ''. The shopclosed
        parameter is accepted but not passed on.
        """
        self.hd.updateRepositoryAttributes(
            identifier='repository',
            domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            userAgent='',
            authorizationKey='',
        )
class HarvesterDataActionsTest(SeecrTestCase):
    """Tests for HarvesterDataActions observing a HarvesterData store in self.tempdir.

    Every test posts a urlencoded form to the actions component and then reads
    the repository back through HarvesterData to check what was stored.
    """

    def setUp(self):
        """Create domain/group/repository fixtures and attach the store as observer."""
        SeecrTestCase.setUp(self)
        self.hd = HarvesterData(self.tempdir)
        self.hd.addDomain('domain')
        self.hd.addRepositoryGroup('group', domainId='domain')
        self.hd.addRepository('repository', repositoryGroupId='group', domainId='domain')
        self.hda = HarvesterDataActions()
        self.hda.addObserver(self.hd)

    def _postForm(self, path, data):
        """POST *data* as a urlencoded form body to *path* and drain the response."""
        consume(
            self.hda.handleRequest(
                Method='POST',
                path=path,
                Body=urlencode(data, doseq=True)))

    def testUpdateRepository(self):
        """A fully filled-in form is stored with converted values."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "baseurl": "http://example.org/oai",
            "set": "ASET",
            "metadataPrefix": "oai_dc",
            "mappingId": "mapping_identifier",
            "targetId": "",
            "collection": "the collection",
            "maximumIgnore": "23",
            "complete": "1",
            "continuous": "60",
            "repositoryAction": "clear",
            "numberOfTimeslots": "0"
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        # The posted repositoryGroupId ("ignored") must not replace the stored group.
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual("http://example.org/oai", repository["baseurl"])
        self.assertEqual("ASET", repository["set"])
        self.assertEqual("oai_dc", repository["metadataPrefix"])
        self.assertEqual("mapping_identifier", repository["mappingId"])
        # An empty form value is expected to be stored as None.
        self.assertEqual(None, repository["targetId"])
        self.assertEqual("the collection", repository["collection"])
        self.assertEqual(23, repository["maximumIgnore"])
        self.assertEqual(True, repository["complete"])
        self.assertEqual(60, repository["continuous"])
        self.assertEqual(False, repository["use"])
        self.assertEqual("clear", repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testMinimalInfo(self):
        """A form with only the identifying fields yields the expected defaults."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual(None, repository["baseurl"])
        self.assertEqual(None, repository["set"])
        self.assertEqual(None, repository["metadataPrefix"])
        self.assertEqual(None, repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual(None, repository["collection"])
        self.assertEqual(0, repository["maximumIgnore"])
        self.assertEqual(None, repository["continuous"])
        self.assertEqual(False, repository["complete"])
        self.assertEqual(False, repository["use"])
        self.assertEqual(None, repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedButNotAdded(self):
        """Timeslot fields posted without 'addTimeslot' leave shopclosed empty."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedAdded(self):
        """The same timeslot fields plus 'addTimeslot' store one shopclosed entry."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            "addTimeslot": "button pressed",
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['*:*:7:0-*:*:9:0'], repository['shopclosed'])

    def testModifyShopClosed(self):
        """With numberOfTimeslots=2 the stored list becomes the values of slots 1 and 2."""
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([
            '3:*:17:0-3:*:19:0',
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testDeleteShopClosed(self):
        """Posting deleteTimeslot_1.x/.y drops slot 1; only slot 2 remains stored."""
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
            # Image-button style coordinates identify the slot to delete.
            'deleteTimeslot_1.x': '10',
            'deleteTimeslot_1.y': '20',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testSetRepositoryDone(self):
        """The repositoryDone action resets a pending repository action to None."""
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', repository['action'])

        data = dict(domainId='domain', identifier='repository')
        self._postForm('/somewhere/repositoryDone', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def updateTheRepository(self, baseurl='', set='', metadataPrefix='', mappingId='', targetId='', collection='', maximumIgnore=0, use=False, continuous=False, complete=True, action='', shopclosed=None):
        """Update the fixture repository directly on the store, bypassing the form handler.

        All arguments are forwarded to HarvesterData.updateRepository for the
        repository 'repository' in domain 'domain'; a missing *shopclosed* is
        passed on as an empty list.
        """
        self.hd.updateRepository(
            'repository',
            domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            shopclosed=shopclosed or [])
class HarvesterDataTest(SeecrTestCase):
    """Read-side tests of HarvesterData over JSON fixture files written into self.tempdir."""

    def _writeFixture(self, filename, content):
        """Write *content* to *filename* inside self.tempdir and close the file."""
        with open(join(self.tempdir, filename), 'w') as f:
            f.write(content)

    def setUp(self):
        """Write one domain, two repository groups and five repositories, then open the store."""
        SeecrTestCase.setUp(self)
        self._writeFixture('adomain.domain', """{ "identifier": "adomain", "mappingIds": ["ignored MAPPING"], "targetIds": ["ignored TARGET"], "repositoryGroupIds": ["Group1", "Group2"] }""")
        self._writeFixture('adomain.Group1.repositoryGroup', """{ "identifier": "Group1", "name": {"nl": "Groep1", "en": "Group1"}, "repositoryIds": ["repository1", "repository2"] }""")
        self._writeFixture('adomain.Group2.repositoryGroup', """{ "identifier": "Group2", "name": {"nl": "Groep2", "en": "Group2"}, "repositoryIds": ["repository2_1", "repository2_2"] } """)
        self._writeFixture('adomain.repository1.repository', """{ "identifier": "repository1", "repositoryGroupId": "Group1" }""")
        self._writeFixture('adomain.repository2.repository', """{ "identifier": "repository2", "repositoryGroupId": "Group1" }""")
        self._writeFixture('adomain.repository2_1.repository', """{ "identifier": "repository2_1", "repositoryGroupId": "Group2" }""")
        self._writeFixture('adomain.repository2_2.repository', """{ "identifier": "repository2_2", "repositoryGroupId": "Group2" }""")
        # 'remi' refers to a group the domain does not list; the tests below
        # expect it to be absent from every listing.
        self._writeFixture('adomain.remi.repository', """{ "identifier": "remi", "repositoryGroupId": "NoGroup" }""")
        self.hd = HarvesterData(self.tempdir)

    def testGetRepositoryGroupIds(self):
        """Group ids are returned as listed in the domain file."""
        self.assertEqual(["Group1", "Group2"], self.hd.getRepositoryGroupIds(domainId="adomain"))

    def testGetRepositoryIds(self):
        """Repository ids can be listed for one group or for the whole domain."""
        self.assertEqual(
            ["repository1", "repository2"],
            self.hd.getRepositoryIds(domainId="adomain", repositoryGroupId="Group1"))
        self.assertEqual(
            ["repository1", "repository2", "repository2_1", "repository2_2"],
            self.hd.getRepositoryIds(domainId="adomain"))

    def testGetRepositoryGroupId(self):
        """The group of a repository is looked up by repository id."""
        self.assertEqual(
            "Group1",
            self.hd.getRepositoryGroupId(domainId="adomain", repositoryId="repository1"))

    def testGetRepositoryGroup(self):
        """A repository group is returned as the parsed content of its file."""
        self.assertEqual({
            'identifier': 'Group1',
            'name': {'en': 'Group1', 'nl': 'Groep1'},
            'repositoryIds': ['repository1', 'repository2']
        }, self.hd.getRepositoryGroup(identifier='Group1', domainId='adomain'))

    def testGetRepositories(self):
        """All repositories reachable through the domain's groups are returned, in group order."""
        result = self.hd.getRepositories(domainId='adomain')
        self.assertEqualsWS("""[ { "identifier": "repository1", "repositoryGroupId": "Group1" }, { "identifier": "repository2", "repositoryGroupId": "Group1" }, { "identifier": "repository2_1", "repositoryGroupId": "Group2" }, { "identifier": "repository2_2", "repositoryGroupId": "Group2" } ]""", result.dumps())

    def testGetRepositoriesWithError(self):
        """Unknown group or domain ids raise ValueError('idDoesNotExist')."""
        try:
            self.hd.getRepositories(domainId='adomain', repositoryGroupId='doesnotexist')
        except ValueError as e:
            self.assertEqual('idDoesNotExist', str(e))
        else:
            self.fail('ValueError expected for unknown repositoryGroupId')

        try:
            self.hd.getRepositories(domainId='baddomain')
        except ValueError as e:
            self.assertEqual('idDoesNotExist', str(e))
        else:
            self.fail('ValueError expected for unknown domainId')
class HarvesterDataActionsTest(SeecrTestCase):
    """Tests for HarvesterDataActions observing a HarvesterData store in self.tempdir.

    Every test posts a urlencoded form to the actions component and then reads
    the repository back through HarvesterData to check what was stored.
    """

    def setUp(self):
        """Create domain/group/repository fixtures and attach the store as observer."""
        SeecrTestCase.setUp(self)
        self.hd = HarvesterData(self.tempdir)
        self.hd.addDomain('domain')
        self.hd.addRepositoryGroup('group', domainId='domain')
        self.hd.addRepository('repository', repositoryGroupId='group', domainId='domain')
        self.hda = HarvesterDataActions()
        self.hda.addObserver(self.hd)

    def _postForm(self, path, data):
        """POST *data* as a urlencoded form body to *path* and drain the response."""
        consume(
            self.hda.handleRequest(
                Method='POST',
                path=path,
                Body=urlencode(data, doseq=True)))

    def testUpdateRepository(self):
        """A fully filled-in form is stored with converted values."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "baseurl": "http://example.org/oai",
            "set": "ASET",
            "metadataPrefix": "oai_dc",
            "mappingId": "mapping_identifier",
            "targetId": "",
            "collection": "the collection",
            "maximumIgnore": "23",
            "complete": "1",
            "continuous": "60",
            "repositoryAction": "clear",
            "numberOfTimeslots": "0"
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        # The posted repositoryGroupId ("ignored") must not replace the stored group.
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual("http://example.org/oai", repository["baseurl"])
        self.assertEqual("ASET", repository["set"])
        self.assertEqual("oai_dc", repository["metadataPrefix"])
        self.assertEqual("mapping_identifier", repository["mappingId"])
        # An empty form value is expected to be stored as None.
        self.assertEqual(None, repository["targetId"])
        self.assertEqual("the collection", repository["collection"])
        self.assertEqual(23, repository["maximumIgnore"])
        self.assertEqual(True, repository["complete"])
        self.assertEqual(60, repository["continuous"])
        self.assertEqual(False, repository["use"])
        self.assertEqual("clear", repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testMinimalInfo(self):
        """A form with only the identifying fields yields the expected defaults."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual(None, repository["baseurl"])
        self.assertEqual(None, repository["set"])
        self.assertEqual(None, repository["metadataPrefix"])
        self.assertEqual(None, repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual(None, repository["collection"])
        self.assertEqual(0, repository["maximumIgnore"])
        self.assertEqual(None, repository["continuous"])
        self.assertEqual(False, repository["complete"])
        self.assertEqual(False, repository["use"])
        self.assertEqual(None, repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedButNotAdded(self):
        """Timeslot fields posted without 'addTimeslot' leave shopclosed empty."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedAdded(self):
        """The same timeslot fields plus 'addTimeslot' store one shopclosed entry."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            "addTimeslot": "button pressed",
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['*:*:7:0-*:*:9:0'], repository['shopclosed'])

    def testModifyShopClosed(self):
        """With numberOfTimeslots=2 the stored list becomes the values of slots 1 and 2."""
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([
            '3:*:17:0-3:*:19:0',
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testDeleteShopClosed(self):
        """Posting deleteTimeslot_1.x/.y drops slot 1; only slot 2 remains stored."""
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
            # Image-button style coordinates identify the slot to delete.
            'deleteTimeslot_1.x': '10',
            'deleteTimeslot_1.y': '20',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testSetRepositoryDone(self):
        """The repositoryDone action resets a pending repository action to None."""
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', repository['action'])

        data = dict(domainId='domain', identifier='repository')
        self._postForm('/somewhere/repositoryDone', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def updateTheRepository(self, baseurl='', set='', metadataPrefix='', mappingId='', targetId='', collection='', maximumIgnore=0, use=False, continuous=False, complete=True, action='', shopclosed=None):
        """Update the fixture repository directly on the store, bypassing the form handler.

        All arguments are forwarded to HarvesterData.updateRepository for the
        repository 'repository' in domain 'domain'; a missing *shopclosed* is
        passed on as an empty list.
        """
        self.hd.updateRepository(
            'repository',
            domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            shopclosed=shopclosed or [])
def createHarvesterData(self, id_fn):
    """Return a HarvesterData on self.tempdir backed by an OldDataStore.

    The same *id_fn* is handed to both the datastore and the HarvesterData.
    """
    directory = self.tempdir
    legacyStore = OldDataStore(directory, id_fn=id_fn)
    return HarvesterData(directory, id_fn=id_fn, datastore=legacyStore)
class HarvesterTest(IntegrationTestCase): def setUp(self): IntegrationTestCase.setUp(self) system("rm -rf %s" % self.harvesterLogDir) system("rm -rf %s" % self.harvesterStateDir) self.filesystemDir = join(self.integrationTempdir, 'filesystem') system("rm -rf %s" % self.filesystemDir) self.controlHelper(action='reset') self.emptyDumpDir() self.domainStatePath = pathlib.Path(self.harvesterStateDir) / DOMAIN self.domainLogPath = pathlib.Path(self.harvesterLogDir) / DOMAIN self.domainStatePath.mkdir(parents=True) self.domainLogPath.mkdir(parents=True) self.harvesterData = HarvesterData( join(self.integrationTempdir, 'data')) try: self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN) except ValueError: pass self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP) def tearDown(self): self.removeRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP) IntegrationTestCase.tearDown(self) def saveRepository(self, domain, repositoryId, repositoryGroupId, metadataPrefix="oai_dc", action=None, mappingId='MAPPING', targetId='SRUUPDATE', maximumIgnore=5, complete=False, continuous=None, baseUrl=None): baseUrl = baseUrl if baseUrl else 'http://localhost:%s/oai' % self.helperServerPortNumber try: self.harvesterData.addRepository( identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId) except ValueError: pass self.harvesterData.updateRepositoryAttributes( identifier=repositoryId, domainId=domain, baseurl=baseUrl, set=None, metadataPrefix=metadataPrefix, mappingId=mappingId, targetId=targetId, collection=None, maximumIgnore=maximumIgnore, use=True, complete=complete, continuous=continuous, action=action, userAgent='', authorizationKey='', ) def removeRepository(self, domain, repositoryId, repositoryGroupId): self.harvesterData.deleteRepository( identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId) def testHarvestReturnsErrorWillNotSaveState(self): logs = self.getLogs() self.saveRepository(DOMAIN, 
"repo_invalid_metadataPrefix", REPOSITORYGROUP, metadataPrefix="not_existing") try: self.startHarvester(repository="repo_invalid_metadataPrefix") self.startHarvester(repository="repo_invalid_metadataPrefix") logs = self.getLogs()[len(logs):] self.assertEqual(2, len(logs)) self.assertEqual('/oai', logs[-2]['path']) self.assertEqual( { 'verb': ['ListRecords'], 'metadataPrefix': ['not_existing'] }, logs[0]['arguments']) self.assertEqual('/oai', logs[-1]['path']) self.assertEqual( { 'verb': ['ListRecords'], 'metadataPrefix': ['not_existing'] }, logs[1]['arguments']) finally: self.removeRepository(DOMAIN, 'repo_invalid_metadataPrefix', REPOSITORYGROUP) def get_ids(self, ids_name, repository=REPOSITORY): state = State(self.domainStatePath, self.domainLogPath, repository) try: return getattr(state, ids_name) finally: state.close() def testHarvestToSruUpdate(self): # initial harvest oldlogs = self.getLogs() self.startHarvester(repository=REPOSITORY) self.assertEqual(BATCHSIZE, self.sizeDumpDir()) self.assertEqual( 2, len([ f for f in listdir(self.dumpDir) if "info:srw/action/1/delete" in open(join(self.dumpDir, f)).read() ])) ids = self.get_ids('ids') self.assertEqual(8, len(ids)) invalidIds = self.get_ids('invalidIds') self.assertEqual(0, len(invalidIds)) logs = self.getLogs()[len(oldlogs):] self.assertEqual(1, len(logs)) self.assertEqual('/oai', logs[-1]['path']) self.assertEqual( { 'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc'] }, logs[-1]['arguments']) statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY) token = getResumptionToken(open(statsFile).readlines()[-1]) # resumptionToken self.startHarvester(repository=REPOSITORY) self.assertEqual(15, self.sizeDumpDir()) ids = self.get_ids('ids') self.assertEqual(13, len(ids)) logs = self.getLogs()[len(oldlogs):] self.assertEqual(2, len(logs)) self.assertEqual('/oai', logs[-1]['path']) self.assertEqual({ 'verb': ['ListRecords'], 'resumptionToken': [token] }, logs[-1]['arguments']) # Nothing output = 
self.startHarvester(repository=REPOSITORY) self.assertEqual('Nothing to do!', what_happened(output)) logs = self.getLogs()[len(oldlogs):] self.assertEqual(2, len(logs)) self.assertEqual(None, getResumptionToken(open(statsFile).readlines()[-1])) def testContinuousHarvest(self): oldlogs = self.getLogs() self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1) t = Thread(target=lambda: self.startHarvester( concurrency=1, runOnce=False, repository=REPOSITORY)) t.start() try: sleepWheel(5) logs = self.getLogs()[len(oldlogs):] self.assertTrue(len(logs) > 2, logs) self.assertEqual( { 'path': '/oai', 'arguments': { 'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc'] } }, logs[0]) self.assertTrue('resumptionToken' in logs[1]['arguments'], logs[1]) self.assertTrue('from' in logs[2]['arguments'], logs[2]) finally: t.join() def testIncrementalHarvesting(self): oldlogs = self.getLogs() statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY) with open(statsFile, 'w') as f: f.write( 'Started: 2011-03-31 13:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-03-31 13:12:36, ResumptionToken: xyz\n' ) f.write( 'Started: 2011-04-01 14:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-04-01 14:12:36, ResumptionToken:\n' ) self.startHarvester(repository=REPOSITORY) self.assertEqual(BATCHSIZE, self.sizeDumpDir()) logs = self.getLogs()[len(oldlogs):] self.assertEqual(1, len(logs)) self.assertEqual('/oai', logs[-1]['path']) self.assertEqual( { 'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc'], 'from': ['2011-03-31'] }, logs[-1]['arguments']) def testClear(self): self.startHarvester(repository=REPOSITORY) self.assertEqual(BATCHSIZE, self.sizeDumpDir()) header, data = getRequest(self.harvesterInternalServerPortNumber, '/get', { 'verb': 'GetStatus', 'domainId': DOMAIN, 'repositoryId': REPOSITORY }) self.assertEqual(8, data['response']['GetStatus'][0]['total']) self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, 
action='clear') self.startHarvester(repository=REPOSITORY) self.assertEqual(18, self.sizeDumpDir()) for filename in sorted(listdir(self.dumpDir))[-8:]: self.assertTrue('_delete.updateRequest' in filename, filename) header, data = getRequest(self.harvesterInternalServerPortNumber, '/get', { 'verb': 'GetStatus', 'domainId': DOMAIN, 'repositoryId': REPOSITORY }) self.assertEqual(0, data['response']['GetStatus'][0]['total']) def testRefresh(self): oldlogs = self.getLogs() log = State(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY).getHarvesterLog() log.startRepository() for uploadId in [ '%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 7, 120, 121] ]: log.notifyHarvestedRecord(uploadId) log.uploadIdentifier(uploadId) for uploadId in [ '%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123] ]: log.notifyHarvestedRecord(uploadId) log.deleteIdentifier(uploadId) log.endRepository('token', '2012-01-01T09:00:00Z') log.close() self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh') self.startHarvester(repository=REPOSITORY) logs = self.getLogs()[len(oldlogs):] self.assertEqual(0, len(logs)) self.startHarvester(repository=REPOSITORY) logs = self.getLogs() self.assertEqual('/oai', logs[-1]["path"]) self.assertEqual( { 'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc'] }, logs[-1]["arguments"]) statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY) token = getResumptionToken(open(statsFile).readlines()[-1]) self.startHarvester(repository=REPOSITORY) logs = self.getLogs() self.assertEqual('/oai', logs[-1]["path"]) self.assertEqual({ 'verb': ['ListRecords'], 'resumptionToken': [token] }, logs[-1]["arguments"]) self.assertEqual(15, self.sizeDumpDir()) self.startHarvester(repository=REPOSITORY) self.assertEqual(17, self.sizeDumpDir()) deleteFiles = [ join(self.dumpDir, f) for f in listdir(self.dumpDir) if '_delete' in f ] deletedIds = set([ xpathFirst(parse(open(x)), 
'//ucp:recordIdentifier/text()') for x in deleteFiles ]) self.assertEqual( set([ '%s:oai:record:03' % REPOSITORY, '%s:oai:record:06' % REPOSITORY, '%s:oai:record:120' % REPOSITORY, '%s:oai:record:121' % REPOSITORY ]), deletedIds) logs = self.getLogs()[len(oldlogs):] self.startHarvester(repository=REPOSITORY) self.assertEqual(len(logs), len(self.getLogs()[len(oldlogs):]), 'Action is over, expect nothing more.') def testInvalidIgnoredUptoMaxIgnore(self): maxIgnore = 5 self.controlHelper(action='allInvalid') nrOfDeleted = 2 self.startHarvester(repository=REPOSITORY) self.assertEqual(nrOfDeleted, self.sizeDumpDir()) self.assertEqual(0, len(self.get_ids('ids'))) invalidIds = self.get_ids('invalidIds') self.assertEqual(maxIgnore + 1, len(invalidIds), invalidIds) invalidDataMessagesDir = join(self.harvesterLogDir, DOMAIN, "invalid", REPOSITORY) self.assertEqual(maxIgnore + 1, len(listdir(invalidDataMessagesDir))) invalidDataMessage01 = open( join(invalidDataMessagesDir, "oai:record:01")).read() self.assertTrue('uploadId: "integrationtest:oai:record:01"', invalidDataMessage01) self.controlHelper(action='noneInvalid') self.startHarvester(repository=REPOSITORY) self.assertEqual(nrOfDeleted + BATCHSIZE, self.sizeDumpDir()) ids = self.get_ids('ids') self.assertEqual(BATCHSIZE - nrOfDeleted, len(ids)) invalidIds = self.get_ids('invalidIds') self.assertEqual(0, len(invalidIds), invalidIds) self.assertEqual(0, len(listdir(invalidDataMessagesDir))) def testHarvestToFilesystemTarget(self): self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM') self.startHarvester(repository=REPOSITORY) self.assertEqual( 8, len(listdir(join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY)))) self.assertEqual( ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [3, 6]], [ id.strip() for id in open(join(self.filesystemDir, 'deleted_records')) ]) def testClearOnFilesystemTarget(self): self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM') 
self.startHarvester(repository=REPOSITORY) self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM', action='clear') self.startHarvester(repository=REPOSITORY) self.assertEqual( 0, len(listdir(join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY)))) self.assertEqual( set([ 'harvestertestrepository:oai:record:10', 'harvestertestrepository:oai:record:09', 'harvestertestrepository:oai:record:08', 'harvestertestrepository:oai:record:07', 'harvestertestrepository:oai:record:06', 'harvestertestrepository:oai:record:05', 'harvestertestrepository:oai:record:04', 'harvestertestrepository:oai:record:03', 'harvestertestrepository:oai:record:02%2F&gkn', 'harvestertestrepository:oai:record:01' ]), set([ id.strip() for id in open(join(self.filesystemDir, 'deleted_records')) ])) def testHarvestWithError(self): self.startHarvester(repository=REPOSITORY) self.emptyDumpDir() self.controlHelper(action='raiseExceptionOnIds', id=['%s:oai:record:12' % REPOSITORY]) self.startHarvester(repository=REPOSITORY) successFullRecords = ['oai:record:11'] self.assertEqual(len(successFullRecords), self.sizeDumpDir()) self.emptyDumpDir() self.controlHelper(action='raiseExceptionOnIds', id=[]) self.startHarvester(repository=REPOSITORY) secondBatchSize = 5 self.assertEqual(secondBatchSize, self.sizeDumpDir()) def testClearWithError(self): self.startHarvester(repository=REPOSITORY) self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear') self.controlHelper(action='raiseExceptionOnIds', id=['%s:oai:record:05' % REPOSITORY]) self.emptyDumpDir() self.startHarvester(repository=REPOSITORY) successFullDeletes = [1, 2, 4] deletesTodo = [5, 7, 8, 9, 10] self.assertEqual(len(successFullDeletes), self.sizeDumpDir()) self.controlHelper(action='raiseExceptionOnIds', id=[]) self.emptyDumpDir() self.assertEqual(0, self.sizeDumpDir()) self.startHarvester(repository=REPOSITORY) self.assertEqual(len(deletesTodo), self.sizeDumpDir()) def testRefreshWithIgnoredRecords(self): log = 
State(stateDir=join(self.harvesterStateDir, DOMAIN),
      logDir=join(self.harvesterLogDir, DOMAIN),
      name=REPOSITORY).getHarvesterLog()
# NOTE(review): the statements down to the next `def` are the tail of a test
# method whose header lies above this chunk; its code is reproduced unchanged.
log.startRepository()
for uploadId in [
        '%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]
]:
    if uploadId == '%s:oai:record:02' % (REPOSITORY):
        uploadId = '%s:oai:record:02/&gkn' % (REPOSITORY)
    log.notifyHarvestedRecord(uploadId)
    log.uploadIdentifier(uploadId)
for uploadId in [
        '%s:oai:record:%02d' % (REPOSITORY, i)
        for i in [4, 5, 122, 123, 124]
]:
    log.notifyHarvestedRecord(uploadId)
    log.deleteIdentifier(uploadId)
for uploadId in [
        '%s:oai:record:%02d' % (REPOSITORY, i)
        for i in [7, 8, 125, 126, 127, 128]
]:
    log.notifyHarvestedRecord(uploadId)
    log.logInvalidData(uploadId, 'ignored message')
    log.logIgnoredIdentifierWarning(uploadId)
log.endRepository('token', '2012-01-01T09:00:00Z')
log.close()
totalRecords = 15
oldUploads = 2
oldDeletes = 3
oldIgnoreds = 4
self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')
self.startHarvester(repository=REPOSITORY)  # Smooth init
self.assertEqual(0, self.sizeDumpDir())
self.startHarvester(repository=REPOSITORY)  # Smooth harvest
self.startHarvester(repository=REPOSITORY)  # Smooth harvest
self.assertEqual(totalRecords, self.sizeDumpDir())
self.startHarvester(repository=REPOSITORY)  # Smooth finish
self.assertEqual(totalRecords + oldUploads + oldIgnoreds, self.sizeDumpDir())
invalidIds = self.get_ids('invalidIds')
self.assertEqual(0, len(invalidIds), invalidIds)
self.assertEqual(13, len(self.get_ids('ids')))


def testClearWithInvalidRecords(self):
    """Seed the harvester state with uploads, deletes and invalid records,
    then harvest with action='clear'.

    Afterwards the dump dir must hold one request per previously uploaded
    and per previously invalid identifier, and the state must no longer
    know any ids or invalid ids.
    """
    state = State(stateDir=join(self.harvesterStateDir, DOMAIN),
                  logDir=join(self.harvesterLogDir, DOMAIN),
                  name=REPOSITORY)
    try:
        log = state.getHarvesterLog()
        log.startRepository()
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [1, 2, 120, 121]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        # Five identifiers are logged as deleted; they are not part of the
        # expected dump-dir count asserted below.
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [4, 5, 122, 123, 124]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [7, 8, 125, 126, 127, 128]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.logInvalidData(uploadId, 'ignored message')
            log.logIgnoredIdentifierWarning(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        oldUploads = 4
        oldInvalids = 6
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP,
                            action='clear')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(oldUploads + oldInvalids, self.sizeDumpDir())
        self.assertEqual(0, len(state.invalidIds), state.invalidIds.getIds())
        self.assertEqual(0, len(state.ids), state.ids.getIds())
    finally:
        # Always release the state, also when an assertion above fails.
        state.close()


def _dumpedRepositoryIds(self):
    """Return the repository id of every request in the dump dir.

    Files are visited in sorted filename order. The repository id is the
    part of the first ``ucp:recordIdentifier`` before the first ':'.
    """
    repositoryIds = []
    for filename in sorted(listdir(self.dumpDir)):
        # Close each dump file right after parsing instead of leaking the
        # handle until garbage collection.
        with open(join(self.dumpDir, filename)) as fp:
            tree = parse(fp)
        recordIdentifier = xpath(tree, '//ucp:recordIdentifier/text()')[0]
        repositoryIds.append(recordIdentifier.split(':', 1)[0])
    return repositoryIds


def testConcurrentHarvestToSruUpdate(self):
    """Harvest with concurrency=3 and verify that the records of the three
    repositories arrive interleaved rather than one repository at a time.
    """
    self.startHarvester(concurrency=3)
    repositoryIds = self._dumpedRepositoryIds(self) if False else _dumpedRepositoryIds(self)
    repositoryIdsSet = set(repositoryIds)
    self.assertEqual(
        {'repository2', 'integrationtest', 'harvestertestrepository'},
        repositoryIdsSet)
    lastSeenRepoId = None
    try:
        # Every change of repository removes that repository from the set.
        # A repository showing up a second time (i.e. interleaving) raises
        # KeyError, which is the expected outcome.
        for repo in repositoryIds:
            if repo != lastSeenRepoId:
                repositoryIdsSet.remove(repo)
                lastSeenRepoId = repo
    except KeyError:
        pass
    else:
        self.fail('Records should have been inserted out-of-order.')


def testConcurrentHarvestToSruUpdateBUG(self):
    """Harvest with concurrency=1 while REPOSITORY is marked complete and
    check the number of dumped records per repository.
    """
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)
    self.startHarvester(concurrency=1)
    repositoryIds = _dumpedRepositoryIds(self)
    self.assertEqual(15, repositoryIds.count(REPOSITORY))
    self.assertEqual(10, repositoryIds.count('repository2'))
    self.assertEqual(10, repositoryIds.count('integrationtest'))


def testStartHarvestingAddedRepository(self):
    """A repository added while the harvester is running must show up in
    the harvester's stdout/stderr log.
    """
    t = Thread(
        target=lambda: self.startHarvester(concurrency=1, runOnce=False))
    t.start()
    # Wait until the running harvester has produced its first output.
    while not listdir(self.dumpDir):
        sleep(0.1)
    self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    stdoutfile = join(self.integrationTempdir,
                      "stdouterr-meresco-harvester-harvester.log")
    try:
        sleepWheel(5)
        with open(stdoutfile) as fp:
            log = fp.read()
        self.assertTrue('xyz' in log, log)
    finally:
        # Clean up even when waiting, reading or asserting fails.
        self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        t.join()


def testDontHarvestDeletedRepository(self):
    """The number of '[xyz]' occurrences in the harvester log must be the
    same before and after repository 'xyz' is removed.

    NOTE(review): the log is re-read immediately after the removal, without
    any wait in between - confirm that this is intended.
    """
    stdoutfile = join(self.integrationTempdir,
                      "stdouterr-meresco-harvester-harvester.log")
    self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    t = Thread(
        target=lambda: self.startHarvester(concurrency=1, runOnce=False))
    t.start()
    while not listdir(self.dumpDir):
        sleep(0.1)
    sleepWheel(1)

    def _readFile(name):
        with open(name) as fp:
            return fp.read()

    log = _readFile(stdoutfile)
    xyzOccurrences = log.count('[xyz]')
    self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    log = _readFile(stdoutfile)
    try:
        newXyzOccurrences = log.count('[xyz]')
        self.assertEqual(
            xyzOccurrences, newXyzOccurrences,
            "%s!=%s\n%s" % (xyzOccurrences, newXyzOccurrences, log))
    finally:
        t.join()


def testErrorReportedToGustos(self):
    """Harvest a truncated OAI-PMH response from a file:// baseUrl and
    expect exactly one error to be reported to the Gustos UDP listener for
    this repository.
    """
    baseUrl = join(self.integrationTempdir, "choppy_oai.xml")
    filename = "{}?verb=ListRecords&metadataPrefix=oai_dc".format(baseUrl)
    with open(filename, "w") as fp:
        # The document is cut off in the middle of a closing tag on purpose.
        fp.write("""<?xml version="1.0" encoding="UTF-8"?> <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2017-10-31T15:12:52Z</responseDate><request from="2017-10-04T11:52:57Z" metadataPrefix="didl_mods" verb="ListRecords">https://surfsharekit.nl/oai/hhs/</request><ListRecords><record><header><identifier>oai:surfsharekit.nl:b6ea6503-e2fc-4974-8941-2a4a405dc72f</identifier><datestamp>2017-10-04T14:16:22Z</datestamp></header><metadata><didl:DIDL xmlns:didl="urn:mpeg:mpeg21:2002:02-DIDL-NS" 
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <dii:Identifier xmlns:dii="urn:mpeg:mpeg21:2002:01-DII-NS">urn:nbn:nl:hs:18-b6ea6503-e2fc-4974-8941-2a4a405dc72f</dii:Identifier> </didl:Statement> </didl:Descrip""")
    # Only look at Gustos messages that arrive after this point.
    errorCount = len(self.gustosUdpListener.log())
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP,
                        baseUrl="file://{}".format(baseUrl))
    t = Thread(
        target=lambda: self.startHarvester(concurrency=1, runOnce=True))
    t.start()
    try:
        sleepWheel(5)
        last_logs = [
            JsonDict.loads(line)['data']
            for line in self.gustosUdpListener.log()[errorCount:]
        ]
        # Take the most recent message that mentions this repository.
        my_group_log = None
        for data in reversed(last_logs):
            my_group_log = data.get(f'Harvester ({DOMAIN})', {}).get(
                f'{REPOSITORYGROUP}:{REPOSITORY}')
            if my_group_log is not None:
                break
        # Fail with the received messages instead of a NameError/TypeError
        # when nothing (relevant) was reported.
        self.assertTrue(my_group_log is not None, last_logs)
        self.assertEqual({"count": 1}, my_group_log['errors'])
    finally:
        # Do not leave the harvester thread running into the next test.
        t.join()


def testConcurrencyAtLeastOne(self):
    """A concurrency of 0 or -1 must make the harvester exit with return
    code 2 and an explanatory message.
    """
    for concurrency in (0, -1):
        stdouterrlog = self.startHarvester(concurrency=concurrency,
                                           expectedReturnCode=2)
        self.assertTrue("Concurrency must be at least 1" in stdouterrlog,
                        stdouterrlog)


def testCompleteInOnAttempt(self):
    """With complete=True a single run must dump all 15 records and
    announce that the repository is completed in one attempt.
    """
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)
    stdouterrlog = self.startHarvester(repository=REPOSITORY,
                                       runOnce=True,
                                       timeoutInSeconds=5)
    self.assertEqual(15, self.sizeDumpDir())
    self.assertTrue(
        "Repository will be completed in one attempt" in stdouterrlog,
        stdouterrlog)


def testHarvestingContinues4Ever(self):
    """Run the harvester with runOnce=False and a 5 second timeout; a
    SystemExit must mention the timeout and all 15 records must be dumped.
    """
    try:
        self.startHarvester(repository=REPOSITORY,
                            runOnce=False,
                            timeoutInSeconds=5)
    except SystemExit as e:
        self.assertTrue('took more than 5 seconds' in str(e), str(e))
    self.assertEqual(15, self.sizeDumpDir())


def testBadOai(self):
    """Check that the helper server's /badoai/responsedate endpoint hands
    out the resumption tokens 'resume0' and then 'resume1'.
    """
    resumptionTokenPath = \
        '/oai:OAI-PMH/oai:ListRecords/oai:resumptionToken/text()'
    header, data = getRequest(port=self.helperServerPortNumber,
                              path='/badoai/responsedate',
                              arguments=dict(verb='ListRecords',
                                             metadataPrefix='prefix'))
    self.assertEqual('resume0', xpathFirst(data, resumptionTokenPath))
    header, data = getRequest(port=self.helperServerPortNumber,
                              path='/badoai/responsedate',
                              arguments=dict(verb='ListRecords',
                                             resumptionToken='resume0'))
    self.assertEqual('resume1', xpathFirst(data, resumptionTokenPath))


def testNormalHarvesting(self):
    """Three consecutive runs: 10 records, 15 records, then nothing to do."""
    self.assertEqual(
        'Harvested.',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(10, self.sizeDumpDir())
    self.assertEqual(
        'Harvested.',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(15, self.sizeDumpDir())
    self.assertEqual(
        'Nothing to do!',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(15, self.sizeDumpDir())


def saveBadoai(self, **kwargs):
    """Point REPOSITORY at the helper server's /badoai/responsedate
    endpoint with metadataPrefix 'prefix'.

    Extra keyword arguments are passed on to ``saveRepository``.
    """
    self.saveRepository(
        DOMAIN,
        REPOSITORY,
        REPOSITORYGROUP,
        baseUrl='http://localhost:{}/badoai/responsedate'.format(
            self.helperServerPortNumber),
        metadataPrefix='prefix',
        **kwargs)


def testWithStrangeResponseDate(self):
    """Against the bad-OAI endpoint (complete=False) each run dumps one
    more record until three are present; the fourth run has nothing to do.
    """
    self.saveBadoai(complete=False)
    self.assertEqual(
        'Harvested.',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(1, self.sizeDumpDir())
    self.assertEqual(
        'Harvested.',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(2, self.sizeDumpDir())
    self.assertEqual(
        'Harvested.',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(3, self.sizeDumpDir())
    self.assertEqual(
        'Nothing to do!',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(3, self.sizeDumpDir())
    # Problem is that the harvester wants to continue because responsedate
    # is in the past. It should use a separate date to determine if it has
    # done enough for the day.
def testCompleteWithStrangeResponseDate(self):
    """Against the bad-OAI endpoint with complete=True a single run dumps
    all three records; the next run has nothing to do.
    """
    self.saveBadoai(complete=True)
    self.assertEqual(
        'Harvested.',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(3, self.sizeDumpDir())
    self.assertEqual(
        'Nothing to do!',
        what_happened(self.startHarvester(repository=REPOSITORY)))
    self.assertEqual(3, self.sizeDumpDir())
    # Further testing
    # save(action='refresh')
    # output = self.startHarvester(repository='responsedate')
    # self.assertEqual('Harvested.', what_happened(output))
    # output = self.startHarvester(repository='responsedate')
    # self.assertEqual('Harvested.', what_happened(output))


def emptyDumpDir(self):
    """Remove all files from ``self.dumpDir``.

    Files are removed directly instead of through a shell ``rm`` command,
    so a dump dir path containing spaces or shell metacharacters works and
    dotfiles are removed as well. Real sub-directories are left in place,
    like ``rm`` without ``-r`` did.
    """
    from os import remove
    from os.path import isdir, islink

    for name in listdir(self.dumpDir):
        path = join(self.dumpDir, name)
        # Skip real directories; symlinks are removed like regular files.
        if isdir(path) and not islink(path):
            continue
        remove(path)


def sizeDumpDir(self):
    """Return the number of entries currently in ``self.dumpDir``."""
    return len(listdir(self.dumpDir))