Exemple #1
0
 def setUp(self):
     """Prepare a HarvesterData store observed by a HarvesterDataActions.

     The store lives in self.tempdir and receives domain 'domain', repository
     group 'group' and repository 'repository' before it is registered as an
     observer of a fresh HarvesterDataActions instance.
     """
     SeecrTestCase.setUp(self)
     store = self.hd = HarvesterData(self.tempdir)
     store.addDomain('domain')
     store.addRepositoryGroup('group', domainId='domain')
     store.addRepository('repository', repositoryGroupId='group', domainId='domain')
     actions = self.hda = HarvesterDataActions()
     actions.addObserver(store)
Exemple #2
0
 def setUp(self):
     """Build the fixture: one domain, one group, one repository, one observer.

     self.hd is a HarvesterData on self.tempdir; self.hda is a
     HarvesterDataActions that has self.hd added as observer.
     """
     SeecrTestCase.setUp(self)
     domainId = 'domain'
     groupId = 'group'
     self.hd = HarvesterData(self.tempdir)
     self.hd.addDomain(domainId)
     self.hd.addRepositoryGroup(groupId, domainId=domainId)
     self.hd.addRepository(
         'repository',
         repositoryGroupId=groupId,
         domainId=domainId,
     )
     self.hda = HarvesterDataActions()
     self.hda.addObserver(self.hd)
Exemple #3
0
 def setUp(self):
     """Reset the integration environment and register the default repository.

     Removes the harvester log, state and 'filesystem' directories, resets the
     helper via controlHelper(action='reset'), empties the dump directory,
     recreates the per-domain state and log directories and finally saves
     REPOSITORY in REPOSITORYGROUP.
     """
     IntegrationTestCase.setUp(self)
     self.filesystemDir = join(self.integrationTempdir, 'filesystem')
     for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
         system("rm -rf %s" % staleDir)
     self.controlHelper(action='reset')
     self.emptyDumpDir()
     self.domainStatePath = pathlib.Path(self.harvesterStateDir) / DOMAIN
     self.domainLogPath = pathlib.Path(self.harvesterLogDir) / DOMAIN
     for domainPath in (self.domainStatePath, self.domainLogPath):
         domainPath.mkdir(parents=True)
     dataDir = join(self.integrationTempdir, 'data')
     self.harvesterData = HarvesterData(dataDir)
     try:
         self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
     except ValueError:
         # NOTE(review): presumably raised when the group already exists - confirm.
         pass
     self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)
Exemple #4
0
 def setUp(self):
     """Clean the harvester directories and register the default repository.

     Removes the log, state and 'filesystem' directories, empties the dump
     directory, recreates the state directory for DOMAIN and saves REPOSITORY
     in REPOSITORYGROUP.
     """
     IntegrationTestCase.setUp(self)
     self.filesystemDir = join(self.integrationTempdir, 'filesystem')
     for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
         system("rm -rf %s" % staleDir)
     self.emptyDumpDir()
     domainStateDir = join(self.harvesterStateDir, DOMAIN)
     system("mkdir -p %s" % domainStateDir)
     dataDir = join(self.integrationTempdir, 'data')
     self.harvesterData = HarvesterData(dataDir)
     try:
         self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
     except ValueError:
         # NOTE(review): presumably raised when the group already exists - confirm.
         pass
     self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)
    def setUp(self):
        """Create a populated HarvesterData plus two HarvesterDataActions setups.

        self.hd holds domain 'domain' with group 'group', repository
        'repository' and three non-exported repository field definitions.
        self.hda observes self.hd directly; self.dna is an observable tree in
        which a second HarvesterDataActions observes a CallTrace
        (self.observable).
        """
        SeecrTestCase.setUp(self)
        store = self.hd = HarvesterData(self.tempdir)
        store.addDomain('domain')
        store.addRepositoryGroup('group', domainId='domain')
        store.addRepository('repository', repositoryGroupId='group', domainId='domain')

        # (name, label, type) for every extra repository field; none is exported.
        fieldSpecs = [
            ('name', 'Label', 'text'),
            ('choice_1', 'Keuze', 'bool'),
            ('choice_2', 'Keuze', 'bool'),
        ]
        repositoryFields = [
            {'name': fieldName, 'label': fieldLabel, 'type': fieldType, 'export': False}
            for fieldName, fieldLabel, fieldType in fieldSpecs
        ]
        store.updateFieldDefinition('domain', {'repository_fields': repositoryFields})

        self.hda = HarvesterDataActions()
        self.hda.addObserver(store)

        self.observable = CallTrace()
        actionsTree = (HarvesterDataActions(), (self.observable,))
        self.dna = be((Observable(), actionsTree))
Exemple #6
0
    def setUp(self):
        SeecrTestCase.setUp(self)
        open(join(self.tempdir, 'adomain.domain'), 'w').write("""{
    "identifier": "adomain",
    "mappingIds": ["ignored MAPPING"],
    "targetIds": ["ignored TARGET"],
    "repositoryGroupIds": ["Group1", "Group2"]
}""")
        open(join(self.tempdir, 'adomain.Group1.repositoryGroup'), 'w').write("""{
    "identifier": "Group1",
    "name": {"nl": "Groep1", "en": "Group1"},
    "repositoryIds": ["repository1", "repository2"]
}""")
        open(join(self.tempdir, 'adomain.Group2.repositoryGroup'), 'w').write("""{
    "identifier": "Group2",
    "name": {"nl": "Groep2", "en": "Group2"},
    "repositoryIds": ["repository2_1", "repository2_2"]
} """)
        open(join(self.tempdir, 'adomain.repository1.repository'), 'w').write("""{
    "identifier": "repository1",
    "repositoryGroupId": "Group1"
}""")
        open(join(self.tempdir, 'adomain.repository2.repository'), 'w').write("""{
    "identifier": "repository2",
    "repositoryGroupId": "Group1"
}""")
        open(join(self.tempdir, 'adomain.repository2_1.repository'), 'w').write("""{
    "identifier": "repository2_1",
    "repositoryGroupId": "Group2"
}""")
        open(join(self.tempdir, 'adomain.repository2_2.repository'), 'w').write("""{
    "identifier": "repository2_2",
    "repositoryGroupId": "Group2"
}""")
        open(join(self.tempdir, 'adomain.remi.repository'), 'w').write("""{
    "identifier": "remi",
    "repositoryGroupId": "NoGroup"
}""")
        self.hd = HarvesterData(self.tempdir)
Exemple #7
0
class HarvesterTest(IntegrationTestCase):
    def setUp(self):
        """Clean the harvester directories and register the default repository.

        Removes the log, state and 'filesystem' directories, empties the dump
        directory, recreates the state directory for DOMAIN and saves
        REPOSITORY in REPOSITORYGROUP with the default settings of
        saveRepository.
        """
        IntegrationTestCase.setUp(self)
        self.filesystemDir = join(self.integrationTempdir, 'filesystem')
        for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
            system("rm -rf %s" % staleDir)
        self.emptyDumpDir()
        domainStateDir = join(self.harvesterStateDir, DOMAIN)
        system("mkdir -p %s" % domainStateDir)
        dataDir = join(self.integrationTempdir, 'data')
        self.harvesterData = HarvesterData(dataDir)
        try:
            self.harvesterData.addRepositoryGroup(identifier=REPOSITORYGROUP, domainId=DOMAIN)
        except ValueError:
            # NOTE(review): presumably raised when the group already exists - confirm.
            pass
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)

    def tearDown(self):
        """Delete the default test repository, then run the base-class tearDown."""
        self.removeRepository(
            domain=DOMAIN,
            repositoryId=REPOSITORY,
            repositoryGroupId=REPOSITORYGROUP,
        )
        IntegrationTestCase.tearDown(self)

    def saveRepository(self, domain, repositoryId, repositoryGroupId, metadataPrefix="oai_dc", action=None, mappingId='MAPPING', targetId='SRUUPDATE', maximumIgnore=5, complete=False, continuous=None):
        """Add the repository when possible and (re)write its harvest settings.

        A ValueError from addRepository is ignored; updateRepository is then
        always called. The baseurl points at /oai on localhost using
        self.helperServerPortNumber; set and collection are None, use is True
        and shopclosed is an empty list.
        """
        harvesterData = self.harvesterData
        try:
            harvesterData.addRepository(identifier=repositoryId, domainId=domain, repositoryGroupId=repositoryGroupId)
        except ValueError:
            # NOTE(review): presumably the repository already exists - confirm.
            pass
        settings = dict(
            identifier=repositoryId,
            domainId=domain,
            baseurl='http://localhost:%s/oai' % self.helperServerPortNumber,
            set=None,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=None,
            maximumIgnore=maximumIgnore,
            use=True,
            complete=complete,
            continuous=continuous,
            action=action,
            shopclosed=[],
        )
        harvesterData.updateRepository(**settings)

    def removeRepository(self, domain, repositoryId, repositoryGroupId):
        """Delete the given repository from self.harvesterData."""
        target = dict(
            identifier=repositoryId,
            domainId=domain,
            repositoryGroupId=repositoryGroupId,
        )
        self.harvesterData.deleteRepository(**target)

    def testHarvestReturnsErrorWillNotSaveState(self):
        """Two runs against an unknown metadataPrefix must issue the same request twice.

        Both logged requests are expected to be a plain ListRecords with
        metadataPrefix 'not_existing'. The temporary repository is always
        removed again.
        """
        repositoryId = "repo_invalid_metadataPrefix"
        expectedArguments = {'verb':['ListRecords'], 'metadataPrefix':['not_existing']}
        seenBefore = len(self.getLogs())
        self.saveRepository(DOMAIN, repositoryId, REPOSITORYGROUP, metadataPrefix="not_existing")
        try:
            for _ in range(2):
                self.startHarvester(repository=repositoryId)
            newLogs = self.getLogs()[seenBefore:]
            self.assertEquals(2, len(newLogs))
            self.assertEquals('/oai', newLogs[-2]['path'])
            self.assertEquals(expectedArguments, newLogs[0]['arguments'])
            self.assertEquals('/oai', newLogs[-1]['path'])
            self.assertEquals(expectedArguments, newLogs[1]['arguments'])
        finally:
            self.removeRepository(DOMAIN, repositoryId, REPOSITORYGROUP)

    def testHarvestToSruUpdate(self):
        """Harvest in three runs: first batch, resumption-token batch, nothing new.

        After each run the dump directory size, the ids state file and the
        logged OAI requests are checked.
        """
        domainStateDir = join(self.harvesterStateDir, DOMAIN)
        idsFile = join(domainStateDir, "%s.ids" % REPOSITORY)
        invalidIdsFile = join(domainStateDir, "%s_invalid.ids" % REPOSITORY)
        statsFile = join(domainStateDir, '%s.stats' % REPOSITORY)
        logsBefore = len(self.getLogs())

        # initial harvest
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(BATCHSIZE, self.sizeDumpDir())
        deleteRequests = [
            filename for filename in listdir(self.dumpDir)
            if "info:srw/action/1/delete" in open(join(self.dumpDir, filename)).read()
        ]
        self.assertEquals(2, len(deleteRequests))
        self.assertEquals(8, len(open(idsFile).readlines()))
        self.assertEquals(0, len(open(invalidIdsFile).readlines()))
        newLogs = self.getLogs()[logsBefore:]
        self.assertEquals(1, len(newLogs))
        lastRequest = newLogs[-1]
        self.assertEquals('/oai', lastRequest['path'])
        self.assertEquals({'verb':['ListRecords'], 'metadataPrefix':['oai_dc']}, lastRequest['arguments'])
        token = getResumptionToken(open(statsFile).readlines()[-1])

        # resumptionToken
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(15, self.sizeDumpDir())
        self.assertEquals(13, len(open(idsFile).readlines()))
        newLogs = self.getLogs()[logsBefore:]
        self.assertEquals(2, len(newLogs))
        lastRequest = newLogs[-1]
        self.assertEquals('/oai', lastRequest['path'])
        self.assertEquals({'verb':['ListRecords'], 'resumptionToken':[token]}, lastRequest['arguments'])

        # Nothing
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(2, len(self.getLogs()[logsBefore:]))
        lastStatsLine = open(statsFile).readlines()[-1]
        self.assertEquals(None, getResumptionToken(lastStatsLine))

    def testContinuousHarvest(self):
        """With continuous=1 the harvester keeps issuing requests within one start.

        Expects more than two requests after five seconds: an initial
        ListRecords, one carrying a resumptionToken and one carrying 'from'.
        """
        logsBefore = len(self.getLogs())
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1)
        harvester = Thread(
            target=self.startHarvester,
            kwargs=dict(concurrency=1, runOnce=False, repository=REPOSITORY),
        )
        harvester.start()
        try:
            sleepWheel(5)
            newLogs = self.getLogs()[logsBefore:]
            self.assertTrue(len(newLogs) > 2, newLogs)
            firstRequest = {'path': '/oai', 'arguments': {'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}}
            self.assertEqual(firstRequest, newLogs[0])
            self.assertTrue('resumptionToken' in newLogs[1]['arguments'], newLogs[1])
            self.assertTrue('from' in newLogs[2]['arguments'], newLogs[2])
        finally:
            harvester.join()

    def testIncrementalHarvesting(self):
        """A pre-seeded stats file must lead to a ListRecords request with 'from'.

        The stats file gets two finished runs (the last one without a
        resumption token); the single new request is expected to carry
        from=2011-03-31.
        """
        logsBefore = len(self.getLogs())
        statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
        statsLines = [
            'Started: 2011-03-31 13:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-03-31 13:12:36, ResumptionToken: xyz\n',
            'Started: 2011-04-01 14:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-04-01 14:12:36, ResumptionToken:\n',
        ]
        with open(statsFile, 'w') as stats:
            stats.writelines(statsLines)
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(BATCHSIZE, self.sizeDumpDir())
        newLogs = self.getLogs()[logsBefore:]
        self.assertEquals(1, len(newLogs))
        request = newLogs[-1]
        self.assertEquals('/oai', request['path'])
        self.assertEquals({'verb':['ListRecords'], 'metadataPrefix':['oai_dc'], 'from':['2011-03-31']}, request['arguments'])

    def testClear(self):
        """action='clear' must emit delete requests and reset the status total to 0.

        After the first harvest GetStatus reports a total of 8; after the
        clear run the dump directory holds 18 files, of which the last eight
        (sorted by name) are delete requests, and the total is 0.
        """
        def statusTotal():
            # Ask the harvester's internal server for this repository's status.
            _, body = getRequest(self.harvesterInternalServerPortNumber, '/get', {'verb': 'GetStatus', 'domainId': DOMAIN, 'repositoryId': REPOSITORY}, parse=False)
            return JsonDict.loads(body)['response']['GetStatus'][0]['total']

        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(BATCHSIZE, self.sizeDumpDir())
        self.assertEquals(8, statusTotal())

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')

        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(18, self.sizeDumpDir())
        lastEight = sorted(listdir(self.dumpDir))[-8:]
        for filename in lastEight:
            self.assertTrue('_delete.updateRequest' in filename, filename)

        self.assertEqual(0, statusTotal())

    def testRefresh(self):
        """Exercise action='refresh' over several consecutive harvester runs.

        A HarvesterLog is seeded with four uploaded and four deleted
        identifiers. After switching the repository to 'refresh', the runs are
        checked one by one: no OAI request, a fresh ListRecords, a
        resumption-token request, and a final run in which the dump directory
        grows from 15 to 17 files and four record ids end up in delete
        requests. A further run must not log any new request.
        """
        oldlogs = self.getLogs()
        # Seed harvester state as if an earlier harvest had already happened.
        log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
        log.startRepository()
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1,7,120,121]]:
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4,5,122,123]]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

        # First run after requesting the refresh: no OAI request is expected.
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()[len(oldlogs):]
        self.assertEquals(0, len(logs))
        # Second run: harvesting starts over with a plain ListRecords.
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()
        self.assertEquals('/oai', logs[-1]["path"])
        self.assertEquals({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}, logs[-1]["arguments"])
        statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
        token = getResumptionToken(open(statsFile).readlines()[-1])

        # Third run: continues with the resumption token from the stats file.
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()
        self.assertEquals('/oai', logs[-1]["path"])
        self.assertEquals({'verb': ['ListRecords'], 'resumptionToken': [token]}, logs[-1]["arguments"])
        self.assertEquals(15, self.sizeDumpDir())

        # Fourth run: two more files appear; collect the ids of all delete requests.
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(17, self.sizeDumpDir())
        deleteFiles = [join(self.dumpDir, f) for f in listdir(self.dumpDir) if '_delete' in f]
        deletedIds = set([xpathFirst(parse(open(x)), '//ucp:recordIdentifier/text()') for x in deleteFiles])
        self.assertEquals(set(['%s:oai:record:03' % REPOSITORY, '%s:oai:record:06' % REPOSITORY, '%s:oai:record:120' % REPOSITORY, '%s:oai:record:121' % REPOSITORY]), deletedIds)

        # One more run must not add any logged request.
        logs = self.getLogs()[len(oldlogs):]
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(len(logs), len(self.getLogs()[len(oldlogs):]), 'Action is over, expect nothing more.')

    def testInvalidIgnoredUptoMaxIgnore(self):
        """Check the bookkeeping of invalid records around the maximumIgnore limit.

        After controlHelper(action='allInvalid') one run is expected to leave
        only the two delete requests in the dump directory, no valid ids, and
        maxIgnore + 1 invalid ids with one invalid-data message file each.
        After controlHelper(action='noneInvalid') a second run is expected to
        harvest a full batch and to leave no invalid ids or message files.
        """
        # Same value as the maximumIgnore default of saveRepository.
        maxIgnore = 5
        self.controlHelper(action='allInvalid')
        nrOfDeleted = 2
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(nrOfDeleted, self.sizeDumpDir())
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEquals(0, len(ids))
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEquals(maxIgnore + 1, len(invalidIds), invalidIds)
        invalidDataMessagesDir = join(self.harvesterLogDir, DOMAIN, "invalid", REPOSITORY)
        self.assertEquals(maxIgnore + 1, len(listdir(invalidDataMessagesDir)))
        invalidDataMessage01 = open(join(invalidDataMessagesDir, "oai:record:01")).read()
        # NOTE(review): this assertion can never fail - assertTrue receives a
        # non-empty string as its value and the file content only as failure
        # message. A containment check was presumably intended; before turning
        # it into one, confirm the expected uploadId prefix (the literal says
        # "integrationtest", other tests use REPOSITORY as prefix).
        self.assertTrue('uploadId: "integrationtest:oai:record:01"', invalidDataMessage01)
        self.controlHelper(action='noneInvalid')
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(nrOfDeleted + BATCHSIZE, self.sizeDumpDir())
        ids = open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)).readlines()
        self.assertEquals(BATCHSIZE - nrOfDeleted, len(ids))
        invalidIds = open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEquals(0, len(invalidIds), invalidIds)
        self.assertEquals(0, len(listdir(invalidDataMessagesDir)))

    def testHarvestToFilesystemTarget(self):
        """Harvesting to targetId 'FILESYSTEM' writes records below self.filesystemDir.

        Expects eight entries in the repository's directory and records 03 and
        06 listed, in that order, in the 'deleted_records' file.
        """
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)

        repositoryDir = join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY)
        self.assertEquals(8, len(listdir(repositoryDir)))
        expectedDeleted = ['%s:oai:record:%02d' % (REPOSITORY, number) for number in [3,6]]
        deletedRecords = [line.strip() for line in open(join(self.filesystemDir, 'deleted_records'))]
        self.assertEquals(expectedDeleted, deletedRecords)

    def testClearOnFilesystemTarget(self):
        """action='clear' on the 'FILESYSTEM' target empties the repository directory.

        After the clear run the repository's directory must be empty and the
        'deleted_records' file must list all ten record ids.
        """
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, targetId='FILESYSTEM', action='clear')
        self.startHarvester(repository=REPOSITORY)
        repositoryDir = join(self.filesystemDir, REPOSITORYGROUP, REPOSITORY)
        self.assertEquals(0, len(listdir(repositoryDir)))
        expectedDeleted = set([
            'harvestertestrepository:oai:record:01',
            'harvestertestrepository:oai:record:02%2F&gkn',
            'harvestertestrepository:oai:record:03',
            'harvestertestrepository:oai:record:04',
            'harvestertestrepository:oai:record:05',
            'harvestertestrepository:oai:record:06',
            'harvestertestrepository:oai:record:07',
            'harvestertestrepository:oai:record:08',
            'harvestertestrepository:oai:record:09',
            'harvestertestrepository:oai:record:10',
        ])
        deletedRecords = set([line.strip() for line in open(join(self.filesystemDir, 'deleted_records'))])
        self.assertEquals(expectedDeleted, deletedRecords)

    def testHarvestWithError(self):
        """A failing record stops the batch; the next run picks the batch up again.

        With an exception configured for record 12 only one record is dumped
        in the second run; once the exception list is cleared, the following
        run dumps five.
        """
        self.startHarvester(repository=REPOSITORY)
        self.emptyDumpDir()

        failingId = '%s:oai:record:12' % REPOSITORY
        self.controlHelper(action='raiseExceptionOnIds', id=[failingId])
        self.startHarvester(repository=REPOSITORY)
        successFullRecords = ['oai:record:11']
        self.assertEquals(len(successFullRecords), self.sizeDumpDir())
        self.emptyDumpDir()

        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.startHarvester(repository=REPOSITORY)
        secondBatchSize = 5
        self.assertEquals(secondBatchSize, self.sizeDumpDir())

    def testClearWithError(self):
        """An error during action='clear' leaves the remaining deletes for the next run.

        With an exception configured for record 05 three deletes are dumped;
        after clearing the exception list the next run dumps the other five.
        """
        self.startHarvester(repository=REPOSITORY)

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
        failingId = '%s:oai:record:05' % REPOSITORY
        self.controlHelper(action='raiseExceptionOnIds', id=[failingId])
        self.emptyDumpDir()

        self.startHarvester(repository=REPOSITORY)
        successFullDeletes = [1,2,4]
        deletesTodo = [5,7,8,9,10]
        self.assertEquals(len(successFullDeletes), self.sizeDumpDir())

        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.emptyDumpDir()
        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(len(deletesTodo), self.sizeDumpDir())

    def testRefreshWithIgnoredRecords(self):
        """action='refresh' on a state that also contains ignored (invalid) records.

        A HarvesterLog is seeded with uploaded, deleted and invalid
        identifiers; after the refresh has run to completion no invalid ids
        may remain and the ids file must hold 13 entries.
        """
        def recordIds(numbers):
            # Upload ids of this repository for the given record numbers.
            return ['%s:oai:record:%02d' % (REPOSITORY, number) for number in numbers]

        stateDir = join(self.harvesterStateDir, DOMAIN)
        logDir = join(self.harvesterLogDir, DOMAIN)
        log = HarvesterLog(stateDir=stateDir, logDir=logDir, name=REPOSITORY)
        log.startRepository()
        for uploaded in recordIds([1,2,120,121]):
            if uploaded == '%s:oai:record:02' % (REPOSITORY):
                uploaded = '%s:oai:record:02/&gkn' % (REPOSITORY)
            log.notifyHarvestedRecord(uploaded)
            log.uploadIdentifier(uploaded)
        for deleted in recordIds([4,5,122,123,124]):
            log.notifyHarvestedRecord(deleted)
            log.deleteIdentifier(deleted)
        for ignored in recordIds([7,8,125,126,127,128]):
            log.notifyHarvestedRecord(ignored)
            log.logInvalidData(ignored, 'ignored message')
            log.logIgnoredIdentifierWarning(ignored)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        totalRecords = 15
        oldUploads = 2
        oldDeletes = 3
        oldIgnoreds = 4

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

        self.startHarvester(repository=REPOSITORY) # Smooth init
        self.assertEquals(0, self.sizeDumpDir())
        for _ in range(2):
            self.startHarvester(repository=REPOSITORY) # Smooth harvest
        self.assertEquals(totalRecords, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY) # Smooth finish
        self.assertEquals(totalRecords + oldUploads + oldIgnoreds, self.sizeDumpDir())
        invalidIds = open(join(stateDir, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEquals(0, len(invalidIds), invalidIds)
        ids = open(join(stateDir, "%s.ids" % REPOSITORY)).readlines()
        self.assertEquals(13, len(ids), ids)

    def testClearWithInvalidRecords(self):
        """action='clear' on a state that also contains invalid records.

        A HarvesterLog is seeded with four uploaded, five deleted and six
        invalid identifiers; one clear run must dump uploads + invalids
        requests and leave both ids files empty.
        """
        def recordIds(numbers):
            # Upload ids of this repository for the given record numbers.
            return ['%s:oai:record:%02d' % (REPOSITORY, number) for number in numbers]

        stateDir = join(self.harvesterStateDir, DOMAIN)
        logDir = join(self.harvesterLogDir, DOMAIN)
        log = HarvesterLog(stateDir=stateDir, logDir=logDir, name=REPOSITORY)
        log.startRepository()
        for uploaded in recordIds([1,2,120,121]):
            log.notifyHarvestedRecord(uploaded)
            log.uploadIdentifier(uploaded)
        for deleted in recordIds([4,5,122,123,124]):
            log.notifyHarvestedRecord(deleted)
            log.deleteIdentifier(deleted)
        for invalid in recordIds([7,8,125,126,127,128]):
            log.notifyHarvestedRecord(invalid)
            log.logInvalidData(invalid, 'ignored message')
            log.logIgnoredIdentifierWarning(invalid)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        oldUploads = 4
        oldDeletes = 5
        oldInvalids = 6

        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')

        self.startHarvester(repository=REPOSITORY)
        self.assertEquals(oldUploads+oldInvalids, self.sizeDumpDir())
        invalidIds = open(join(stateDir, "%s_invalid.ids" % REPOSITORY)).readlines()
        self.assertEquals(0, len(invalidIds), invalidIds)
        ids = open(join(stateDir, "%s.ids" % REPOSITORY)).readlines()
        self.assertEquals(0, len(ids), ids)

    def testConcurrentHarvestToSruUpdate(self):
        """With concurrency=3 the dumped requests of different repositories must interleave.

        The repository prefix of every dumped record identifier is collected
        in sorted filename order. All three repositories must be present, and
        at least one repository must show up again after another one was seen
        in between.
        """
        self.startHarvester(concurrency=3)

        requestsLogged = sorted(listdir(self.dumpDir))

        repositoryIds = []
        for f in requestsLogged:
            lxml = parse(open(join(self.dumpDir, f)))
            # The part before the first ':' of the record identifier is the repository id.
            repositoryIds.append(xpath(lxml, '//ucp:recordIdentifier/text()')[0].split(':', 1)[0])

        repositoryIdsSet = set(repositoryIds)
        self.assertEquals(set(['repository2', 'integrationtest', 'harvestertestrepository']), repositoryIdsSet)

        lastSeenRepoId = None
        try:
            for repo in repositoryIds:
                if repo != lastSeenRepoId:
                    # Each change of repository removes that id from the set; a
                    # second removal of the same id raises KeyError, which means
                    # the repository re-appeared after another one (interleaving).
                    repositoryIdsSet.remove(repo)
                    lastSeenRepoId = repo
                    continue
        except KeyError:
            pass
        else:
            # No KeyError: every repository's records formed one contiguous run.
            self.fail('Records should have been inserted out-of-order.')

    def testConcurrentHarvestToSruUpdateBUG(self):
        """With complete=True on REPOSITORY, one run with concurrency=1 dumps 15 of its records.

        The other two repositories are expected to contribute 10 dumped
        requests each.
        """
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)

        self.startHarvester(concurrency=1)

        # Repository id = part before the first ':' of each record identifier.
        repositoryIds = [
            xpath(parse(open(join(self.dumpDir, filename))), '//ucp:recordIdentifier/text()')[0].split(':', 1)[0]
            for filename in sorted(listdir(self.dumpDir))
        ]
        expectedCounts = [(15, REPOSITORY), (10, 'repository2'), (10, 'integrationtest')]
        for expectedCount, repositoryId in expectedCounts:
            self.assertEquals(expectedCount, repositoryIds.count(repositoryId))

    def testStartHarvestingAddedRepository(self):
        """A repository added while the harvester runs must show up in its output.

        The harvester runs in a thread with runOnce=False; once the dump
        directory is non-empty, repository 'xyz' is saved and after five
        seconds 'xyz' is expected in the harvester's stdout/stderr log.
        """
        harvester = Thread(target=self.startHarvester, kwargs=dict(concurrency=1, runOnce=False))
        harvester.start()

        # Wait until the harvester has produced its first dump file.
        while not listdir(self.dumpDir):
            sleep(0.1)

        addedId = 'xyz'
        self.saveRepository(DOMAIN, addedId, REPOSITORYGROUP)
        stdoutfile = join(self.integrationTempdir, "stdouterr-meresco-harvester-harvester.log")
        sleepWheel(5)
        output = open(stdoutfile).read()
        try:
            self.assertTrue('xyz' in output, output)
        finally:
            self.removeRepository(DOMAIN, addedId, REPOSITORYGROUP)
            harvester.join()

    def testDontHarvestDeletedRepository(self):
        """A repository removed while the harvester runs must not be harvested again.

        The number of '[xyz]' occurrences in the harvester's stdout/stderr log
        is counted before the removal and five seconds after it; the count
        must not change and the log must not contain 'Traceback'.
        """
        removedId = 'xyz'
        stdoutfile = join(self.integrationTempdir, "stdouterr-meresco-harvester-harvester.log")
        self.saveRepository(DOMAIN, removedId, REPOSITORYGROUP)
        harvester = Thread(target=self.startHarvester, kwargs=dict(concurrency=1, runOnce=False))
        harvester.start()

        # Wait until the harvester has produced its first dump file.
        while not listdir(self.dumpDir):
            sleep(0.1)
        sleepWheel(1)
        countBefore = open(stdoutfile).read().count('[xyz]')

        self.removeRepository(DOMAIN, removedId, REPOSITORYGROUP)
        sleepWheel(5)
        output = open(stdoutfile).read()
        try:
            self.assertFalse('Traceback' in output, output)
            countAfter = output.count('[xyz]')
            self.assertEquals(countBefore, countAfter, "%s!=%s\n%s" % (countBefore, countAfter, output))
        finally:
            harvester.join()

    def testConcurrencyAtLeastOne(self):
        """Concurrency values 0 and -1 must be rejected with return code 2 and a message."""
        expectedMessage = "Concurrency must be at least 1"
        for tooLow in (0, -1):
            stdouterrlog = self.startHarvester(concurrency=tooLow, expectedReturnCode=2)
            self.assertTrue(expectedMessage in stdouterrlog, stdouterrlog)

    def testCompleteInOnAttempt(self):
        """With complete=True a single run must dump all 15 requests and announce it."""
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)
        output = self.startHarvester(repository=REPOSITORY, runOnce=True, timeoutInSeconds=5)
        self.assertEquals(15, self.sizeDumpDir())
        expectedNotice = "Repository will be completed in one attempt"
        self.assertTrue(expectedNotice in output, output)

    def testHarvestingContinues4Ever(self):
        """Run the harvester with runOnce=False under a five second timeout.

        When startHarvester raises SystemExit, its message must mention that
        the run took more than 5 seconds. In either case sizeDumpDir() is
        expected to be 15 afterwards.
        """
        try:
            self.startHarvester(repository=REPOSITORY, runOnce=False, timeoutInSeconds=5)
        except SystemExit as e:
            # 'except X as e' works on Python 2.6+ and Python 3; the former
            # 'except X, e' spelling is a SyntaxError on Python 3.
            self.assertTrue('took more than 5 seconds' in str(e), str(e))
        self.assertEquals(15, self.sizeDumpDir())
class HarvesterDataActionsTest(SeecrTestCase):
    def setUp(self):
        """Prepare a populated HarvesterData store and two HarvesterDataActions setups.

        ``self.hda`` observes the real store ``self.hd``; ``self.dna`` is a
        separate tree in which a HarvesterDataActions observes the
        ``self.observable`` CallTrace.
        """
        SeecrTestCase.setUp(self)

        def nonExportedField(name, label, fieldType):
            # All field definitions used here have 'export' switched off.
            return {
                'name': name,
                'label': label,
                'type': fieldType,
                'export': False
            }

        store = HarvesterData(self.tempdir)
        store.addDomain('domain')
        store.addRepositoryGroup('group', domainId='domain')
        store.addRepository('repository', repositoryGroupId='group', domainId='domain')
        store.updateFieldDefinition('domain', {
            'repository_fields': [
                nonExportedField('name', 'Label', 'text'),
                nonExportedField('choice_1', 'Keuze', 'bool'),
                nonExportedField('choice_2', 'Keuze', 'bool'),
            ]
        })
        self.hd = store

        actions = HarvesterDataActions()
        actions.addObserver(store)
        self.hda = actions

        self.observable = CallTrace()
        tree = (Observable(), (HarvesterDataActions(), (self.observable, )))
        self.dna = be(tree)

    def testAddDomain(self):
        # A non-admin user gets a "Not allowed" JSON answer and nothing reaches
        # the observer; an admin user gets success and addDomain is forwarded.
        def postAddDomain(isAdmin):
            response = self.dna.all.handleRequest(
                user=CallTrace(returnValues=dict(isAdmin=isAdmin)),
                path="/actions/addDomain",
                Body=bytes(urlencode(dict(identifier="aap")), encoding="utf-8"),
                Method='Post')
            return parseResponse(asBytes(response))

        header, body = postAddDomain(False)
        self.assertEqual(0, len(self.observable.calledMethods))
        self.assertEqual("200", header['StatusCode'])
        self.assertEqual("application/json", header['Headers']['Content-Type'])
        answer = JsonDict.loads(body)
        self.assertFalse(answer['success'])
        self.assertEqual("Not allowed", answer['message'])

        header, body = postAddDomain(True)
        self.assertEqual("200", header['StatusCode'])
        self.assertEqual("application/json", header['Headers']['Content-Type'])
        answer = JsonDict.loads(body)
        self.assertTrue(answer['success'])
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual("addDomain", calls[0].name)
        self.assertEqual(dict(identifier='aap'), calls[0].kwargs)

    def testSetRepositoryDone(self):
        # After a POST to repositoryDone the repository's 'action' is None again.
        self.updateTheRepository(action='refresh')
        before = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', before['action'])

        form = dict(domainId='domain', identifier='repository')
        request = self.hda.handleRequest(
            Method='POST',
            path='/somewhere/repositoryDone',
            Body=bUrlencode(form, doseq=True))
        consume(request)

        after = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, after['action'])

    def testUpdateRepositoryGroup(self):
        # The nl_name/en_name form fields must reach the observer combined
        # into a single 'name' dict keyed by language.
        form = dict(
            identifier='group',
            domainId='domain',
            nl_name="De nieuwe naam",
            en_name="The old name",
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/somewhere/updateRepositoryGroup',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateRepositoryGroup', calls[0].name)
        expectedKwargs = {
            'identifier': 'group',
            'domainId': 'domain',
            'name': {'nl': 'De nieuwe naam', 'en': 'The old name'},
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testCreateRepository(self):
        # An admin POST to addRepository forwards the three form fields
        # unchanged to the observer's addRepository.
        form = dict(
            identifier='repo-id',
            domainId='domain-id',
            repositoryGroupId='repositoryGroupId',
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/addRepository',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('addRepository', calls[0].name)
        expectedKwargs = {
            'domainId': 'domain-id',
            'identifier': 'repo-id',
            'repositoryGroupId': 'repositoryGroupId',
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testDeleteRepository(self):
        # An admin POST to deleteRepository forwards the three form fields
        # unchanged to the observer's deleteRepository.
        form = dict(
            identifier='repo-id',
            domainId='domain-id',
            repositoryGroupId='repositoryGroupId',
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/deleteRepository',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('deleteRepository', calls[0].name)
        expectedKwargs = {
            'domainId': 'domain-id',
            'identifier': 'repo-id',
            'repositoryGroupId': 'repositoryGroupId',
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testUpdateRepositoryAttributes(self):
        # Attributes missing from the form are forwarded as None; an empty
        # userAgent value is forwarded as None as well.
        def postUserAgent(userAgent):
            form = dict(
                identifier='repo-id',
                domainId='domain-id',
                userAgent=userAgent,
            )
            response = self.dna.all.handleRequest(
                user=CallTrace(returnValues=dict(isAdmin=True)),
                Method='POST',
                path='/actions/updateRepositoryAttributes',
                Body=bUrlencode(form, doseq=True))
            return parseResponse(asBytes(response))

        def expectedKwargs(userAgent):
            return {
                'identifier': 'repo-id',
                'domainId': 'domain-id',
                'baseurl': None,
                'set': None,
                'metadataPrefix': None,
                'userAgent': userAgent,
                'collection': None,
                'authorizationKey': None,
                'mappingId': None,
                'targetId': None
            }

        header, body = postUserAgent("Herman in de zon op een terras")
        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateRepositoryAttributes', calls[0].name)
        self.assertEqual(expectedKwargs('Herman in de zon op een terras'), calls[0].kwargs)

        header, body = postUserAgent("")
        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(2, len(calls))
        self.assertEqual('updateRepositoryAttributes', calls[1].name)
        self.assertEqual(expectedKwargs(None), calls[1].kwargs)

    def testUpdateRepositoryActionForm(self):
        # Posting only maximumIgnore forwards it as the string '42', together
        # with the values used for the action attributes that were not posted.
        form = dict(
            identifier='repo-id',
            domainId='domain-id',
            maximumIgnore="42",
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/updateRepositoryActionAttributes',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateRepositoryAttributes', calls[0].name)
        expectedKwargs = {
            'complete': False,
            'continuous': None,
            'domainId': 'domain-id',
            'identifier': 'repo-id',
            'maximumIgnore': '42',
            'action': None,
            'use': False
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testUpdateRepositoryActionForm_booleanFields(self):
        # complete="on" must arrive as True; 'use' was not posted and arrives
        # as False.
        form = dict(
            identifier='repo-id',
            domainId='domain-id',
            complete="on",
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/updateRepositoryActionAttributes',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateRepositoryAttributes', calls[0].name)
        expectedKwargs = {
            'complete': True,
            'continuous': None,
            'domainId': 'domain-id',
            'identifier': 'repo-id',
            'maximumIgnore': 0,
            'action': None,
            'use': False
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testUpdateRepositoryActionForm_Action(self):
        # The form value action="-" must be forwarded as action=None.
        form = dict(
            identifier='repo-id',
            domainId='domain-id',
            action="-",
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/updateRepositoryActionAttributes',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateRepositoryAttributes', calls[0].name)
        expectedKwargs = {
            'complete': False,
            'continuous': None,
            'domainId': 'domain-id',
            'identifier': 'repo-id',
            'maximumIgnore': 0,
            'action': None,
            'use': False
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testAddClosingHours(self):
        # The posted 'repositoryId' must reach addClosingHours as 'identifier';
        # the remaining fields are forwarded as the posted strings.
        form = dict(repositoryId='repo-id',
                    domainId='domain-id',
                    week="*",
                    day="1",
                    startHour="10",
                    endHour="14")
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/addRepositoryClosingHours',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('addClosingHours', calls[0].name)
        expectedKwargs = {
            'day': '1',
            'domainId': 'domain-id',
            'endHour': '14',
            'identifier': 'repo-id',
            'startHour': '10',
            'week': '*'
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testDeleteClosingHours(self):
        # 'repositoryId' becomes 'identifier' and 'closingHour' becomes
        # 'closingHoursIndex' in the call to deleteClosingHours.
        form = dict(repositoryId='repo-id',
                    domainId='domain-id',
                    closingHour="0")
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/deleteRepositoryClosingHours',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('deleteClosingHours', calls[0].name)
        expectedKwargs = {
            'domainId': 'domain-id',
            'identifier': 'repo-id',
            'closingHoursIndex': '0'
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testUpdateFieldDefinition(self):
        # The JSON text posted as 'fieldDefinition' must reach the observer
        # parsed, under the 'data' keyword.
        form = dict(
            domainId='domain-id',
            fieldDefinition='{"is":"json"}',
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/updateFieldDefinition',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateFieldDefinition', calls[0].name)
        expectedKwargs = {
            'domainId': 'domain-id',
            'data': {'is': 'json'},
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def testUpdateFieldDefinition_error(self):
        # Invalid JSON yields success=False with message 'Ongeldige JSON' and
        # nothing is forwarded to the observer.
        form = dict(
            domainId='domain-id',
            fieldDefinition='{"is no json"}',
        )
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/updateFieldDefinition',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        expectedAnswer = dict(success=False, message='Ongeldige JSON')
        self.assertEqual(expectedAnswer, JsonDict.loads(body))
        self.assertEqual(0, len(self.observable.calledMethods))

    def testUpdateRepositoryFieldDefinition(self):
        # Both extra_* form fields, including the one named
        # 'extra_no_such_field', are forwarded to the observer.
        form = dict(
            identifier='repo-id',
            domainId='domain-id',
            extra_name="Herman in de zon op een terras",
            extra_no_such_field="Bestaat niet")
        response = self.dna.all.handleRequest(
            user=CallTrace(returnValues=dict(isAdmin=True)),
            Method='POST',
            path='/actions/updateRepositoryFieldDefinitions',
            Body=bUrlencode(form, doseq=True))
        header, body = parseResponse(asBytes(response))

        self.assertEqual('200', header['StatusCode'])
        self.assertEqual(dict(success=True), JsonDict.loads(body))
        calls = self.observable.calledMethods
        self.assertEqual(1, len(calls))
        self.assertEqual('updateRepositoryFieldDefinitions', calls[0].name)

        expectedKwargs = {
            'identifier': 'repo-id',
            'domainId': 'domain-id',
            'extra_no_such_field': 'Bestaat niet',
            'extra_name': "Herman in de zon op een terras"
        }
        self.assertEqual(expectedKwargs, calls[0].kwargs)

    def updateTheRepository(self,
                            baseurl='',
                            set='',
                            metadataPrefix='',
                            mappingId='',
                            targetId='',
                            collection='',
                            maximumIgnore=0,
                            use=False,
                            continuous=False,
                            complete=True,
                            action='',
                            shopclosed=None):
        """Update the attributes of repository 'repository' in domain 'domain'.

        The given values are passed to ``self.hd.updateRepositoryAttributes``
        together with an empty ``userAgent`` and ``authorizationKey``.

        NOTE(review): ``shopclosed`` is accepted but not forwarded to the
        store -- confirm whether that is intentional.
        """
        attributes = dict(
            identifier='repository',
            domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            userAgent='',
            authorizationKey='',
        )
        self.hd.updateRepositoryAttributes(**attributes)
Exemple #9
0
class HarvesterDataActionsTest(SeecrTestCase):
    def setUp(self):
        """Create a store holding one domain, group and repository, observed by ``self.hda``."""
        SeecrTestCase.setUp(self)
        store = HarvesterData(self.tempdir)
        store.addDomain('domain')
        store.addRepositoryGroup('group', domainId='domain')
        store.addRepository('repository', repositoryGroupId='group', domainId='domain')
        actions = HarvesterDataActions()
        actions.addObserver(store)
        self.hd = store
        self.hda = actions

    def testUpdateRepository(self):
        # Post a complete updateRepository form and verify every stored
        # attribute; the posted repositoryGroupId must not replace 'group'.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "baseurl": "http://example.org/oai",
            "set": "ASET",
            "metadataPrefix": "oai_dc",
            "mappingId": "mapping_identifier",
            "targetId": "",
            "collection": "the collection",
            "maximumIgnore": "23",
            "complete": "1",
            "continuous": "60",
            "repositoryAction": "clear",
            "numberOfTimeslots": "0"
        }
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/updateRepository',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual("http://example.org/oai", repository["baseurl"])
        self.assertEqual("ASET", repository["set"])
        self.assertEqual("oai_dc", repository["metadataPrefix"])
        self.assertEqual("mapping_identifier", repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual("the collection", repository["collection"])
        self.assertEqual(23, repository["maximumIgnore"])
        self.assertEqual(True, repository["complete"])
        self.assertEqual(60, repository["continuous"])
        self.assertEqual(False, repository["use"])
        self.assertEqual("clear", repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testMinimalInfo(self):
        # Posting only the identifying fields must leave every other attribute
        # at the values asserted below.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
        }
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/updateRepository',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual(None, repository["baseurl"])
        self.assertEqual(None, repository["set"])
        self.assertEqual(None, repository["metadataPrefix"])
        self.assertEqual(None, repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual(None, repository["collection"])
        self.assertEqual(0, repository["maximumIgnore"])
        self.assertEqual(None, repository["continuous"])
        self.assertEqual(False, repository["complete"])
        self.assertEqual(False, repository["use"])
        self.assertEqual(None, repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedButNotAdded(self):
        # Timeslot fields posted without 'addTimeslot' must not be stored.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
        }
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/updateRepository',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedAdded(self):
        # With 'addTimeslot' present, the timeslot with index 0 is stored as
        # one 'shopclosed' entry.
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            "addTimeslot": "button pressed",
        }
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/updateRepository',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(['*:*:7:0-*:*:9:0'], repository['shopclosed'])

    def testModifyShopClosed(self):
        # With numberOfTimeslots=2 and no 'addTimeslot', only the timeslots
        # with index 1 and 2 end up in 'shopclosed'.
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
        }
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/updateRepository',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual([
            '3:*:17:0-3:*:19:0',
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testDeleteShopClosed(self):
        # The deleteTimeslot_1.x/.y fields remove the timeslot with index 1,
        # so only the timeslot with index 2 remains stored.
        self.updateTheRepository(shopclosed=[
            '1:2:7:0-1:2:9:0',
            '2:*:7:0-2:*:9:0',
        ])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
            'deleteTimeslot_1.x': '10',
            'deleteTimeslot_1.y': '20',
        }
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/updateRepository',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual([
            '4:5:9:0-4:5:10:0',
        ], repository['shopclosed'])

    def testSetRepositoryDone(self):
        # After a POST to repositoryDone the repository's 'action' is None again.
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual('refresh', repository['action'])

        data = dict(domainId='domain', identifier='repository')
        consume(
            self.hda.handleRequest(Method='POST',
                                   path='/somewhere/repositoryDone',
                                   Body=urlencode(data, doseq=True)))
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def updateTheRepository(self,
                            baseurl='',
                            set='',
                            metadataPrefix='',
                            mappingId='',
                            targetId='',
                            collection='',
                            maximumIgnore=0,
                            use=False,
                            continuous=False,
                            complete=True,
                            action='',
                            shopclosed=None):
        """Update repository 'repository' in domain 'domain' through ``self.hd.updateRepository``.

        A falsy ``shopclosed`` is passed on as an empty list.
        """
        closingHours = shopclosed if shopclosed else []
        attributes = dict(
            domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            shopclosed=closingHours,
        )
        self.hd.updateRepository('repository', **attributes)
Exemple #10
0
class HarvesterDataTest(SeecrTestCase):
    def setUp(self):
        SeecrTestCase.setUp(self)
        open(join(self.tempdir, 'adomain.domain'), 'w').write("""{
    "identifier": "adomain",
    "mappingIds": ["ignored MAPPING"],
    "targetIds": ["ignored TARGET"],
    "repositoryGroupIds": ["Group1", "Group2"]
}""")
        open(join(self.tempdir, 'adomain.Group1.repositoryGroup'), 'w').write("""{
    "identifier": "Group1",
    "name": {"nl": "Groep1", "en": "Group1"},
    "repositoryIds": ["repository1", "repository2"]
}""")
        open(join(self.tempdir, 'adomain.Group2.repositoryGroup'), 'w').write("""{
    "identifier": "Group2",
    "name": {"nl": "Groep2", "en": "Group2"},
    "repositoryIds": ["repository2_1", "repository2_2"]
} """)
        open(join(self.tempdir, 'adomain.repository1.repository'), 'w').write("""{
    "identifier": "repository1",
    "repositoryGroupId": "Group1"
}""")
        open(join(self.tempdir, 'adomain.repository2.repository'), 'w').write("""{
    "identifier": "repository2",
    "repositoryGroupId": "Group1"
}""")
        open(join(self.tempdir, 'adomain.repository2_1.repository'), 'w').write("""{
    "identifier": "repository2_1",
    "repositoryGroupId": "Group2"
}""")
        open(join(self.tempdir, 'adomain.repository2_2.repository'), 'w').write("""{
    "identifier": "repository2_2",
    "repositoryGroupId": "Group2"
}""")
        open(join(self.tempdir, 'adomain.remi.repository'), 'w').write("""{
    "identifier": "remi",
    "repositoryGroupId": "NoGroup"
}""")
        self.hd = HarvesterData(self.tempdir)

    def testGetRepositoryGroupIds(self):
        # The group ids come back in the order listed in the domain file.
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(["Group1", "Group2"], self.hd.getRepositoryGroupIds(domainId="adomain"))

    def testGetRepositoryIds(self):
        # With a group id only that group's repositories are returned; without
        # one, the repositories of both groups ('remi' is not among them).
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(["repository1", "repository2"], self.hd.getRepositoryIds(domainId="adomain", repositoryGroupId="Group1"))
        self.assertEqual(["repository1", "repository2", "repository2_1", "repository2_2"], self.hd.getRepositoryIds(domainId="adomain"))

    def testGetRepositoryGroupId(self):
        # The group id of 'repository1' is 'Group1'.
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual("Group1", self.hd.getRepositoryGroupId(domainId="adomain", repositoryId="repository1"))

    def testGetRepositoryGroup(self):
        # getRepositoryGroup returns the contents of the Group1 fixture file.
        expectedGroup = {
            'identifier': 'Group1',
            'name': {'en': 'Group1', 'nl': 'Groep1'},
            'repositoryIds': ['repository1', 'repository2']
        }
        group = self.hd.getRepositoryGroup(identifier='Group1', domainId='adomain')
        self.assertEqual(expectedGroup, group)

    def testGetRepositories(self):
        result = self.hd.getRepositories(domainId='adomain')
        self.assertEqualsWS("""[
{
    "identifier": "repository1",
    "repositoryGroupId": "Group1"
},
{
    "identifier": "repository2",
    "repositoryGroupId": "Group1"
},
{
    "identifier": "repository2_1",
    "repositoryGroupId": "Group2"
},
{
    "identifier": "repository2_2",
    "repositoryGroupId": "Group2"
}
]""", result.dumps())

    def testGetRepositoriesWithError(self):
        # An unknown repository group or an unknown domain must raise
        # ValueError('idDoesNotExist').
        # assertRaises replaces the Python 2-only "except ValueError, e" form
        # and reports a clear failure when no exception is raised.
        with self.assertRaises(ValueError) as raised:
            self.hd.getRepositories(domainId='adomain', repositoryGroupId='doesnotexist')
        self.assertEqual('idDoesNotExist', str(raised.exception))

        with self.assertRaises(ValueError) as raised:
            self.hd.getRepositories(domainId='baddomain')
        self.assertEqual('idDoesNotExist', str(raised.exception))
Exemple #11
0
class HarvesterDataActionsTest(SeecrTestCase):
    """Tests the POST handlers of HarvesterDataActions against a HarvesterData store.

    Every test starts with one domain ('domain'), one repository group ('group')
    and one repository ('repository') created in the test's temp directory.

    Assertions use assertEqual; the assertEquals alias is deprecated and was
    removed from unittest in Python 3.12.
    """

    def setUp(self):
        SeecrTestCase.setUp(self)
        self.hd = HarvesterData(self.tempdir)
        self.hd.addDomain('domain')
        self.hd.addRepositoryGroup('group', domainId='domain')
        self.hd.addRepository('repository', repositoryGroupId='group', domainId='domain')
        self.hda = HarvesterDataActions()
        self.hda.addObserver(self.hd)

    def _postForm(self, path, data):
        """POST `data` form-encoded to `path` on the actions component and drain the response."""
        consume(self.hda.handleRequest(Method='POST', path=path, Body=urlencode(data, doseq=True)))

    def testUpdateRepository(self):
        """A fully filled-in form is stored; the posted repositoryGroupId is ignored."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "baseurl": "http://example.org/oai",
            "set": "ASET",
            "metadataPrefix": "oai_dc",
            "mappingId": "mapping_identifier",
            "targetId": "",
            "collection": "the collection",
            "maximumIgnore": "23",
            "complete": "1",
            "continuous": "60",
            "repositoryAction": "clear",
            "numberOfTimeslots": "0"
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual("http://example.org/oai", repository["baseurl"])
        self.assertEqual("ASET", repository["set"])
        self.assertEqual("oai_dc", repository["metadataPrefix"])
        self.assertEqual("mapping_identifier", repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual("the collection", repository["collection"])
        self.assertEqual(23, repository["maximumIgnore"])
        self.assertEqual(True, repository["complete"])
        self.assertEqual(60, repository["continuous"])
        self.assertEqual(False, repository["use"])
        self.assertEqual("clear", repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testMinimalInfo(self):
        """Posting only the identifying fields leaves every other attribute at its empty value."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('group', repository["repositoryGroupId"])
        self.assertEqual("repository", repository["identifier"])
        self.assertEqual(None, repository["baseurl"])
        self.assertEqual(None, repository["set"])
        self.assertEqual(None, repository["metadataPrefix"])
        self.assertEqual(None, repository["mappingId"])
        self.assertEqual(None, repository["targetId"])
        self.assertEqual(None, repository["collection"])
        self.assertEqual(0, repository["maximumIgnore"])
        self.assertEqual(None, repository["continuous"])
        self.assertEqual(False, repository["complete"])
        self.assertEqual(False, repository["use"])
        self.assertEqual(None, repository["action"])
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedButNotAdded(self):
        """Timeslot fields posted without 'addTimeslot' do not create a shopclosed entry."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual([], repository['shopclosed'])

    def testShopClosedAdded(self):
        """With 'addTimeslot' posted, the slot-0 fields become a new shopclosed entry."""
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "0",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            "addTimeslot": "button pressed",
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['*:*:7:0-*:*:9:0'], repository['shopclosed'])

    def testModifyShopClosed(self):
        """Posting numberOfTimeslots=2 replaces the stored slots with the posted slots 1 and 2."""
        self.updateTheRepository(shopclosed=['1:2:7:0-1:2:9:0', '2:*:7:0-2:*:9:0',])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['3:*:17:0-3:*:19:0', '4:5:9:0-4:5:10:0',], repository['shopclosed'])

    def testDeleteShopClosed(self):
        """Posting deleteTimeslot_1.x/.y drops slot 1 from the stored shopclosed list."""
        self.updateTheRepository(shopclosed=['1:2:7:0-1:2:9:0', '2:*:7:0-2:*:9:0',])
        data = {
            'redirectUri': 'http://example.org',
            "repositoryGroupId": "ignored",
            "identifier": "repository",
            "domainId": "domain",
            "numberOfTimeslots": "2",
            'shopclosedWeek_0': '*',
            'shopclosedWeekDay_0': '*',
            'shopclosedBegin_0': '7',
            'shopclosedEnd_0': '9',
            'shopclosedWeek_1': '3',
            'shopclosedWeekDay_1': '*',
            'shopclosedBegin_1': '17',
            'shopclosedEnd_1': '19',
            'shopclosedWeek_2': '4',
            'shopclosedWeekDay_2': '5',
            'shopclosedBegin_2': '9',
            'shopclosedEnd_2': '10',
            'deleteTimeslot_1.x': '10',
            'deleteTimeslot_1.y': '20',
        }
        self._postForm('/somewhere/updateRepository', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(['4:5:9:0-4:5:10:0',], repository['shopclosed'])

    def testSetRepositoryDone(self):
        """POSTing to repositoryDone resets the repository's pending action to None."""
        self.updateTheRepository(action='refresh')
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual('refresh', repository['action'])

        data = dict(domainId='domain', identifier='repository')
        self._postForm('/somewhere/repositoryDone', data)
        repository = self.hd.getRepository('repository', 'domain')
        self.assertEqual(None, repository['action'])

    def updateTheRepository(self, baseurl='', set='', metadataPrefix='', mappingId='', targetId='', collection='', maximumIgnore=0, use=False, continuous=False, complete=True, action='', shopclosed=None):
        """Write attributes of 'repository' straight into the data store, bypassing the actions.

        `shopclosed=None` is passed on as an empty list.
        """
        self.hd.updateRepository('repository', domainId='domain',
            baseurl=baseurl,
            set=set,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=collection,
            maximumIgnore=maximumIgnore,
            use=use,
            continuous=continuous,
            complete=complete,
            action=action,
            shopclosed=shopclosed or []
        )
Exemple #12
0
 def createHarvesterData(self, id_fn):
     """Build a HarvesterData on the temp dir, backed by an OldDataStore sharing the same id_fn."""
     legacyStore = OldDataStore(self.tempdir, id_fn=id_fn)
     return HarvesterData(self.tempdir, id_fn=id_fn, datastore=legacyStore)
Exemple #13
0
class HarvesterTest(IntegrationTestCase):
    def setUp(self):
        """Wipe harvester log/state/filesystem dirs, reset the helper and save the default repository."""
        IntegrationTestCase.setUp(self)
        self.filesystemDir = join(self.integrationTempdir, 'filesystem')
        for staleDir in (self.harvesterLogDir, self.harvesterStateDir, self.filesystemDir):
            system("rm -rf %s" % staleDir)
        self.controlHelper(action='reset')
        self.emptyDumpDir()

        # Per-domain state and log directories, recreated after the wipe above.
        self.domainStatePath = pathlib.Path(self.harvesterStateDir) / DOMAIN
        self.domainLogPath = pathlib.Path(self.harvesterLogDir) / DOMAIN
        for domainDir in (self.domainStatePath, self.domainLogPath):
            domainDir.mkdir(parents=True)

        dataDir = join(self.integrationTempdir, 'data')
        self.harvesterData = HarvesterData(dataDir)
        try:
            self.harvesterData.addRepositoryGroup(domainId=DOMAIN, identifier=REPOSITORYGROUP)
        except ValueError:
            # Deliberately ignored; presumably raised when the group already
            # exists from an earlier test -- TODO confirm.
            pass
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP)

    def tearDown(self):
        """Remove the default repository saved by setUp, then run the base-class teardown."""
        self.removeRepository(domain=DOMAIN,
                              repositoryId=REPOSITORY,
                              repositoryGroupId=REPOSITORYGROUP)
        IntegrationTestCase.tearDown(self)

    def saveRepository(self,
                       domain,
                       repositoryId,
                       repositoryGroupId,
                       metadataPrefix="oai_dc",
                       action=None,
                       mappingId='MAPPING',
                       targetId='SRUUPDATE',
                       maximumIgnore=5,
                       complete=False,
                       continuous=None,
                       baseUrl=None):
        """Create the repository if needed and (re)write its harvesting attributes.

        A falsy `baseUrl` is replaced by the helper server's local '/oai' URL.
        A ValueError from addRepository is ignored, so the method can be called
        repeatedly for the same repository to change its attributes.
        """
        if not baseUrl:
            baseUrl = 'http://localhost:%s/oai' % self.helperServerPortNumber
        try:
            self.harvesterData.addRepository(
                identifier=repositoryId,
                domainId=domain,
                repositoryGroupId=repositoryGroupId)
        except ValueError:
            pass
        attributes = dict(
            baseurl=baseUrl,
            set=None,
            metadataPrefix=metadataPrefix,
            mappingId=mappingId,
            targetId=targetId,
            collection=None,
            maximumIgnore=maximumIgnore,
            use=True,
            complete=complete,
            continuous=continuous,
            action=action,
            userAgent='',
            authorizationKey='',
        )
        self.harvesterData.updateRepositoryAttributes(
            identifier=repositoryId, domainId=domain, **attributes)

    def removeRepository(self, domain, repositoryId, repositoryGroupId):
        """Delete the given repository from the harvester data store."""
        target = dict(identifier=repositoryId,
                      domainId=domain,
                      repositoryGroupId=repositoryGroupId)
        self.harvesterData.deleteRepository(**target)

    def testHarvestReturnsErrorWillNotSaveState(self):
        """Two runs with an invalid metadataPrefix both issue the same initial ListRecords request."""
        brokenRepository = "repo_invalid_metadataPrefix"
        expectedArguments = {
            'verb': ['ListRecords'],
            'metadataPrefix': ['not_existing']
        }
        logsBefore = self.getLogs()
        self.saveRepository(DOMAIN,
                            brokenRepository,
                            REPOSITORYGROUP,
                            metadataPrefix="not_existing")
        try:
            self.startHarvester(repository=brokenRepository)
            self.startHarvester(repository=brokenRepository)
            newLogs = self.getLogs()[len(logsBefore):]
            self.assertEqual(2, len(newLogs))
            # Exactly two entries (asserted above): check each in order.
            for entry in newLogs:
                self.assertEqual('/oai', entry['path'])
                self.assertEqual(expectedArguments, entry['arguments'])
        finally:
            self.removeRepository(DOMAIN, 'repo_invalid_metadataPrefix',
                                  REPOSITORYGROUP)

    def get_ids(self, ids_name, repository=REPOSITORY):
        """Open the repository's State, fetch its `ids_name` attribute and close the State again."""
        repositoryState = State(self.domainStatePath, self.domainLogPath, repository)
        try:
            requestedIds = getattr(repositoryState, ids_name)
        finally:
            repositoryState.close()
        return requestedIds

    def testHarvestToSruUpdate(self):
        """Harvest in three runs: first batch, resumption-token batch, then nothing left to do."""
        # Files are read through context managers so no handle is left open.
        def _readFile(name):
            with open(name) as fp:
                return fp.read()

        def _lastLine(name):
            with open(name) as fp:
                return fp.readlines()[-1]

        # initial harvest
        oldlogs = self.getLogs()
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        deleteRequests = [
            f for f in listdir(self.dumpDir)
            if "info:srw/action/1/delete" in _readFile(join(self.dumpDir, f))
        ]
        self.assertEqual(2, len(deleteRequests))
        ids = self.get_ids('ids')
        self.assertEqual(8, len(ids))
        invalidIds = self.get_ids('invalidIds')
        self.assertEqual(0, len(invalidIds))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(1, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual(
            {
                'verb': ['ListRecords'],
                'metadataPrefix': ['oai_dc']
            }, logs[-1]['arguments'])
        statsFile = join(self.harvesterStateDir, DOMAIN,
                         '%s.stats' % REPOSITORY)
        token = getResumptionToken(_lastLine(statsFile))

        # resumptionToken
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(15, self.sizeDumpDir())
        ids = self.get_ids('ids')
        self.assertEqual(13, len(ids))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(2, len(logs))
        self.assertEqual('/oai', logs[-1]['path'])
        self.assertEqual({
            'verb': ['ListRecords'],
            'resumptionToken': [token]
        }, logs[-1]['arguments'])

        # Nothing
        output = self.startHarvester(repository=REPOSITORY)
        self.assertEqual('Nothing to do!', what_happened(output))
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(2, len(logs))
        self.assertEqual(None, getResumptionToken(_lastLine(statsFile)))

    def testContinuousHarvest(self):
        """With continuous=1 the harvester keeps requesting: initial, resumptionToken, then 'from'."""
        logsBefore = self.getLogs()
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1)

        def runHarvester():
            self.startHarvester(concurrency=1,
                                runOnce=False,
                                repository=REPOSITORY)

        harvesterThread = Thread(target=runHarvester)
        harvesterThread.start()
        try:
            sleepWheel(5)
            newLogs = self.getLogs()[len(logsBefore):]
            self.assertTrue(len(newLogs) > 2, newLogs)
            initialRequest = {
                'path': '/oai',
                'arguments': {
                    'verb': ['ListRecords'],
                    'metadataPrefix': ['oai_dc']
                }
            }
            self.assertEqual(initialRequest, newLogs[0])
            second, third = newLogs[1], newLogs[2]
            self.assertTrue('resumptionToken' in second['arguments'], second)
            self.assertTrue('from' in third['arguments'], third)
        finally:
            harvesterThread.join()

    def testIncrementalHarvesting(self):
        """A pre-written stats file makes the next harvest send a 'from' of 2011-03-31."""
        logsBefore = self.getLogs()
        statsFile = join(self.harvesterStateDir, DOMAIN,
                         '%s.stats' % REPOSITORY)
        statsLines = [
            'Started: 2011-03-31 13:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-03-31 13:12:36, ResumptionToken: xyz\n',
            'Started: 2011-04-01 14:11:44, Harvested/Uploaded/Deleted/Total: 300/300/0/300, Done: 2011-04-01 14:12:36, ResumptionToken:\n',
        ]
        with open(statsFile, 'w') as f:
            f.writelines(statsLines)

        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        newLogs = self.getLogs()[len(logsBefore):]
        self.assertEqual(1, len(newLogs))
        lastRequest = newLogs[-1]
        self.assertEqual('/oai', lastRequest['path'])
        expectedArguments = {
            'verb': ['ListRecords'],
            'metadataPrefix': ['oai_dc'],
            'from': ['2011-03-31']
        }
        self.assertEqual(expectedArguments, lastRequest['arguments'])

    def testClear(self):
        """After action='clear' the harvested records are deleted and GetStatus reports total 0."""
        def reportedTotal():
            # Ask the harvester's internal server for this repository's status.
            statusQuery = {
                'verb': 'GetStatus',
                'domainId': DOMAIN,
                'repositoryId': REPOSITORY
            }
            header, data = getRequest(self.harvesterInternalServerPortNumber,
                                      '/get', statusQuery)
            return data['response']['GetStatus'][0]['total']

        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(BATCHSIZE, self.sizeDumpDir())
        self.assertEqual(8, reportedTotal())

        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            action='clear')

        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(18, self.sizeDumpDir())
        lastEightFiles = sorted(listdir(self.dumpDir))[-8:]
        for filename in lastEightFiles:
            self.assertTrue('_delete.updateRequest' in filename, filename)

        self.assertEqual(0, reportedTotal())

    def testRefresh(self):
        """A 'refresh' action re-harvests from scratch and finally deletes records no longer offered.

        The harvester log is pre-filled with four uploaded and four deleted
        identifiers before the refresh is requested.
        """
        def _lastLine(name):
            # Context manager so the stats file handle is closed right away.
            with open(name) as fp:
                return fp.readlines()[-1]

        oldlogs = self.getLogs()
        log = State(stateDir=join(self.harvesterStateDir, DOMAIN),
                    logDir=join(self.harvesterLogDir, DOMAIN),
                    name=REPOSITORY).getHarvesterLog()
        log.startRepository()
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [1, 7, 120, 121]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [4, 5, 122, 123]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()

        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            action='refresh')

        # First run after the refresh request issues no OAI request yet.
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()[len(oldlogs):]
        self.assertEqual(0, len(logs))
        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()
        self.assertEqual('/oai', logs[-1]["path"])
        self.assertEqual(
            {
                'verb': ['ListRecords'],
                'metadataPrefix': ['oai_dc']
            }, logs[-1]["arguments"])
        statsFile = join(self.harvesterStateDir, DOMAIN,
                         '%s.stats' % REPOSITORY)
        token = getResumptionToken(_lastLine(statsFile))

        self.startHarvester(repository=REPOSITORY)
        logs = self.getLogs()
        self.assertEqual('/oai', logs[-1]["path"])
        self.assertEqual({
            'verb': ['ListRecords'],
            'resumptionToken': [token]
        }, logs[-1]["arguments"])
        self.assertEqual(15, self.sizeDumpDir())

        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(17, self.sizeDumpDir())
        deleteFiles = [
            join(self.dumpDir, f) for f in listdir(self.dumpDir)
            if '_delete' in f
        ]
        deletedIds = set()
        for deleteFile in deleteFiles:
            # Parse and query while the file is open; close it afterwards.
            with open(deleteFile) as fp:
                deletedIds.add(
                    xpathFirst(parse(fp), '//ucp:recordIdentifier/text()'))
        self.assertEqual(
            set([
                '%s:oai:record:03' % REPOSITORY,
                '%s:oai:record:06' % REPOSITORY,
                '%s:oai:record:120' % REPOSITORY,
                '%s:oai:record:121' % REPOSITORY
            ]), deletedIds)

        logs = self.getLogs()[len(oldlogs):]
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(len(logs), len(self.getLogs()[len(oldlogs):]),
                         'Action is over, expect nothing more.')

    def testInvalidIgnoredUptoMaxIgnore(self):
        """Invalid records are ignored up to maxIgnore + 1; a later valid harvest clears them."""
        maxIgnore = 5
        self.controlHelper(action='allInvalid')
        nrOfDeleted = 2
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(nrOfDeleted, self.sizeDumpDir())
        self.assertEqual(0, len(self.get_ids('ids')))
        invalidIds = self.get_ids('invalidIds')
        self.assertEqual(maxIgnore + 1, len(invalidIds), invalidIds)
        invalidDataMessagesDir = join(self.harvesterLogDir, DOMAIN, "invalid",
                                      REPOSITORY)
        self.assertEqual(maxIgnore + 1, len(listdir(invalidDataMessagesDir)))
        # Read through a context manager so the file handle is not leaked.
        with open(join(invalidDataMessagesDir, "oai:record:01")) as fp:
            invalidDataMessage01 = fp.read()
        # NOTE(review): assertTrue with a non-empty string as first argument
        # always passes; the second argument is only the failure message.
        # This was probably meant as a containment check, but the expected
        # text cannot be confirmed from here, so the call is left unchanged.
        self.assertTrue('uploadId: "integrationtest:oai:record:01"',
                        invalidDataMessage01)
        self.controlHelper(action='noneInvalid')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(nrOfDeleted + BATCHSIZE, self.sizeDumpDir())
        ids = self.get_ids('ids')
        self.assertEqual(BATCHSIZE - nrOfDeleted, len(ids))
        invalidIds = self.get_ids('invalidIds')
        self.assertEqual(0, len(invalidIds), invalidIds)
        self.assertEqual(0, len(listdir(invalidDataMessagesDir)))

    def testHarvestToFilesystemTarget(self):
        """Harvesting to the FILESYSTEM target writes 8 records and lists records 03 and 06 as deleted."""
        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)

        self.assertEqual(
            8,
            len(listdir(join(self.filesystemDir, REPOSITORYGROUP,
                             REPOSITORY))))
        # Context manager closes the file; 'line' avoids shadowing builtin id().
        with open(join(self.filesystemDir, 'deleted_records')) as fp:
            deletedRecords = [line.strip() for line in fp]
        self.assertEqual(
            ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [3, 6]],
            deletedRecords)

    def testClearOnFilesystemTarget(self):
        """A 'clear' on the FILESYSTEM target empties the repository dir and lists all ten records as deleted."""
        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            targetId='FILESYSTEM')
        self.startHarvester(repository=REPOSITORY)

        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            targetId='FILESYSTEM',
                            action='clear')
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(
            0,
            len(listdir(join(self.filesystemDir, REPOSITORYGROUP,
                             REPOSITORY))))
        # Context manager closes the file; 'line' avoids shadowing builtin id().
        with open(join(self.filesystemDir, 'deleted_records')) as fp:
            deletedRecords = set(line.strip() for line in fp)
        self.assertEqual(
            set([
                'harvestertestrepository:oai:record:10',
                'harvestertestrepository:oai:record:09',
                'harvestertestrepository:oai:record:08',
                'harvestertestrepository:oai:record:07',
                'harvestertestrepository:oai:record:06',
                'harvestertestrepository:oai:record:05',
                'harvestertestrepository:oai:record:04',
                'harvestertestrepository:oai:record:03',
                'harvestertestrepository:oai:record:02%2F&gkn',
                'harvestertestrepository:oai:record:01'
            ]), deletedRecords)

    def testHarvestWithError(self):
        """An exception on record 12 stops the second batch after one upload; a retry uploads all five."""
        self.startHarvester(repository=REPOSITORY)
        self.emptyDumpDir()

        # Make the helper raise on record 12 of the second batch.
        failingId = '%s:oai:record:12' % REPOSITORY
        self.controlHelper(action='raiseExceptionOnIds', id=[failingId])
        self.startHarvester(repository=REPOSITORY)
        uploadedBeforeFailure = ['oai:record:11']
        self.assertEqual(len(uploadedBeforeFailure), self.sizeDumpDir())
        self.emptyDumpDir()

        # Without the injected failure the whole second batch comes through.
        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.startHarvester(repository=REPOSITORY)
        expectedSecondBatch = 5
        self.assertEqual(expectedSecondBatch, self.sizeDumpDir())

    def testClearWithError(self):
        """A 'clear' interrupted on record 05 deletes three records; the next run deletes the other five."""
        self.startHarvester(repository=REPOSITORY)

        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            action='clear')
        failingId = '%s:oai:record:05' % REPOSITORY
        self.controlHelper(action='raiseExceptionOnIds', id=[failingId])
        self.emptyDumpDir()

        self.startHarvester(repository=REPOSITORY)
        deletedBeforeFailure = [1, 2, 4]
        stillToDelete = [5, 7, 8, 9, 10]
        self.assertEqual(len(deletedBeforeFailure), self.sizeDumpDir())

        # Lift the injected failure and let the clear finish.
        self.controlHelper(action='raiseExceptionOnIds', id=[])
        self.emptyDumpDir()
        self.assertEqual(0, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY)
        self.assertEqual(len(stillToDelete), self.sizeDumpDir())

    def testRefreshWithIgnoredRecords(self):
        """A refresh over a log holding uploaded, deleted and ignored ids ends with no invalid ids left.

        The harvester log is pre-filled with four uploads (record 02 under its
        '02/&gkn' identifier), five deletes and six ignored invalid records.
        """
        log = State(stateDir=join(self.harvesterStateDir, DOMAIN),
                    logDir=join(self.harvesterLogDir, DOMAIN),
                    name=REPOSITORY).getHarvesterLog()
        log.startRepository()
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [1, 2, 120, 121]
        ]:
            if uploadId == '%s:oai:record:02' % (REPOSITORY):
                uploadId = '%s:oai:record:02/&gkn' % (REPOSITORY)
            log.notifyHarvestedRecord(uploadId)
            log.uploadIdentifier(uploadId)
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [4, 5, 122, 123, 124]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.deleteIdentifier(uploadId)
        for uploadId in [
                '%s:oai:record:%02d' % (REPOSITORY, i)
                for i in [7, 8, 125, 126, 127, 128]
        ]:
            log.notifyHarvestedRecord(uploadId)
            log.logInvalidData(uploadId, 'ignored message')
            log.logIgnoredIdentifierWarning(uploadId)
        log.endRepository('token', '2012-01-01T09:00:00Z')
        log.close()
        # The original also set an unused local "oldDeletes = 3"; it took no
        # part in any assertion and has been dropped.
        totalRecords = 15
        oldUploads = 2
        oldIgnoreds = 4

        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            action='refresh')

        self.startHarvester(repository=REPOSITORY)  # Smooth init
        self.assertEqual(0, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY)  # Smooth harvest
        self.startHarvester(repository=REPOSITORY)  # Smooth harvest
        self.assertEqual(totalRecords, self.sizeDumpDir())
        self.startHarvester(repository=REPOSITORY)  # Smooth finish
        self.assertEqual(totalRecords + oldUploads + oldIgnoreds,
                         self.sizeDumpDir())
        invalidIds = self.get_ids('invalidIds')
        self.assertEqual(0, len(invalidIds), invalidIds)
        self.assertEqual(13, len(self.get_ids('ids')))

    def testClearWithInvalidRecords(self):
        """A 'clear' produces one dump file per old upload and old invalid id and empties both id sets.

        The harvester log is pre-filled with four uploads, five deletes and six
        ignored invalid records before the clear is requested.
        """
        state = State(stateDir=join(self.harvesterStateDir, DOMAIN),
                      logDir=join(self.harvesterLogDir, DOMAIN),
                      name=REPOSITORY)
        try:
            log = state.getHarvesterLog()
            log.startRepository()
            for uploadId in [
                    '%s:oai:record:%02d' % (REPOSITORY, i)
                    for i in [1, 2, 120, 121]
            ]:
                log.notifyHarvestedRecord(uploadId)
                log.uploadIdentifier(uploadId)
            for uploadId in [
                    '%s:oai:record:%02d' % (REPOSITORY, i)
                    for i in [4, 5, 122, 123, 124]
            ]:
                log.notifyHarvestedRecord(uploadId)
                log.deleteIdentifier(uploadId)
            for uploadId in [
                    '%s:oai:record:%02d' % (REPOSITORY, i)
                    for i in [7, 8, 125, 126, 127, 128]
            ]:
                log.notifyHarvestedRecord(uploadId)
                log.logInvalidData(uploadId, 'ignored message')
                log.logIgnoredIdentifierWarning(uploadId)
            log.endRepository('token', '2012-01-01T09:00:00Z')
            log.close()
            # The original also set an unused local "oldDeletes = 5"; it took
            # no part in any assertion and has been dropped.
            oldUploads = 4
            oldInvalids = 6
            self.saveRepository(DOMAIN,
                                REPOSITORY,
                                REPOSITORYGROUP,
                                action='clear')

            self.startHarvester(repository=REPOSITORY)
            self.assertEqual(oldUploads + oldInvalids, self.sizeDumpDir())
            self.assertEqual(0, len(state.invalidIds),
                             state.invalidIds.getIds())
            self.assertEqual(0, len(state.ids), state.ids.getIds())
        finally:
            state.close()

    def testConcurrentHarvestToSruUpdate(self):
        """With concurrency=3 the dumped records of the three repositories are interleaved."""
        self.startHarvester(concurrency=3)

        requestsLogged = sorted(listdir(self.dumpDir))

        repositoryIds = []
        for f in requestsLogged:
            # Parse and query while the file is open; close it afterwards.
            with open(join(self.dumpDir, f)) as fp:
                recordIdentifier = xpath(parse(fp),
                                         '//ucp:recordIdentifier/text()')[0]
            repositoryIds.append(recordIdentifier.split(':', 1)[0])

        repositoryIdsSet = set(repositoryIds)
        self.assertEqual(
            set(['repository2', 'integrationtest', 'harvestertestrepository']),
            repositoryIdsSet)

        # Each time the repository changes between consecutive records, remove
        # it from the set. Removing one a second time raises KeyError, which
        # means that repository's records were not contiguous.
        lastSeenRepoId = None
        try:
            for repo in repositoryIds:
                if repo != lastSeenRepoId:
                    repositoryIdsSet.remove(repo)
                    lastSeenRepoId = repo
        except KeyError:
            pass
        else:
            self.fail('Records should have been inserted out-of-order.')

    def testConcurrentHarvestToSruUpdateBUG(self):
        """With complete=True and concurrency=1 the default repository dumps 15 records, the others 10 each."""
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)

        self.startHarvester(concurrency=1)

        requestsLogged = sorted(listdir(self.dumpDir))
        repositoryIds = []
        for f in requestsLogged:
            # Parse and query while the file is open; close it afterwards.
            with open(join(self.dumpDir, f)) as fp:
                recordIdentifier = xpath(parse(fp),
                                         '//ucp:recordIdentifier/text()')[0]
            repositoryIds.append(recordIdentifier.split(':', 1)[0])
        self.assertEqual(15, repositoryIds.count(REPOSITORY))
        self.assertEqual(10, repositoryIds.count('repository2'))
        self.assertEqual(10, repositoryIds.count('integrationtest'))

    def testStartHarvestingAddedRepository(self):
        """A repository added while the harvester runs shows up in the harvester's output log."""
        t = Thread(
            target=lambda: self.startHarvester(concurrency=1, runOnce=False))
        t.start()

        # Wait until the running harvester has produced its first dump file.
        while not listdir(self.dumpDir):
            sleep(0.1)

        self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        stdoutfile = join(self.integrationTempdir,
                          "stdouterr-meresco-harvester-harvester.log")
        sleepWheel(5)
        try:
            # Read inside the try so a failing read still removes 'xyz' and
            # joins the thread; the context manager closes the log file.
            with open(stdoutfile) as fp:
                log = fp.read()
            self.assertTrue('xyz' in log, log)
        finally:
            self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
            t.join()

    def testDontHarvestDeletedRepository(self):
        """The number of '[xyz]' log entries is the same before and right after removing 'xyz'."""
        def _readFile(name):
            with open(name) as fp:
                return fp.read()

        def runHarvester():
            self.startHarvester(concurrency=1, runOnce=False)

        stdoutfile = join(self.integrationTempdir,
                          "stdouterr-meresco-harvester-harvester.log")
        self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        harvesterThread = Thread(target=runHarvester)
        harvesterThread.start()

        # Wait for the first dump file, then give the harvester another second.
        while not listdir(self.dumpDir):
            sleep(0.1)
        sleepWheel(1)

        countBefore = _readFile(stdoutfile).count('[xyz]')

        self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        logAfter = _readFile(stdoutfile)
        try:
            countAfter = logAfter.count('[xyz]')
            failureMessage = "%s!=%s\n%s" % (countBefore, countAfter, logAfter)
            self.assertEqual(countBefore, countAfter, failureMessage)
        finally:
            harvesterThread.join()

    def testErrorReportedToGustos(self):
        """A harvest of a truncated OAI response is reported as one error.

        A deliberately cut-off ListRecords document is written to a local
        file and used (via a ``file://`` baseUrl) as the repository to
        harvest. After a 5 second wait, the newest entry the gustos UDP
        listener received for this repository group/repository must contain
        ``{"count": 1}`` under 'errors'.
        """
        baseUrl = join(self.integrationTempdir, "choppy_oai.xml")
        filename = "{}?verb=ListRecords&metadataPrefix=oai_dc".format(baseUrl)
        with open(filename, "w") as fp:
            fp.write("""<?xml version="1.0" encoding="UTF-8"?>
            <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2017-10-31T15:12:52Z</responseDate><request from="2017-10-04T11:52:57Z" metadataPrefix="didl_mods" verb="ListRecords">https://surfsharekit.nl/oai/hhs/</request><ListRecords><record><header><identifier>oai:surfsharekit.nl:b6ea6503-e2fc-4974-8941-2a4a405dc72f</identifier><datestamp>2017-10-04T14:16:22Z</datestamp></header><metadata><didl:DIDL xmlns:didl="urn:mpeg:mpeg21:2002:02-DIDL-NS" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
              <didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <dii:Identifier xmlns:dii="urn:mpeg:mpeg21:2002:01-DII-NS">urn:nbn:nl:hs:18-b6ea6503-e2fc-4974-8941-2a4a405dc72f</dii:Identifier>
                                      </didl:Statement> </didl:Descrip""")

        # Only log lines received after this point belong to this test.
        errorCount = len(self.gustosUdpListener.log())
        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            baseUrl="file://{}".format(baseUrl))
        t = Thread(
            target=lambda: self.startHarvester(concurrency=1, runOnce=True))
        t.start()

        try:
            sleepWheel(5)
            last_logs = [
                JsonDict.loads(line)['data']
                for line in self.gustosUdpListener.log()[errorCount:]
            ]
            # Start from None so that "nothing reported" fails with a clear
            # assertion instead of a NameError/TypeError further down.
            my_group_log = None
            for data in reversed(last_logs):
                my_group_log = data.get(
                    f'Harvester ({DOMAIN})',
                    {}).get(f'{REPOSITORYGROUP}:{REPOSITORY}')
                if my_group_log is not None:
                    break
            self.assertIsNotNone(
                my_group_log,
                "No gustos report found for %s:%s in %s" %
                (REPOSITORYGROUP, REPOSITORY, last_logs))
            self.assertEqual({"count": 1}, my_group_log['errors'])
        finally:
            # Do not let the harvester thread outlive this test.
            t.join()

    def testConcurrencyAtLeastOne(self):
        """Concurrency 0 and -1 both produce the 'at least 1' message.

        Each run is started with ``expectedReturnCode=2`` and its output must
        contain "Concurrency must be at least 1".
        """
        expectedMessage = "Concurrency must be at least 1"
        for invalidConcurrency in (0, -1):
            output = self.startHarvester(concurrency=invalidConcurrency,
                                         expectedReturnCode=2)
            self.assertTrue(expectedMessage in output, output)

    def testCompleteInOnAttempt(self):
        """With ``complete=True`` a single run leaves 15 files in the dump dir.

        The harvester output must also announce that the repository will be
        completed in one attempt.
        """
        self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, complete=True)

        harvesterOutput = self.startHarvester(
            repository=REPOSITORY,
            runOnce=True,
            timeoutInSeconds=5,
        )

        self.assertEqual(15, self.sizeDumpDir())
        announcement = "Repository will be completed in one attempt"
        self.assertTrue(announcement in harvesterOutput, harvesterOutput)

    def testHarvestingContinues4Ever(self):
        """Run with ``runOnce=False`` and a 5 second timeout.

        If the run ends with a SystemExit, its message must mention that it
        took more than 5 seconds. In any case 15 files are expected in the
        dump directory afterwards.
        """
        harvesterArguments = dict(repository=REPOSITORY,
                                  runOnce=False,
                                  timeoutInSeconds=5)
        try:
            self.startHarvester(**harvesterArguments)
        except SystemExit as timeout:
            message = str(timeout)
            self.assertTrue('took more than 5 seconds' in message, message)
        self.assertEqual(15, self.sizeDumpDir())

    def testBadOai(self):
        """The '/badoai/responsedate' helper hands out consecutive tokens.

        A plain ListRecords request must yield resumption token 'resume0';
        following up with that token must yield 'resume1'.
        """
        tokenXPath = '/oai:OAI-PMH/oai:ListRecords/oai:resumptionToken/text()'
        requestSteps = [
            (dict(verb='ListRecords', metadataPrefix='prefix'), 'resume0'),
            (dict(verb='ListRecords', resumptionToken='resume0'), 'resume1'),
        ]
        for requestArguments, expectedToken in requestSteps:
            # Only the response body is inspected; the header is ignored.
            _, body = getRequest(port=self.helperServerPortNumber,
                                 path='/badoai/responsedate',
                                 arguments=requestArguments)
            self.assertEqual(expectedToken, xpathFirst(body, tokenXPath))

    def testNormalHarvesting(self):
        """Three consecutive runs: 10 files, then 15, then nothing new."""
        # (expected outcome of the run, expected dump dir size afterwards)
        expectedRuns = (
            ('Harvested.', 10),
            ('Harvested.', 15),
            ('Nothing to do!', 15),
        )
        for expectedOutcome, expectedSize in expectedRuns:
            output = self.startHarvester(repository=REPOSITORY)
            self.assertEqual(expectedOutcome, what_happened(output))
            self.assertEqual(expectedSize, self.sizeDumpDir())

    def saveBadoai(self, **kwargs):
        """Save the test repository pointing at the '/badoai/responsedate' helper.

        The repository is saved with metadataPrefix 'prefix'; any extra
        keyword arguments are passed on to ``saveRepository`` unchanged.
        """
        badoaiUrl = 'http://localhost:{}/badoai/responsedate'.format(
            self.helperServerPortNumber)
        self.saveRepository(DOMAIN,
                            REPOSITORY,
                            REPOSITORYGROUP,
                            baseUrl=badoaiUrl,
                            metadataPrefix='prefix',
                            **kwargs)

    def testWithStrangeResponseDate(self):
        """Without ``complete`` each run adds one file until three are dumped.

        The fourth run must report that there is nothing left to do.
        """
        self.saveBadoai(complete=False)
        # (expected outcome of the run, expected dump dir size afterwards)
        expectedRuns = (
            ('Harvested.', 1),
            ('Harvested.', 2),
            ('Harvested.', 3),
            ('Nothing to do!', 3),
        )
        for expectedOutcome, expectedSize in expectedRuns:
            output = self.startHarvester(repository=REPOSITORY)
            self.assertEqual(expectedOutcome, what_happened(output))
            self.assertEqual(expectedSize, self.sizeDumpDir())
        # Problem is that the harvester wants to continue because responsedate is in the past. It should
        # use a separate date to determine if it has done enough for the day.

    def testCompleteWithStrangeResponseDate(self):
        """With ``complete=True`` the first run dumps all three files.

        The second run must report that there is nothing left to do.
        """
        self.saveBadoai(complete=True)
        # (expected outcome of the run, expected dump dir size afterwards)
        expectedRuns = (
            ('Harvested.', 3),
            ('Nothing to do!', 3),
        )
        for expectedOutcome, expectedSize in expectedRuns:
            output = self.startHarvester(repository=REPOSITORY)
            self.assertEqual(expectedOutcome, what_happened(output))
            self.assertEqual(expectedSize, self.sizeDumpDir())

        # Further testing (not implemented yet): after a save(action='refresh'),
        # two consecutive runs of repository 'responsedate' are each expected
        # to report 'Harvested.':
        #   save(action='refresh')
        #   output = self.startHarvester(repository='responsedate')
        #   self.assertEqual('Harvested.', what_happened(output))
        #   output = self.startHarvester(repository='responsedate')
        #   self.assertEqual('Harvested.', what_happened(output))

    def emptyDumpDir(self):
        """Remove every file from the dump directory.

        Files are removed one by one instead of through a shell ``rm dir/*``,
        so paths containing spaces or shell metacharacters, hidden files and
        very large numbers of files are all handled, and a failing removal
        raises instead of being ignored. Sub-directories are left alone, as
        a plain ``rm`` would not remove them either.
        """
        # Local imports: only ``listdir`` and ``join`` are known to be
        # available at module level.
        from os import remove
        from os.path import isdir, islink

        for name in listdir(self.dumpDir):
            path = join(self.dumpDir, name)
            # A symlink to a directory is itself removable; a real directory
            # is not.
            if isdir(path) and not islink(path):
                continue
            remove(path)

    def sizeDumpDir(self):
        """Return the number of entries currently in the dump directory."""
        dumpedEntries = listdir(self.dumpDir)
        return len(dumpedEntries)