def testHarvestToSruUpdate(self):
    """Harvest REPOSITORY in three consecutive runs and check each step.

    Run 1 is expected to issue a single ListRecords request with
    metadataPrefix oai_dc, run 2 a ListRecords request carrying the
    resumption token found on the last line of the stats file, and run 3
    no request at all, leaving no resumption token in the stats file.
    """
    def readLines(path):
        # Read through a context manager so the handle is closed right
        # away instead of being left to the garbage collector.
        with open(path) as f:
            return f.readlines()

    def readText(path):
        with open(path) as f:
            return f.read()

    stateDir = join(self.harvesterStateDir, DOMAIN)
    idsFile = join(stateDir, "%s.ids" % REPOSITORY)
    invalidIdsFile = join(stateDir, "%s_invalid.ids" % REPOSITORY)
    statsFile = join(stateDir, '%s.stats' % REPOSITORY)

    # initial harvest
    oldlogs = self.getLogs()
    self.startHarvester(repository=REPOSITORY)
    self.assertEqual(BATCHSIZE, self.sizeDumpDir())
    deleteRequests = [
        name for name in listdir(self.dumpDir)
        if "info:srw/action/1/delete" in readText(join(self.dumpDir, name))
    ]
    self.assertEqual(2, len(deleteRequests))
    self.assertEqual(8, len(readLines(idsFile)))
    self.assertEqual(0, len(readLines(invalidIdsFile)))
    # Only look at requests logged since this test started.
    logs = self.getLogs()[len(oldlogs):]
    self.assertEqual(1, len(logs))
    self.assertEqual('/oai', logs[-1]['path'])
    self.assertEqual({'verb':['ListRecords'], 'metadataPrefix':['oai_dc']}, logs[-1]['arguments'])
    token = getResumptionToken(readLines(statsFile)[-1])

    # resumptionToken
    self.startHarvester(repository=REPOSITORY)
    self.assertEqual(15, self.sizeDumpDir())
    self.assertEqual(13, len(readLines(idsFile)))
    logs = self.getLogs()[len(oldlogs):]
    self.assertEqual(2, len(logs))
    self.assertEqual('/oai', logs[-1]['path'])
    self.assertEqual({'verb':['ListRecords'], 'resumptionToken':[token]}, logs[-1]['arguments'])

    # Nothing
    self.startHarvester(repository=REPOSITORY)
    logs = self.getLogs()[len(oldlogs):]
    self.assertEqual(2, len(logs))
    self.assertEqual(None, getResumptionToken(readLines(statsFile)[-1]))
def testReadResumptionTokenFromStats(self):
    """Check getResumptionToken against sample stats log lines.

    Each case pairs a log line with the token expected from it: the text
    after 'ResumptionToken: ' (with or without a trailing newline, and
    with an embedded space), or None for an 'Error' line and for a
    literal 'ResumptionToken: None'.
    """
    cases = [
        (' Started: 2005-01-02 16:12:56, Harvested/Uploaded: 199/ 200, Done: 2005-01-02 16:13:45, ResumptionToken: ^^^oai_dc^45230',
         '^^^oai_dc^45230'),
        ('Started: 1999-12-01 16:37:41, Harvested/Uploaded: 113/ 113, Error: XXX\n',
         None),
        (' Started: 2005-01-02 16:12:56, Harvested/Uploaded: 199/ 200, Done: 2005-01-02 16:13:45, ResumptionToken: None',
         None),
        (' Started: 2005-01-02 16:12:56, Harvested/Uploaded: 199/ 200, Done: 2005-01-02 16:13:45, ResumptionToken: ^^^oai_dc^45230\n',
         '^^^oai_dc^45230'),
        (' Started: 2005-01-02 16:12:56, Harvested/Uploaded: 199/ 200, Done: 2005-01-02 16:13:45, ResumptionToken: ^^^oai_dc^452 30\n',
         '^^^oai_dc^452 30'),
    ]
    for logline, expected in cases:
        # The log line doubles as the failure message so a failing case
        # can be identified without a debugger.
        self.assertEqual(expected, getResumptionToken(logline), repr(logline))
def testRefresh(self):
    """Exercise the 'refresh' action on a repository with existing state.

    A HarvesterLog is first filled with four uploaded and four deleted
    identifiers, then the repository is saved with action='refresh'.
    The following harvester runs are expected to: issue no request,
    start a fresh ListRecords harvest, continue it with the resumption
    token from the stats file, write delete files for records 03, 06,
    120 and 121, and finally issue no further requests.
    """
    oldlogs = self.getLogs()
    stateDir = join(self.harvesterStateDir, DOMAIN)

    # Seed the harvester state as if an earlier harvest had happened.
    log = HarvesterLog(stateDir=stateDir, logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
    log.startRepository()
    for number in [1,7,120,121]:
        uploadId = '%s:oai:record:%02d' % (REPOSITORY, number)
        log.notifyHarvestedRecord(uploadId)
        log.uploadIdentifier(uploadId)
    for number in [4,5,122,123]:
        uploadId = '%s:oai:record:%02d' % (REPOSITORY, number)
        log.notifyHarvestedRecord(uploadId)
        log.deleteIdentifier(uploadId)
    log.endRepository('token', '2012-01-01T09:00:00Z')
    log.close()

    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

    # First run after the refresh request: no request is expected yet.
    self.startHarvester(repository=REPOSITORY)
    logs = self.getLogs()[len(oldlogs):]
    self.assertEqual(0, len(logs))

    # Second run: harvest from the beginning.
    self.startHarvester(repository=REPOSITORY)
    logs = self.getLogs()
    self.assertEqual('/oai', logs[-1]["path"])
    self.assertEqual({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}, logs[-1]["arguments"])
    statsFile = join(stateDir, '%s.stats' % REPOSITORY)
    # Close the stats file deterministically instead of leaking the handle.
    with open(statsFile) as stats:
        token = getResumptionToken(stats.readlines()[-1])

    # Third run: continue with the resumption token.
    self.startHarvester(repository=REPOSITORY)
    logs = self.getLogs()
    self.assertEqual('/oai', logs[-1]["path"])
    self.assertEqual({'verb': ['ListRecords'], 'resumptionToken': [token]}, logs[-1]["arguments"])
    self.assertEqual(15, self.sizeDumpDir())

    # Fourth run: two more files are expected in the dump directory.
    self.startHarvester(repository=REPOSITORY)
    self.assertEqual(17, self.sizeDumpDir())
    deleteFiles = [join(self.dumpDir, name) for name in listdir(self.dumpDir) if '_delete' in name]
    deletedIds = set()
    for path in deleteFiles:
        with open(path) as deleteFile:
            deletedIds.add(xpathFirst(parse(deleteFile), '//ucp:recordIdentifier/text()'))
    self.assertEqual(set(['%s:oai:record:03' % REPOSITORY, '%s:oai:record:06' % REPOSITORY, '%s:oai:record:120' % REPOSITORY, '%s:oai:record:121' % REPOSITORY]), deletedIds)

    # Fifth run: the number of logged requests must not grow any further.
    logs = self.getLogs()[len(oldlogs):]
    self.startHarvester(repository=REPOSITORY)
    self.assertEqual(len(logs), len(self.getLogs()[len(oldlogs):]), 'Action is over, expect nothing more.')