def test_oai_harvest(self):
    """
    Test harvesting.

    Two calls to ``oai_harvest`` are made against a dummy URL, writing into
    the system temporary directory:

    * with ``download_failed`` as the download hook, the call is expected
      to raise ``RuntimeError``;
    * with ``download_flaky(probability=0.5)`` and ``max_retries=100``, the
      call is expected to complete without raising.
    """
    # assertRaises fails the test if no RuntimeError is raised and lets any
    # other exception type propagate, same as the manual flag it replaces.
    with self.assertRaises(RuntimeError):
        oai_harvest(url='http://nirvana',
                    directory=tempfile.gettempdir(),
                    download=download_failed)

    # Original author's estimate of the probability of this call failing
    # spuriously: 7.888609052210118e-29.
    oai_harvest(url='http://nirvana',
                directory=tempfile.gettempdir(),
                max_retries=100,
                download=download_flaky(probability=0.5))
def run(self):
    """
    Harvest files for a certain timeframe in a temporary directory,
    then combine all records into a single file.

    ``oai_harvest`` is pointed at a fresh temporary directory. Every
    ``record`` element found in the files under that directory is then
    written to this task's output, wrapped in one ``<collection>`` element.
    The temporary directory is removed afterwards, whether or not the
    harvest or the merge succeeded.
    """
    import shutil  # local import: only needed for the cleanup below

    stopover = tempfile.mkdtemp(prefix='tasktree-')
    try:
        oai_harvest(url="http://oai.bnf.fr/oai2/OAIHandler",
                    begin=self.begin, end=self.end, prefix=self.prefix,
                    directory=stopover, collection=self.collection)

        with self.output().open('w') as output:
            output.write("""<collection xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">""")
            for path in iterfiles(stopover):
                with open(path) as handle:
                    soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                    for record in soup.findAll('record'):
                        output.write(str(record))  # or unicode?
            output.write('</collection>\n')
    finally:
        # mkdtemp() never deletes what it creates; without this every run
        # leaves a directory of harvested files behind. Errors are ignored
        # so a failed cleanup cannot mask an exception from the harvest.
        shutil.rmtree(stopover, ignore_errors=True)
def run(self):
    """
    Harvest records from ``self.url`` into a temporary directory, then
    combine all records into a single output file.

    ``oai_harvest`` is called with this task's ``url``, ``begin``, ``end``,
    ``prefix``, ``collection`` and ``delay`` and writes into a fresh
    temporary directory. Every ``record`` element found in the files under
    that directory is then written to this task's output, wrapped in one
    ``<collection>`` element. The temporary directory is removed
    afterwards, whether or not the harvest or the merge succeeded.
    """
    import shutil  # local import: only needed for the cleanup below

    stopover = tempfile.mkdtemp(prefix='gluish-')
    try:
        oai_harvest(url=self.url, begin=self.begin, end=self.end,
                    prefix=self.prefix, directory=stopover,
                    collection=self.collection, delay=self.delay)

        with self.output().open('w') as output:
            output.write("""<collection xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> """)
            for path in iterfiles(stopover):
                with open(path) as handle:
                    soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                    for record in soup.findAll('record'):
                        output.write(str(record))  # or unicode?
            output.write('</collection>\n')
    finally:
        # mkdtemp() never deletes what it creates; without this every run
        # leaves a directory of harvested files behind. Errors are ignored
        # so a failed cleanup cannot mask an exception from the harvest.
        shutil.rmtree(stopover, ignore_errors=True)