Example #1
0
    def test_oai_harvest(self):
        """ Test harvesting.

        Covers two cases: a harvest with the `download_failed` callable is
        expected to raise RuntimeError, and a harvest with a flaky
        download callable is expected to complete when given enough
        retries. """
        # Callable form of assertRaises: fails the test if no RuntimeError
        # is raised, and lets any other exception type propagate.
        self.assertRaises(RuntimeError, oai_harvest,
                          url='http://nirvana',
                          directory=tempfile.gettempdir(),
                          download=download_failed)

        # Must not raise. Assuming each attempt fails independently with
        # the given probability and max_retries bounds the number of
        # attempts, the chance of a spurious failure is on the order of
        # 0.5 ** 100 (~7.9e-31) -- TODO confirm against download_flaky.
        oai_harvest(url='http://nirvana', directory=tempfile.gettempdir(),
                    max_retries=100, download=download_flaky(probability=0.5))
Example #2
0
 def run(self):
     """ Harvest files for a certain timeframe in a temporary
     directory, then combine all records into a single file.

     Every `record` element found in the harvested files is written,
     wrapped in a single `collection` element, to `self.output()`.
     The temporary directory is removed afterwards, whether or not the
     harvest or the merge succeeded. """
     # Function-scope import: only needed for the cleanup below.
     import shutil

     stopover = tempfile.mkdtemp(prefix='tasktree-')
     try:
         oai_harvest(url="http://oai.bnf.fr/oai2/OAIHandler",
                     begin=self.begin, end=self.end, prefix=self.prefix,
                     directory=stopover, collection=self.collection)
         with self.output().open('w') as output:
             output.write("""<collection
             xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">""")
             # presumably iterfiles yields the paths of the files below
             # the given directory -- verify against its definition
             for path in iterfiles(stopover):
                 with open(path) as handle:
                     soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                     for record in soup.findAll('record'):
                         # NOTE(review): str() vs. unicode() is an open
                         # question carried over from the original code.
                         output.write(str(record))
             output.write('</collection>\n')
     finally:
         # mkdtemp leaves deletion to the caller; clean up best-effort so
         # a cleanup problem never masks an error raised above.
         shutil.rmtree(stopover, ignore_errors=True)
Example #3
0
    def run(self):
        """ Harvest from `self.url` into a temporary directory, then
        combine all records into a single file.

        Every `record` element found in the harvested files is written,
        wrapped in a single `collection` element, to `self.output()`.
        The temporary directory is removed afterwards, whether or not the
        harvest or the merge succeeded. """
        # Function-scope import: only needed for the cleanup below.
        import shutil

        stopover = tempfile.mkdtemp(prefix='gluish-')
        try:
            oai_harvest(url=self.url, begin=self.begin, end=self.end,
                        prefix=self.prefix, directory=stopover,
                        collection=self.collection, delay=self.delay)

            with self.output().open('w') as output:
                output.write("""<collection
                xmlns="http://www.openarchives.org/OAI/2.0/"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
            """)
                # presumably iterfiles yields the paths of the files below
                # the given directory -- verify against its definition
                for path in iterfiles(stopover):
                    with open(path) as handle:
                        soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                        for record in soup.findAll('record'):
                            # NOTE(review): str() vs. unicode() is an open
                            # question carried over from the original code.
                            output.write(str(record))
                output.write('</collection>\n')
        finally:
            # mkdtemp leaves deletion to the caller; clean up best-effort
            # so a cleanup problem never masks an error raised above.
            shutil.rmtree(stopover, ignore_errors=True)
Example #4
0
    def run(self):
        """ Harvest from `self.url` into a temporary directory, then
        combine all records into a single file.

        Every `record` element found in the harvested files is written,
        wrapped in a single `collection` element, to `self.output()`.
        The temporary directory is removed afterwards, whether or not the
        harvest or the merge succeeded. """
        # Function-scope import: only needed for the cleanup below.
        import shutil

        stopover = tempfile.mkdtemp(prefix='gluish-')
        try:
            oai_harvest(url=self.url,
                        begin=self.begin,
                        end=self.end,
                        prefix=self.prefix,
                        directory=stopover,
                        collection=self.collection,
                        delay=self.delay)

            with self.output().open('w') as output:
                output.write("""<collection
                xmlns="http://www.openarchives.org/OAI/2.0/"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
            """)
                # presumably iterfiles yields the paths of the files below
                # the given directory -- verify against its definition
                for path in iterfiles(stopover):
                    with open(path) as handle:
                        soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                        for record in soup.findAll('record'):
                            # NOTE(review): str() vs. unicode() is an open
                            # question carried over from the original code.
                            output.write(str(record))
                output.write('</collection>\n')
        finally:
            # mkdtemp leaves deletion to the caller; clean up best-effort
            # so a cleanup problem never masks an error raised above.
            shutil.rmtree(stopover, ignore_errors=True)