def test_split_file(self):
    """ Splitting a 100-line fixture into 10 chunks should yield 10 chunk
    files whose concatenation equals the original. """
    original = os.path.join(FIXTURES, 'l-100.txt')
    task = SplitFile(filename=original, chunks=10)
    unlink(task.output().path)
    luigi.build([task], local_scheduler=True)

    # the task output lists the chunk filenames, one per line
    lines = [line.strip() for line in task.output().open()]
    self.assertEquals(10, len(lines))

    content = ''.join(open(fn).read() for fn in lines)
    with open(original) as handle:
        self.assertEquals(content, handle.read())
def test_unlink(self):
    """ Should not raise any exception on a non-existent file. """
    path = tempfile.mktemp()
    self.assertFalse(os.path.exists(path))

    # plain os.unlink raises OSError with ENOENT on a missing file ...
    exception_raised = False
    try:
        os.unlink(path)
    except OSError as err:
        exception_raised = True
        self.assertEquals(errno.ENOENT, err.errno)
    self.assertTrue(exception_raised)

    # ... while the unlink() helper swallows it
    exception_raised = False
    try:
        unlink(path)
    except Exception:
        exception_raised = True
    self.assertFalse(exception_raised)
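# The helper under test is defined elsewhere; a minimal sketch of the
# behavior this test asserts (ignore a missing file, re-raise any other
# OSError) could look like the following -- the re-raise branch is an
# assumption, only the ENOENT case is pinned down by the test above:

import errno
import os

def unlink(path):
    """ Remove `path`, ignoring the case where it does not exist. """
    try:
        os.unlink(path)
    except OSError as exc:
        if exc.errno != errno.ENOENT:
            raise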
def test_run_l100(self):
    """ The line count of the 100-line fixture should be reported as '100'. """
    task = ConcreteLineCount(filename='l-100.txt')
    unlink(task.output().path)
    luigi.build([task], local_scheduler=True)
    content = task.output().open().read().strip()
    self.assertEquals('100', content)
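# `ConcreteLineCount` is defined elsewhere; one plausible shape for such a
# task, assuming plain luigi and the FIXTURES directory used above, might be
# the following (an illustrative sketch, not the project's implementation;
# the output path convention is made up):

import os
import luigi

class ConcreteLineCount(luigi.Task):
    filename = luigi.Parameter()

    def run(self):
        # count the lines in the fixture file and write the number out
        with open(os.path.join(FIXTURES, self.filename)) as handle:
            count = sum(1 for _ in handle)
        with self.output().open('w') as output:
            output.write('%s\n' % count)

    def output(self):
        return luigi.LocalTarget('%s.count' % self.filename)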
def oai_harvest(url=None, collection=None, begin=None, end=None,
                prefix='oai_dc', verb='ListRecords', max_retries=8,
                directory=None, ext='xml', download=download, delay=0):
    """
    Harvest OAI for `url`. Will download all files into `directory`.
    Optionally add a `delay` in seconds between requests.

        argument    OAI name
        --------------------
        begin       from
        collection  set
        end         until
        prefix      metadataPrefix
        verb        verb
    """
    if url is None:
        raise RuntimeError('A URL must be given.')
    if directory is None:
        raise RuntimeError('A directory must be given.')
    if not os.path.exists(directory):
        raise RuntimeError('Directory does not exist: %s' % directory)

    params = {'from': begin, 'until': end, 'metadataPrefix': prefix,
              'set': collection, 'verb': verb}
    params = dict((k, v) for k, v in params.iteritems() if v)

    # first request carries all params
    full_url = '%s?%s' % (url, urllib.urlencode(params))
    path = os.path.join(directory, '%s.%s' % (random_string(length=16), ext))
    for retry in range(max_retries):
        try:
            download(url=full_url, filename=path, timeout=30)
            time.sleep(delay)
            break
        except RuntimeError:
            logger.info('Retry %s on %s' % (retry, full_url))
            unlink(path)
    else:
        # for/else: only reached when no download attempt succeeded
        raise RuntimeError('Max retries (%s) exceeded: %s' % (
            max_retries, full_url))

    # any subsequent request uses the resumption token from the last response
    while True:
        with open(path) as handle:
            soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
        token = soup.find('resumptiontoken')
        if token is None:
            break
        # subsequent requests are done with resumptionToken only ...
        params = {'resumptionToken': token.text, 'verb': verb}
        full_url = '%s?%s' % (url, urllib.urlencode(params))
        path = os.path.join(directory, '%s.%s' % (
            random_string(length=16), ext))
        retry = 0
        while True:
            if retry >= max_retries:
                raise RuntimeError('Max retries (%s) exceeded: %s' % (
                    max_retries, full_url))
            try:
                download(url=full_url, filename=path)
                time.sleep(delay)
                break
            except RuntimeError:
                retry += 1
                logger.info('Retry #%s on %s' % (retry, full_url))
                unlink(path)
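# A minimal usage sketch (endpoint, date range and directory are
# placeholders): harvest Dublin Core records for one month into a scratch
# directory. `oai_harvest` blocks until the last resumption token page has
# been fetched.

import tempfile

target = tempfile.mkdtemp(prefix='harvest-')
oai_harvest(url='http://export.arxiv.org/oai2', begin='2013-01-01',
            end='2013-01-31', prefix='oai_dc', directory=target, delay=2)
# `target` now holds one XML file per OAI response page, each under a
# random 16-character name with the given extension.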