Beispiel #1
0
    def test_split_file(self):
        """ Splitting `l-100.txt` into 10 chunks should yield 10 chunk files,
        which concatenate back to the original content. """
        original = os.path.join(FIXTURES, 'l-100.txt')
        task = SplitFile(filename=original, chunks=10)
        # presumably clears stale output, so the build below really runs
        unlink(task.output().path)

        luigi.build([task], local_scheduler=True)
        # the task output holds one chunk filename per line
        with task.output().open() as handle:
            lines = [line.strip() for line in handle]
        self.assertEqual(10, len(lines))

        # read chunk by chunk, closing every file right after use
        parts = []
        for fn in lines:
            with open(fn) as chunk:
                parts.append(chunk.read())
        content = ''.join(parts)

        with open(original) as handle:
            self.assertEqual(content, handle.read())
Beispiel #2
0
    def test_split_file(self):
        """ Splitting `l-100.txt` into 10 chunks should yield 10 chunk files,
        which concatenate back to the original content. """
        original = os.path.join(FIXTURES, 'l-100.txt')
        task = SplitFile(filename=original, chunks=10)
        # presumably clears stale output, so the build below really runs
        unlink(task.output().path)

        luigi.build([task], local_scheduler=True)
        # the task output holds one chunk filename per line
        with task.output().open() as handle:
            lines = [line.strip() for line in handle]
        self.assertEqual(10, len(lines))

        # read chunk by chunk, closing every file right after use
        parts = []
        for fn in lines:
            with open(fn) as chunk:
                parts.append(chunk.read())
        content = ''.join(parts)

        with open(original) as handle:
            self.assertEqual(content, handle.read())
Beispiel #3
0
    def test_unlink(self):
        """ Should not raise any exception on non-existent file. """
        # mktemp only hands out a name; no file is created for it here
        path = tempfile.mktemp()
        self.assertFalse(os.path.exists(path))

        # baseline: plain os.unlink fails with ENOENT on a missing file
        try:
            os.unlink(path)
        except OSError as err:
            self.assertEqual(errno.ENOENT, err.errno)
        else:
            self.fail('os.unlink did not raise on non-existent file')

        # the helper under test must stay silent for the same path
        try:
            unlink(path)
        except Exception as exc:
            self.fail('unlink raised on non-existent file: %r' % (exc,))
Beispiel #4
0
 def test_run_l100(self):
     """ Output of `ConcreteLineCount` for `l-100.txt` should read '100'. """
     task = ConcreteLineCount(filename='l-100.txt')
     # presumably clears stale output, so the build below really runs
     unlink(task.output().path)
     luigi.build([task], local_scheduler=True)
     # close the output handle deterministically after reading
     with task.output().open() as handle:
         content = handle.read().strip()
     self.assertEqual('100', content)
Beispiel #5
0
def _download_with_retries(download, url, path, max_retries, delay,
                           **download_kwargs):
    """
    Fetch `url` into `path` via `download`, making at most `max_retries`
    attempts. An attempt counts as failed, if `download` raises a
    `RuntimeError`; `unlink(path)` is then called to discard whatever the
    attempt left behind. After a successful attempt, sleep for `delay`
    seconds. Raises `RuntimeError`, if no attempt succeeded.
    """
    for attempt in range(1, max_retries + 1):
        try:
            download(url=url, filename=path, **download_kwargs)
        except RuntimeError:
            logger.info('Retry #%s on %s', attempt, url)
            unlink(path)
        else:
            time.sleep(delay)
            return
    raise RuntimeError('Max retries (%s) exceeded: %s' % (max_retries, url))


def oai_harvest(url=None, collection=None, begin=None, end=None,
                prefix='oai_dc', verb='ListRecords',
                max_retries=8, directory=None, ext='xml', download=download,
                delay=0):
    """
    Harvest OAI for `url`. Will download all files into `directory`. Optionally
    add a delay between requests.

    Every response is stored in its own, randomly named file
    (`<random>.<ext>`) inside `directory`. Harvesting follows resumption
    tokens, until a response carries none (or an empty one).

    argument    OAI name
    --------------------
    begin       from
    collection  set
    end         until
    prefix      metadataPrefix
    verb        verb

    Each request is attempted at most `max_retries` times. Raises
    `RuntimeError`, if `url` or `directory` is missing, if `directory` does
    not exist or if a request keeps failing.
    """
    if url is None:
        raise RuntimeError('A URL must be given.')
    if directory is None:
        raise RuntimeError('A directory must be given.')
    if not os.path.exists(directory):
        raise RuntimeError('Directory does not exist: %s' % directory)

    params = {'from': begin, 'until': end,
              'metadataPrefix': prefix, 'set': collection,
              'verb': verb}
    # only send arguments, that were actually set
    params = dict((k, v) for k, v in params.items() if v)

    # first request with all params
    full_url = '%s?%s' % (url, urllib.urlencode(params))
    path = os.path.join(directory, '%s.%s' % (random_string(length=16), ext))
    _download_with_retries(download, full_url, path, max_retries, delay,
                           timeout=30)

    # any subsequent request uses 'resumptiontoken'
    while True:
        with open(path) as handle:
            soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
        token = soup.find('resumptiontoken')
        # OAI-PMH closes a list with an *empty* resumptionToken element;
        # requesting with an empty token would only fetch an error response.
        if token is None or not token.text.strip():
            break

        # subsequent requests are done with resumptiontoken only ...
        params = {'resumptionToken': token.text, 'verb': verb}
        full_url = '%s?%s' % (url, urllib.urlencode(params))
        path = os.path.join(directory, '%s.%s' % (
                            random_string(length=16), ext))

        # NOTE(review): unlike the first request, no explicit timeout is
        # passed here, so the default of `download` applies - confirm that
        # this is intended.
        _download_with_retries(download, full_url, path, max_retries, delay)
Beispiel #6
0
 def test_run_l100(self):
     """ Output of `ConcreteLineCount` for `l-100.txt` should read '100'. """
     task = ConcreteLineCount(filename='l-100.txt')
     # presumably clears stale output, so the build below really runs
     unlink(task.output().path)
     luigi.build([task], local_scheduler=True)
     # close the output handle deterministically after reading
     with task.output().open() as handle:
         content = handle.read().strip()
     self.assertEqual('100', content)