Exemple #1
0
def iter_tsv(input_stream, cols=None, encoding='utf-8'):
    """
    Lazily yield one record per line of a tab-separated byte stream.

    Each line is decoded with `encoding`, stripped of trailing newline
    characters and split on tabs.

    Without `cols`, plain tuples are yielded. With `cols`, the entries are
    used as field names of a namedtuple called ``Record``. Entries equal to
    ``'x'``, ``'X'``, ``0`` or ``None`` mark ignored columns; each of them is
    replaced by a name obtained from ``random_string(length=5)``.

    Example (ignore the first four columns of a five column TSV):

    ::

        for row in iter_tsv(handle, cols=('X', 'X', 'X', 'X', 'iln')):
            print(row.iln)
    """
    if not cols:
        # No column names requested: emit bare tuples.
        for raw in input_stream:
            yield tuple(raw.decode(encoding).rstrip('\n').split('\t'))
        return

    placeholders = ('x', 'X', 0, None)
    names = []
    for name in cols:
        if name in placeholders:
            # ignored column, give it a throwaway field name
            name = random_string(length=5)
        names.append(name)

    Record = collections.namedtuple('Record', names)
    for raw in input_stream:
        fields = raw.decode(encoding).rstrip('\n').split('\t')
        yield Record._make(fields)
Exemple #2
0
def random_tmp_path(prefix='gluish'):
    """
    Return a random path, that is located under the system's tmp dir. This
    is just a path, nothing gets touched or created.

    The file name has the form ``<prefix>-<random string>``.

    Deprecated, a DeprecationWarning is emitted on every call. Just use:

        tempfile.mktemp(prefix='gluish-')

    instead.
    """
    # stacklevel=2 attributes the warning to the caller of this function,
    # which is the code that actually needs to be migrated.
    warnings.warn("deprecated", DeprecationWarning, stacklevel=2)
    return os.path.join(tempfile.gettempdir(), '%s-%s' % (prefix, random_string()))
Exemple #3
0
    def test_create_dir(self):
        """ Building a Directory task yields an existing directory at its path. """
        target = os.path.join(tempfile.gettempdir(), random_string())
        task = Directory(path=target)
        luigi.build([task], local_scheduler=True)
        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(task.output().path, target)
        self.assertTrue(os.path.isdir(task.output().path))

        # task must be idempotent: a fresh task for the same path is already
        # complete, and building it again leaves the directory in place
        task = Directory(path=target)
        self.assertTrue(task.complete())
        luigi.build([task], local_scheduler=True)
        self.assertEqual(task.output().path, target)
        self.assertTrue(os.path.isdir(task.output().path))
Exemple #4
0
class DailyIndex(FrontpageTask, luigi.WrapperTask):
    """ Bundles the per-newspaper index downloads so they can run in parallel. """
    # Both defaults are computed once, when this class body is executed.
    date = luigi.DateParameter(default=daily())
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ One IndexPage per entry in NEWSPAPERS, all for this task's date. """
        return (IndexPage(url=address, date=self.date)
                for address in NEWSPAPERS)

    def output(self):
        """ A wrapper has no target of its own; hand back the inputs. """
        targets = self.input()
        return targets
Exemple #5
0
    def test_create_dir(self):
        """ Building a Directory task yields an existing directory at its path. """
        target = os.path.join(tempfile.gettempdir(), random_string())
        task = Directory(path=target)
        luigi.build([task], local_scheduler=True)
        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(task.output().path, target)
        self.assertTrue(os.path.isdir(task.output().path))

        # task must be idempotent: a fresh task for the same path is already
        # complete, and building it again leaves the directory in place
        task = Directory(path=target)
        self.assertTrue(task.complete())
        luigi.build([task], local_scheduler=True)
        self.assertEqual(task.output().path, target)
        self.assertTrue(os.path.isdir(task.output().path))
Exemple #6
0
class FTPMirror(CommonTask):
    """
    A generic FTP directory sync. Requires lftp (http://lftp.yar.ru/).
    The output of this task is a single file, that contains the paths
    to all the mirrored files.
    """
    host = luigi.Parameter()
    username = luigi.Parameter(default='anonymous')
    password = luigi.Parameter(default='')
    pattern = luigi.Parameter(default='*', description="e.g. '*leip_*.zip'")
    base = luigi.Parameter(default='.')
    # The default is computed once, when the class body is executed.
    # NOTE(review): presumably this gives every process a fresh task
    # identity so the mirror is re-synced on each run -- confirm.
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ Depend on the lftp executable. """
        return Executable(name='lftp', message='http://lftp.yar.ru/')

    def run(self):
        """ The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced. """
        base = os.path.dirname(self.output().path)
        # One mirror directory per (host, username, base, pattern), named
        # after the SHA1 of those values.
        subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format(
            host=self.host,
            username=self.username,
            base=self.base,
            pattern=self.pattern)).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        command = """lftp -u {username},{password}
        -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0
        --only-newer -I {pattern} {base} {target}; exit" {host}"""

        # Every value interpolated into the shell command is quoted; the
        # host is escaped like the rest so it cannot inject shell syntax.
        shellout(command,
                 host=pipes.quote(self.host),
                 username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base))

        # Write one line per file found below the mirror root.
        with self.output().open('w') as output:
            for path in iterfiles(target):
                logger.debug("Mirrored: %s", path)
                output.write_tsv(path)

    def output(self):
        """ Local TSV file listing the mirrored paths. """
        return luigi.LocalTarget(path=self.path(digest=True), format=TSV)
Exemple #7
0
class FTPFileCopyTaskWithWrongUsername(TestTask):
    """ The indicator is meant to make this task run on each test run. """
    # default is computed once, when the class body is executed
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ Depend on a single remote file served by FTPFile. """
        remote_file = FTPFile(host='ftp.cs.brown.edu',
                              username='******',
                              password='******',
                              filepath='/pub/techreports/00/cs00-07.pdf')
        return remote_file

    def run(self):
        """ Move the fetched input to this task's output path. """
        fetched = self.input()
        fetched.move(self.output().path)

    def output(self):
        """ Local target with a ``pdf`` extension. """
        destination = self.path(ext='pdf')
        return luigi.LocalTarget(path=destination)
Exemple #8
0
class MirrorTask(TestTask):
    """ The indicator is meant to make this task run on each test run. """
    # default is computed once, when the class body is executed
    indicator = luigi.Parameter(default=random_string())

    def requires(self):
        """ Depend on an FTPMirror of the matching files below the base dir. """
        mirror = FTPMirror(host='ftp.cs.brown.edu',
                           username='******',
                           password='******',
                           pattern='*02*pdf',
                           base='/pub/techreports/00')
        return mirror

    def run(self):
        """ Move the mirror's output to this task's output path. """
        mirrored = self.input()
        mirrored.move(self.output().path)

    def output(self):
        """ Local target at the path computed by ``self.path()``. """
        destination = self.path()
        return luigi.LocalTarget(path=destination)
Exemple #9
0
    def run(self):
        """
        Split ``self.filename`` into pieces with ``split -l`` and write the
        paths of the resulting files, sorted, to the output target.
        """
        # Count lines inside a context manager so the file handle is closed
        # deterministically instead of being leaked.
        with open(self.filename) as handle:
            line_count = sum(1 for _ in handle)
        # Lines per piece; for integer inputs this is
        # floor(line_count / chunks) + 1.
        # NOTE(review): assumes self.chunks is a positive integer -- confirm.
        lines = int((line_count + self.chunks) / self.chunks)

        taskdir = os.path.dirname(self.output().fn)
        if not os.path.exists(taskdir):
            os.makedirs(taskdir)

        # A random prefix identifies the files created by this very run.
        prefix = random_string()
        shellout("cd {taskdir} && split -l {lines} {input} {prefix}",
                 taskdir=taskdir, lines=lines, input=self.filename,
                 prefix=prefix)

        with self.output().open('w') as output:
            for path in sorted(iterfiles(taskdir)):
                if os.path.basename(path).startswith(prefix):
                    output.write_tsv(path)
Exemple #10
0
 def run(self):
     """
     Split the input into files of ``self.lines`` lines inside the system
     temp dir, move the pieces into the task directory and write their
     final paths to the output target.
     """
     tmpdir = tempfile.gettempdir()
     # A random prefix identifies the files created by this very run.
     prefix = "{0}-".format(random_string())
     # The return value of shellout is not needed here.
     shellout(
         "cd {tmp} && split -l {lines} -a 8 {input} {prefix} && cd -",
         lines=self.lines,
         tmp=tmpdir,
         input=self.input().path,
         prefix=prefix,
     )
     target = self.taskdir()
     if not os.path.exists(target):
         os.makedirs(target)
     with self.output().open("w") as output:
         for path in iterfiles(tmpdir):
             filename = os.path.basename(path)
             if filename.startswith(prefix):
                 dst = os.path.join(target, filename)
                 shutil.move(path, dst)
                 output.write_tsv(dst)
Exemple #11
0
    def run(self):
        """
        Split ``self.filename`` into pieces with ``split -l`` and write the
        paths of the resulting files, sorted, to the output target.
        """
        # Count lines inside a context manager so the file handle is closed
        # deterministically instead of being leaked.
        with open(self.filename) as handle:
            line_count = sum(1 for _ in handle)
        # Lines per piece; for integer inputs this is
        # floor(line_count / chunks) + 1.
        # NOTE(review): assumes self.chunks is a positive integer -- confirm.
        lines = int((line_count + self.chunks) / self.chunks)

        taskdir = os.path.dirname(self.output().fn)
        if not os.path.exists(taskdir):
            os.makedirs(taskdir)

        # A random prefix identifies the files created by this very run.
        prefix = random_string()
        shellout("cd {taskdir} && split -l {lines} {input} {prefix}",
                 taskdir=taskdir,
                 lines=lines,
                 input=self.filename,
                 prefix=prefix)

        with self.output().open('w') as output:
            for path in sorted(iterfiles(taskdir)):
                if os.path.basename(path).startswith(prefix):
                    output.write_tsv(path)
Exemple #12
0
 def test_random_string(self):
     """ Test random string length: 16 by default, `length` when given. """
     # assertEqual instead of the deprecated assertEquals alias
     self.assertEqual(16, len(random_string()))
     self.assertEqual(10, len(random_string(length=10)))
Exemple #13
0
 def test_random_string(self):
     """ Test random string length: 16 by default, `length` when given. """
     # assertEqual instead of the deprecated assertEquals alias
     self.assertEqual(16, len(random_string()))
     self.assertEqual(10, len(random_string(length=10)))
Exemple #14
0
def _download_with_retries(download, url, filename, max_retries, delay,
                           **download_kwargs):
    """
    Call `download` for `url` until it succeeds, at most `max_retries` times.

    A RuntimeError raised by `download` counts as a failed attempt. After a
    success, sleep for `delay` seconds. Raises RuntimeError once all attempts
    have failed (immediately if `max_retries` is 0).
    """
    for attempt in range(1, max_retries + 1):
        try:
            download(url=url, filename=filename, **download_kwargs)
        except RuntimeError:
            logger.info('Retry #%s on %s', attempt, url)
            # presumably discards a partially written file -- see unlink
            unlink(filename)
        else:
            time.sleep(delay)
            return
    raise RuntimeError('Max retries (%s) exceeded: %s' % (max_retries, url))


def oai_harvest(url=None, collection=None, begin=None, end=None,
                prefix='oai_dc', verb='ListRecords',
                max_retries=8, directory=None, ext='xml', download=download,
                delay=0):
    """
    Harvest OAI for `url`. Will download all files into `directory`. Optionally
    add a delay between requests.

    Every response is stored in its own randomly named file with extension
    `ext`. Harvesting follows resumption tokens until a response carries no
    token, or an empty one.

    argument    OAI name
    --------------------
    begin       from
    collection  set
    end         until
    prefix      metadataPrefix
    verb        verb

    `download` is called with `url` and `filename` keyword arguments (plus
    `timeout` for the first request); each request is attempted at most
    `max_retries` times.

    Raises RuntimeError if `url` or `directory` is missing, if `directory`
    does not exist, or if a request still fails after `max_retries` attempts.
    """
    if url is None:
        raise RuntimeError('A URL must be given.')
    if directory is None:
        raise RuntimeError('A directory must be given.')
    if not os.path.exists(directory):
        raise RuntimeError('Directory does not exist: %s' % directory)

    params = {'from': begin, 'until': end,
              'metadataPrefix': prefix, 'set': collection,
              'verb': verb}
    # drop parameters that were not supplied
    params = {k: v for k, v in params.items() if v}

    # first request with all params
    full_url = '%s?%s' % (url, urllib.urlencode(params))
    path = os.path.join(directory, '%s.%s' % (random_string(length=16), ext))
    # NOTE(review): only the first request passes a timeout; the follow-up
    # requests below rely on the default of `download` -- confirm intent.
    _download_with_retries(download, full_url, path, max_retries, delay,
                           timeout=30)

    # any subsequent request uses 'resumptiontoken'
    while True:
        with open(path) as handle:
            soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
        token = soup.find('resumptiontoken')
        # OAI-PMH marks the last page of a list with an *empty*
        # resumptionToken element, so an empty token ends the harvest too.
        if token is None or not token.text.strip():
            break

        # subsequent requests are done with resumptiontoken only ...
        params = {'resumptionToken': token.text, 'verb': verb}
        full_url = '%s?%s' % (url, urllib.urlencode(params))
        path = os.path.join(directory, "%s.%s" % (
                            random_string(length=16), ext))
        _download_with_retries(download, full_url, path, max_retries, delay)