Example #1
0
File: app.py Project: miku/pkpy
def pypi_build(name, target='deb'):
    """ Take a package name and return the filename of the target.

    On a cache miss the package is built with `fpm` inside a temporary
    directory, the first file found there is copied into PACKAGE_CACHE and
    its basename is stored in the shelve file CACHE under a SHA1 key derived
    from (name, target). On a cache hit the stored basename is returned.

    A failed build (RuntimeError, or no file produced at all) is logged and
    the result of `abort(404)` is returned. The shelve handle and the
    temporary directory are released on every path.
    """
    # `shlex.quote` does not exist on Python 2; `pipes.quote` is its
    # older spelling there.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    cache_key = hashlib.sha1('%s:%s' % (name, target)).hexdigest()
    cache = shelve.open(CACHE)
    try:
        if cache_key not in cache:
            logger.debug('Building %s for %s...' % (target, name))
            stopover = tempfile.mkdtemp(prefix='pkpy-')
            try:
                # name and target end up in a shell command line, so they
                # are quoted before interpolation
                shellout("""cd {stopover} &&
                            fpm --verbose -s python -t {target} {name}""",
                         stopover=quote(stopover), name=quote(name),
                         target=quote(target))
                src = next(iterfiles(stopover), None)
                if src is None:
                    # treat "fpm ran but left nothing behind" as a failed build
                    raise RuntimeError('no %s package built for %s' % (target, name))
                basename = os.path.basename(src)
                dst = os.path.join(PACKAGE_CACHE, basename)
                shutil.copyfile(src, dst)
                cache[cache_key] = basename
            except RuntimeError as err:
                logger.error(err)
                return abort(404)
            finally:
                # the build directory is scratch space in either outcome
                shutil.rmtree(stopover, ignore_errors=True)
        else:
            logger.debug('Cache hit...')

        return cache[cache_key]
    finally:
        cache.close()
Example #2
0
    def run(self):
        """ The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced.

        The mirror lives in a subdirectory of the output's directory that is
        named after the SHA1 of (host, username, base, pattern). After the
        lftp run, every file found below that subdirectory is written to the
        output, one path per TSV row. """
        base = os.path.dirname(self.output().path)
        subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format(
            host=self.host,
            username=self.username,
            base=self.base,
            pattern=self.pattern)).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        command = """lftp -u {username},{password}
        -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0
        --only-newer -I {pattern} {base} {target}; exit" {host}"""

        # every interpolated value is shell-quoted, host included
        shellout(command,
                 host=pipes.quote(self.host),
                 username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base))

        with self.output().open('w') as output:
            for path in iterfiles(target):
                logger.debug("Mirrored: %s" % path)
                output.write_tsv(path)
Example #3
0
 def run(self):
     """ Rsync whatever the `glob` setting of the `lfer` config section
     names into the directory of the task output, then write the sorted
     paths found in that directory to the output, one per TSV row. """
     target = os.path.dirname(self.output().path)
     pattern = config.get('lfer', 'glob')
     shellout("rsync -avz {src} {target}", src=pattern, target=target)
     with self.output().open('w') as output:
         # The listing runs while the output is open for writing in this
         # very directory; skip the in-flight file so the receipt does not
         # name a path that disappears on close.
         # NOTE(review): relies on luigi staging writes as '<path>-luigi-tmp-<n>'
         # next to the final path - confirm for the luigi version in use.
         for path in sorted(iterfiles(target, fun=lambda p: '-luigi-tmp-' not in p)):
             output.write_tsv(path)
Example #4
0
    def run(self):
        """ The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced.

        The mirror root is `<output dir>/<sha1 of host:username:base:pattern>`.
        Once lftp has finished, each file below the mirror root is logged and
        written to the output as one TSV row. """
        base = os.path.dirname(self.output().path)
        subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format(
            host=self.host, username=self.username, base=self.base,
            pattern=self.pattern)).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        command = """lftp -u {username},{password}
        -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0
        --only-newer -I {pattern} {base} {target}; exit" {host}"""

        # host is quoted like all the other interpolated values
        shellout(command, host=pipes.quote(self.host),
                 username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base))

        with self.output().open('w') as output:
            for path in iterfiles(target):
                logger.debug("Mirrored: %s" % path)
                output.write_tsv(path)
Example #5
0
    def run(self):
        """ Run xsltproc with the `stylesheet` input over every path listed
        in the `filelist` input.

        Results are stored below `<output dir>/<date>` under the source
        basename with ".xml" replaced by ".marcxml"; destinations that
        already exist are skipped. A failing conversion (RuntimeError) is
        logged and appended to a temporary error log instead of aborting the
        task. Finally all files below the target directory are written to
        the output, one path per TSV row. """
        target = os.path.join(os.path.dirname(self.output().path), str(self.date))
        if not os.path.exists(target):
            os.makedirs(target)

        # mkstemp returns an already open descriptor; only the path is
        # needed here, so close it right away instead of leaking it
        fd, errorlog = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        stylesheet = self.input().get('stylesheet').path
        size = wc(self.input().get('filelist').path)

        with self.input().get('filelist').open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=('path',)), start=1):
                basename = os.path.basename(row.path)
                name = basename.replace(".xml", ".marcxml")
                destination = os.path.join(target, name)
                if not os.path.exists(destination):
                    try:
                        # the value returned by shellout is used as the
                        # path of the freshly written result
                        output = shellout("xsltproc {xsl} {input} > {output}",
                                          input=row.path, xsl=stylesheet)
                        luigi.File(output).move(destination)
                    except RuntimeError as err:
                        self.logger.error("{0}: {1}".format(row.path, err))
                        with open(errorlog, 'a') as log:
                            log.write('%s\t%s\n' % (row.path, err))
                self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

        # write receipt
        with self.output().open('w') as output:
            for path in iterfiles(target):
                output.write_tsv(path)

        # this is just a temporary artefact for now
        self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
Example #6
0
File: app.py Project: miku/pkpy
def github_clone_and_build(username, repo, target='deb'):
    """ Clone a repo (username, repo) and build the `target` package.

    On a cache miss the repository is cloned into a temporary directory and
    packaged with `fpm`; the first file below that directory whose name ends
    with `target` is copied into PACKAGE_CACHE and its basename is stored in
    the shelve file CACHE under a SHA1 key derived from (repo url, target).

    Returns the basename of the package file. Raises RuntimeError if no
    matching file was produced; errors raised by `shellout` propagate. The
    shelve handle and the temporary directory are released on every path.
    """
    # `shlex.quote` does not exist on Python 2; `pipes.quote` is its
    # older spelling there.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # SSH clone URL; the literal had been mangled into '[email protected]'
    repo_url = 'git@github.com:%s/%s.git' % (username, repo)

    cache_key = hashlib.sha1('%s:%s' % (repo_url, target)).hexdigest()
    cache = shelve.open(CACHE)
    try:
        if cache_key not in cache:
            logger.debug('Building (%s, %s) ...' % (repo_url, target))
            stopover = tempfile.mkdtemp(prefix='pkpy-')
            try:
                # all values end up in a shell command line, so they are
                # quoted before interpolation
                shellout("""
                    cd {stopover} && git clone {repo_url} &&
                    cd {repo} && fpm --verbose -s python -t {target} .""",
                         stopover=quote(stopover), repo_url=quote(repo_url),
                         repo=quote(repo), target=quote(target))
                src = next(iterfiles(stopover,
                                     fun=lambda fn: fn.endswith(target)), None)
                if src is None:
                    raise RuntimeError('no %s package built for %s' % (target, repo_url))
                basename = os.path.basename(src)
                dst = os.path.join(PACKAGE_CACHE, basename)
                shutil.copyfile(src, dst)
                cache[cache_key] = basename
            finally:
                # the clone is scratch space whether the build worked or not
                shutil.rmtree(stopover, ignore_errors=True)
        else:
            logger.debug('Cache hit...')
        return cache[cache_key]
    finally:
        cache.close()
Example #7
0
 def run(self):
     """ Rsync the `nationallizenzen` directory below the configured
     `core.swb-mirror` location into the directory of the task output and
     write the paths found there to the output, one per TSV row. """
     source = os.path.join(config.get('core', 'swb-mirror'), 'nationallizenzen')
     target = os.path.dirname(self.output().path)
     shellout("rsync -avz {source} {target}", source=source, target=target)
     with self.output().open('w') as output:
         # The listing runs while the output is open for writing in this
         # very directory; skip the in-flight file so the receipt does not
         # name a path that disappears on close.
         # NOTE(review): relies on luigi staging writes as '<path>-luigi-tmp-<n>'
         # next to the final path - confirm for the luigi version in use.
         for path in iterfiles(target, fun=lambda p: '-luigi-tmp-' not in p):
             output.write_tsv(path)
Example #8
0
 def run(self):
     """ Copy the configured `pao.scp-src` via scp into a temporary
     directory, concatenate every file whose path matches `pao[\\d].mrc`
     into one temporary file and move that file to the output path. """
     import os  # local import: only needed to close the mkstemp descriptor

     stopover = tempfile.mkdtemp(prefix='siskin-')
     # Hand the values to shellout instead of pre-formatting the command:
     # a pre-formatted string would be run through format() a second time
     # and break on any brace contained in the scp source.
     shellout("scp {origin} {stopover}", origin=config.get('pao', 'scp-src'),
              stopover=stopover)
     # mkstemp returns an open descriptor; only the path is used below
     fd, combined = tempfile.mkstemp(prefix='siskin-')
     os.close(fd)
     for path in iterfiles(directory=stopover,
                           fun=lambda path: re.search(r'pao[\d].mrc', path)):
         shellout("cat {path} >> {output}", path=path, output=combined)
     luigi.File(combined).move(self.output().path)
Example #9
0
    def run(self):
        """ Copy the configured `gbv.scp-src` (with `{tag}` filled in from
        `self.tag`) via scp into a temporary directory, concatenate all
        files found there into one temporary file and move it to the
        output location. """
        import os  # local import: only needed to close the mkstemp descriptor

        stopover = tempfile.mkdtemp(prefix='siskin-')
        origin = config.get('gbv', 'scp-src').format(tag=self.tag)
        shellout("scp {origin} {output}", origin=origin, output=stopover)

        # mkstemp returns an open descriptor; only the path is used below
        fd, combined = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        for path in iterfiles(stopover):
            shellout("cat {input} >> {output}", input=path, output=combined)
        luigi.File(combined).move(self.output().fn)
Example #10
0
 def run(self):
     """ Fetch the configured `gbi.scp-src` recursively via scp into a
     fresh temporary directory, move that directory to `<taskdir>/mirror`
     and write every file below it to the output, one path per TSV row. """
     scratch = tempfile.mkdtemp(prefix='siskin-')
     shellout("scp -rCpq {src} {target}",
              src=config.get('gbi', 'scp-src'),
              target=scratch)

     # make sure the parent of the mirror exists before moving
     if not os.path.exists(self.taskdir()):
         os.makedirs(self.taskdir())

     mirror = os.path.join(self.taskdir(), 'mirror')
     shellout("mv {target} {output}", target=scratch, output=mirror)

     with self.output().open('w') as receipt:
         for mirrored in iterfiles(mirror):
             receipt.write_tsv(mirrored)
Example #11
0
 def run(self):
     """ Download `self.url` with wget into a temporary file, unzip it into
     `<taskdir>/<date>` and move the single extracted file to the output
     path.

     Raises RuntimeError unless exactly one file is found below the target
     directory after unpacking. """
     target = os.path.join(self.taskdir(), str(self.date))
     if not os.path.exists(target):
         os.makedirs(target)
     # mkstemp returns an open descriptor; wget only needs the path
     fd, stopover = tempfile.mkstemp(prefix='siskin-')
     os.close(fd)
     shellout("wget --retry-connrefused -O {stopover} '{url}' && unzip -o -d {dir} {stopover}", dir=target, stopover=stopover, url=self.url)
     files = list(iterfiles(target))
     if len(files) != 1:
         # covers the empty case as well, which the old message misreported
         raise RuntimeError('expected exactly one file in %s, found %s' % (target, len(files)))
     luigi.File(files[0]).move(self.output().path)
Example #12
0
 def run(self):
     """ Copy the configured `ksd.scp-src` via scp into a temporary
     directory, concatenate the files in reverse sorted path order, pipe
     the result through `marcuniq` and move it to the output path. """
     import os  # local import: only needed to close the mkstemp descriptor

     # gather files
     stopover = tempfile.mkdtemp(prefix='siskin-')
     shellout("scp {origin} {output}", origin=config.get('ksd', 'scp-src'), output=stopover)
     # combine files; mkstemp returns an open descriptor, only the path is used
     fd, combined = tempfile.mkstemp(prefix='siskin-')
     os.close(fd)
     for path in sorted(iterfiles(stopover), reverse=True):
         shellout("cat {input} >> {output}", input=path, output=combined)
     # clean dups
     output = shellout("marcuniq {input} > {output}", input=combined)
     luigi.File(output).move(self.output().path)
Example #13
0
    def run(self):
        """ Download the zip archive belonging to `self.corpus`, unpack it
        into the directory given by the `dir` input and write all paths
        found there (UTF-8 encoded) to the output, one per TSV row.

        Raises RuntimeError for an unknown corpus id. """
        archives = {
            'all': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur.zip',
            'v1': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-1.zip',
            'v2': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-2.zip',
        }

        url = archives.get(self.corpus)
        if url is None:
            raise RuntimeError('available corpus ids: all, v1, v2')

        archive = shellout("wget --retry-connrefused '{url}' -O {output}", url=url)
        shellout("unzip -d '{dir}' {input}",
                 dir=self.input().get('dir').path,
                 input=archive)

        with self.output().open('w') as receipt:
            for extracted in iterfiles(self.input().get('dir').path):
                receipt.write_tsv(extracted.encode('utf-8'))
Example #14
0
 def run(self):
     """ Fetch all `.{format}.gz` files below `<base>/<version>/datasets/<format>`
     with wget into a temporary directory, move those not yet present into
     `<taskdir>/<version>/<format>` and write (version, format, path) rows
     for every file there ending in `nt.gz`. """
     destination = os.path.join(self.taskdir(), self.version, self.format)
     if not os.path.exists(destination):
         os.makedirs(destination)

     source_url = os.path.join(self.base, self.version, "datasets", self.format)
     scratch = tempfile.mkdtemp(prefix='siskin-')
     shellout("""wget -q -nd -P {directory} -rc -np -A.{format}.gz '{url}'""",
              url=source_url,
              directory=scratch,
              format=self.format)

     for downloaded in glob.glob(unicode(os.path.join(scratch, '*'))):
         final = os.path.join(destination, os.path.basename(downloaded))
         if os.path.exists(final):
             # keep what an earlier run already delivered
             continue
         # atomic as long as both directories live on the same device
         shutil.move(downloaded, destination)

     with self.output().open('w') as receipt:
         for path in iterfiles(destination, fun=lambda p: p.endswith('nt.gz')):
             receipt.write_tsv(self.version, self.format, path)
Example #15
0
    def run(self):
        """ Harvest the OAI endpoint described by the task parameters into
        a temporary directory, then wrap every <record> element of the
        harvested files into a single <collection> document. """
        harvest_dir = tempfile.mkdtemp(prefix='gluish-')
        oai_harvest(url=self.url, begin=self.begin, end=self.end,
                    prefix=self.prefix, directory=harvest_dir,
                    collection=self.collection, delay=self.delay)

        with self.output().open('w') as output:
            output.write("""<collection
                xmlns="http://www.openarchives.org/OAI/2.0/"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
            """)
            for harvested in iterfiles(harvest_dir):
                with open(harvested) as handle:
                    content = handle.read()
                soup = BeautifulSoup.BeautifulStoneSoup(content)
                for record in soup.findAll('record'):
                    # serialised with str(), as before
                    output.write(str(record))
            output.write('</collection>\n')
Example #16
0
    def run(self):
        """ Split `self.filename` with `split -l` inside the directory of
        the output and write the sorted paths of the pieces to the output,
        one per TSV row.

        The number of lines per piece is
        `int((line_count + self.chunks) / self.chunks)`; pieces are told
        apart from other files in the directory by a random name prefix. """
        # count lines with a context manager so the handle is closed
        with open(self.filename) as handle:
            line_count = sum(1 for _ in handle)
        lines = int((line_count + self.chunks) / self.chunks)

        taskdir = os.path.dirname(self.output().fn)
        if not os.path.exists(taskdir):
            os.makedirs(taskdir)

        prefix = random_string()
        shellout("cd {taskdir} && split -l {lines} {input} {prefix}",
                 taskdir=taskdir, lines=lines, input=self.filename,
                 prefix=prefix)

        with self.output().open('w') as output:
            for path in sorted(iterfiles(taskdir)):
                if os.path.basename(path).startswith(prefix):
                    output.write_tsv(path)
Example #17
0
 def run(self):
     """ Harvest http://oai.bnf.fr/oai2/OAIHandler for the task's timeframe
     into a temporary directory and merge the <record> elements of all
     harvested files into one <collection> document. """
     harvest_dir = tempfile.mkdtemp(prefix='tasktree-')
     oai_harvest(url="http://oai.bnf.fr/oai2/OAIHandler",
                 begin=self.begin,
                 end=self.end,
                 prefix=self.prefix,
                 directory=harvest_dir,
                 collection=self.collection)

     with self.output().open('w') as output:
         output.write("""<collection
             xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">""")
         for harvested in iterfiles(harvest_dir):
             with open(harvested) as handle:
                 content = handle.read()
             soup = BeautifulSoup.BeautifulStoneSoup(content)
             for record in soup.findAll('record'):
                 # serialised with str(), as before
                 output.write(str(record))
         output.write('</collection>\n')
Example #18
0
    def run(self):
        """ Split `self.filename` with `split -l` inside the directory of
        the output and write the sorted paths of the pieces to the output,
        one per TSV row.

        Each piece holds `int((line_count + self.chunks) / self.chunks)`
        lines; a random name prefix marks the pieces created by this run. """
        # count lines with a context manager so the handle is closed
        with open(self.filename) as handle:
            line_count = sum(1 for _ in handle)
        lines = int((line_count + self.chunks) / self.chunks)

        taskdir = os.path.dirname(self.output().fn)
        if not os.path.exists(taskdir):
            os.makedirs(taskdir)

        prefix = random_string()
        shellout("cd {taskdir} && split -l {lines} {input} {prefix}",
                 taskdir=taskdir,
                 lines=lines,
                 input=self.filename,
                 prefix=prefix)

        with self.output().open('w') as output:
            for path in sorted(iterfiles(taskdir)):
                if os.path.basename(path).startswith(prefix):
                    output.write_tsv(path)
Example #19
0
    def run(self):
        """ Download every path listed in the input from
        universitypressscholarship.com into the task directory, skipping
        files that already exist, then write the paths of all files there
        ending in `mrc` to the output, one per TSV row. """
        site = "http://www.universitypressscholarship.com/"

        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path',)):
                # local name: slugged parent directory + original basename
                parent, filename = row.path.split('/')[-2:]
                slug = parent.replace('%20', '-').lower()
                url = urlparse.urljoin(site, row.path)
                dst = os.path.join(self.taskdir(), '{0}-{1}'.format(slug, filename))
                if not os.path.exists(dst):
                    downloaded = shellout("""wget --retry-connrefused "{url}" -O {output} """, url=url)
                    luigi.File(downloaded).move(dst)

        with self.output().open('w') as receipt:
            for path in iterfiles(self.taskdir()):
                if path.endswith('mrc'):
                    receipt.write_tsv(path)
Example #20
0
 def run(self):
     """ Split the input into pieces of `self.lines` lines inside the
     system temp directory, then move every piece (recognised by a random
     name prefix) into the task directory and write its new path to the
     output, one per TSV row. """
     prefix = "{0}-".format(random_string())
     shellout(
         "cd {tmp} && split -l {lines} -a 8 {input} {prefix} && cd -",
         lines=self.lines,
         tmp=tempfile.gettempdir(),
         input=self.input().path,
         prefix=prefix,
     )

     target = os.path.join(self.taskdir())
     if not os.path.exists(target):
         os.makedirs(target)

     with self.output().open("w") as receipt:
         for candidate in iterfiles(tempfile.gettempdir()):
             filename = os.path.basename(candidate)
             if not filename.startswith(prefix):
                 # something else living in the temp directory
                 continue
             dst = os.path.join(target, filename)
             shutil.move(candidate, dst)
             receipt.write_tsv(dst)
Example #21
0
    def run(self):
        """ Run an OAI harvest with the task parameters into a temporary
        directory and combine the <record> elements of all harvested files
        into one <collection> document written to the output. """
        scratch = tempfile.mkdtemp(prefix='gluish-')
        oai_harvest(url=self.url, begin=self.begin, end=self.end,
                    prefix=self.prefix, directory=scratch,
                    collection=self.collection, delay=self.delay)

        with self.output().open('w') as output:
            output.write("""<collection
                xmlns="http://www.openarchives.org/OAI/2.0/"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
            """)
            for filename in iterfiles(scratch):
                with open(filename) as handle:
                    markup = handle.read()
                parsed = BeautifulSoup.BeautifulStoneSoup(markup)
                for record in parsed.findAll('record'):
                    # serialised with str(), as before
                    output.write(str(record))
            output.write('</collection>\n')
Example #22
0
    def run(self):
        """ Download every URL listed in the input with wget into
        `<output dir>/<closest>`, naming each file after the second to last
        URL path segment plus ".xml" and skipping files that already exist.
        Afterwards all `.xml` files below that directory are written to the
        output, one path per TSV row. """
        # create target subdirectory
        target = os.path.join(os.path.dirname(self.output().path), str(self.closest()))
        if not os.path.exists(target):
            os.makedirs(target)
        size = wc(self.input().path)

        with self.input().open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=('url',)), start=1):
                name = os.path.join(target, row.url.split('/')[-2])
                destination = "{name}.xml".format(name=name)
                if not os.path.exists(destination):
                    # the URL is single-quoted so that characters such as
                    # '&' or '?' are not interpreted by the shell
                    output = shellout("""wget -q --retry-connrefused
                                      '{url}' -O {output}""", url=row.url)
                    luigi.File(output).move(destination)
                self.logger.debug("{0}/{1} {2}".format(i, size, row.url))

        # write "receipt"
        with self.output().open('w') as output:
            for path in iterfiles(target):
                if path.endswith('.xml'):
                    output.write_tsv(path)
Example #23
0
 def run(self):
     """ Rsync the configured `pilsen.src` into the task directory and
     write every path found there to the output, one per TSV row, leaving
     out paths that contain `-luigi-tmp-`. """
     def is_final(p):
         # drop anything carrying the '-luigi-tmp-' marker
         return '-luigi-tmp-' not in p

     shellout("rsync -avz {src} {dst}",
              src=config.get('pilsen', 'src'),
              dst=self.taskdir())

     with self.output().open('w') as receipt:
         for synced in iterfiles(self.taskdir(), fun=is_final):
             receipt.write_tsv(synced)