def pypi_build(name, target='deb'):
    """ Take a package name and return the filename of the built package. """
    cache_key = hashlib.sha1('%s:%s' % (name, target)).hexdigest()
    cache = shelve.open(CACHE)
    if cache_key not in cache:
        logger.debug('Building %s for %s...' % (target, name))
        stopover = tempfile.mkdtemp(prefix='pkpy-')
        try:
            shellout("""cd {stopover} && fpm --verbose -s python -t {target} {name}""",
                     stopover=stopover, name=name, target=target)
            src = next(iterfiles(stopover))
            basename = os.path.basename(src)
            dst = os.path.join(PACKAGE_CACHE, basename)
            shutil.copyfile(src, dst)
            shutil.rmtree(stopover)
            cache[cache_key] = basename
        except RuntimeError as err:
            logger.error(err)
            cache.close()
            return abort(404)
    else:
        logger.debug('Cache hit...')
    filename = cache[cache_key]
    cache.close()
    return filename
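# Every snippet in this section leans on a gluish-style `shellout` helper.
# A minimal sketch of its semantics, for context only; the real helper also
# takes care of quoting, encodings and logging, so treat this as an
# assumption, not the actual implementation:
import os
import subprocess
import tempfile

def shellout(template, **kwargs):
    """Format `template` with kwargs, run it through the shell and return
    the path bound to the {output} placeholder; a temporary file is created
    if the caller did not pass one. Raises RuntimeError on non-zero exit."""
    if 'output' not in kwargs:
        fd, kwargs['output'] = tempfile.mkstemp(prefix='shellout-')
        os.close(fd)
    command = template.format(**kwargs)
    if subprocess.call(command, shell=True) != 0:
        raise RuntimeError('command failed: %s' % command)
    return kwargs['output']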
def run(self): """ The indicator is always recreated, while the subdir for a given (host, username, base, pattern) is just synced. """ base = os.path.dirname(self.output().path) subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format( host=self.host, username=self.username, base=self.base, pattern=self.pattern)).hexdigest() # target is the root of the mirror target = os.path.join(base, subdir) if not os.path.exists(target): os.makedirs(target) command = """lftp -u {username},{password} -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0 --only-newer -I {pattern} {base} {target}; exit" {host}""" shellout(command, host=self.host, username=pipes.quote(self.username), password=pipes.quote(self.password), pattern=pipes.quote(self.pattern), target=pipes.quote(target), base=pipes.quote(self.base)) with self.output().open('w') as output: for path in iterfiles(target): logger.debug("Mirrored: %s" % path) output.write_tsv(path)
def run(self):
    target = os.path.dirname(self.output().path)
    # the configured value is already a complete glob pattern
    pattern = config.get('lfer', 'glob')
    shellout("rsync -avz {src} {target}", src=pattern, target=target)
    with self.output().open('w') as output:
        for path in sorted(iterfiles(target)):
            output.write_tsv(path)
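# `iterfiles` walks a directory tree and yields file paths, optionally
# filtered by a predicate. A minimal sketch of the gluish-style helper,
# under the assumption that it behaves like a filtered os.walk:
import os

def iterfiles(directory='.', fun=lambda path: True):
    """Yield all paths below `directory` for which `fun(path)` is true."""
    for root, _, files in os.walk(directory):
        for name in files:
            path = os.path.join(root, name)
            if fun(path):
                yield path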
def run(self):
    target = os.path.join(os.path.dirname(self.output().path), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    _, errorlog = tempfile.mkstemp(prefix='siskin-')
    stylesheet = self.input().get('stylesheet').path
    size = wc(self.input().get('filelist').path)

    with self.input().get('filelist').open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=('path',)), start=1):
            basename = os.path.basename(row.path)
            name = basename.replace(".xml", ".marcxml")
            destination = os.path.join(target, name)
            if not os.path.exists(destination):
                try:
                    output = shellout("xsltproc {xsl} {input} > {output}",
                                      input=row.path, xsl=stylesheet)
                    luigi.File(output).move(destination)
                except RuntimeError as err:
                    self.logger.error("{0}: {1}".format(row.path, err))
                    with open(errorlog, 'a') as log:
                        log.write('%s\t%s\n' % (row.path, err))
            self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

    # write receipt
    with self.output().open('w') as output:
        for path in iterfiles(target):
            output.write_tsv(path)

    # this is just a temporary artefact for now
    self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
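# `wc` above is a line-count helper used to report progress against the
# total. A minimal sketch, assuming plain text input:
def wc(path):
    """Return the number of lines in the file at `path`."""
    with open(path) as handle:
        return sum(1 for line in handle)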
def github_clone_and_build(username, repo, target='deb'):
    """ Clone a repo (username, repo) and build the `target` package.
    Returns the filename, which is placed directly under `static`. """
    repo_url = '[email protected]:%s/%s.git' % (username, repo)
    cache_key = hashlib.sha1('%s:%s' % (repo_url, target)).hexdigest()
    cache = shelve.open(CACHE)
    if cache_key not in cache:
        logger.debug('Building (%s, %s) ...' % (repo_url, target))
        stopover = tempfile.mkdtemp(prefix='pkpy-')
        shellout("""cd {stopover} && git clone {repo_url} && cd {repo} &&
                 fpm --verbose -s python -t {target} .""",
                 stopover=stopover, repo_url=repo_url, repo=repo, target=target)
        src = next(iterfiles(stopover, fun=lambda fn: fn.endswith(target)))
        basename = os.path.basename(src)
        dst = os.path.join(PACKAGE_CACHE, basename)
        shutil.copyfile(src, dst)
        shutil.rmtree(stopover)
        cache[cache_key] = basename
    else:
        logger.debug('Cache hit...')
    filename = cache[cache_key]
    cache.close()
    return filename
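# Since pypi_build and github_clone_and_build call flask.abort and place
# packages under a static directory, they are meant to back web endpoints.
# A hypothetical usage sketch, assuming a Flask app; the route below is
# illustrative and not part of the original module:
from flask import Flask, send_from_directory

app = Flask(__name__)

@app.route('/pypi/<name>/<target>')
def serve_pypi_package(name, target):
    filename = pypi_build(name, target=target)
    return send_from_directory(PACKAGE_CACHE, filename, as_attachment=True)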
def run(self):
    source = os.path.join(config.get('core', 'swb-mirror'), 'nationallizenzen')
    target = os.path.dirname(self.output().path)
    shellout("rsync -avz {source} {target}", source=source, target=target)
    with self.output().open('w') as output:
        for path in iterfiles(target):
            output.write_tsv(path)
def run(self): stopover = tempfile.mkdtemp(prefix='siskin-') shellout("scp {origin} {stopover}".format(origin=config.get('pao', 'scp-src'), stopover=stopover)) _, combined = tempfile.mkstemp(prefix='siskin-') for path in iterfiles(directory=stopover, fun=lambda path: re.search(r'pao[\d].mrc', path)): shellout("cat {path} >> {output}", path=path, output=combined) luigi.File(combined).move(self.output().path)
def run(self):
    stopover = tempfile.mkdtemp(prefix='siskin-')
    origin = config.get('gbv', 'scp-src').format(tag=self.tag)
    shellout("scp {origin} {output}", origin=origin, output=stopover)
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for path in iterfiles(stopover):
        shellout("cat {input} >> {output}", input=path, output=combined)
    luigi.File(combined).move(self.output().path)
def run(self):
    target = tempfile.mkdtemp(prefix='siskin-')
    shellout("scp -rCpq {src} {target}", src=config.get('gbi', 'scp-src'),
             target=target)
    if not os.path.exists(self.taskdir()):
        os.makedirs(self.taskdir())
    dst = os.path.join(self.taskdir(), 'mirror')
    shellout("mv {target} {output}", target=target, output=dst)
    with self.output().open('w') as output:
        for path in iterfiles(dst):
            output.write_tsv(path)
def run(self):
    target = os.path.join(self.taskdir(), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    shellout("wget --retry-connrefused -O {stopover} '{url}' && unzip -o -d {dir} {stopover}",
             dir=target, stopover=stopover, url=self.url)
    files = list(iterfiles(target))
    if len(files) != 1:
        raise RuntimeError('expected exactly one file, got %s' % len(files))
    luigi.File(files[0]).move(self.output().path)
def run(self):
    # gather files
    stopover = tempfile.mkdtemp(prefix='siskin-')
    shellout("scp {origin} {output}", origin=config.get('ksd', 'scp-src'),
             output=stopover)
    # combine files, newest first
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for path in sorted(iterfiles(stopover), reverse=True):
        shellout("cat {input} >> {output}", input=path, output=combined)
    # clean dups
    output = shellout("marcuniq {input} > {output}", input=combined)
    luigi.File(output).move(self.output().path)
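# `marcuniq` is an external deduplication tool; because the files are
# combined newest first, the first occurrence of a record wins. A rough
# sketch of the same idea with pymarc (an assumption, not the actual tool),
# deduplicating on the 001 control field:
from pymarc import MARCReader, MARCWriter

def marc_uniq(src, dst):
    """Copy MARC records from src to dst, keeping the first per 001."""
    seen = set()
    with open(src, 'rb') as inp, open(dst, 'wb') as outp:
        writer = MARCWriter(outp)
        for record in MARCReader(inp):
            fields = record.get_fields('001')
            ident = fields[0].value() if fields else None
            if ident in seen:
                continue
            seen.add(ident)
            writer.write(record)
        writer.close()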
def run(self):
    filemap = {
        'all': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur.zip',
        'v1': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-1.zip',
        'v2': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-2.zip',
    }
    if self.corpus not in filemap:
        raise RuntimeError('available corpus ids: all, v1, v2')
    output = shellout("wget --retry-connrefused '{url}' -O {output}",
                      url=filemap[self.corpus])
    shellout("unzip -d '{dir}' {input}", dir=self.input().get('dir').path,
             input=output)
    with self.output().open('w') as output:
        for path in iterfiles(self.input().get('dir').path):
            output.write_tsv(path.encode('utf-8'))
def run(self):
    target = os.path.join(self.taskdir(), self.version, self.format)
    if not os.path.exists(target):
        os.makedirs(target)
    url = os.path.join(self.base, self.version, "datasets", self.format)
    stopover = tempfile.mkdtemp(prefix='siskin-')
    shellout("""wget -q -nd -P {directory} -rc -np -A.{format}.gz '{url}'""",
             url=url, directory=stopover, format=self.format)
    for path in glob.glob(unicode(os.path.join(stopover, '*'))):
        dst = os.path.join(target, os.path.basename(path))
        if not os.path.exists(dst):
            # a plain rename, hence atomic, given path and target are on the
            # same device; shutil.move falls back to a copy otherwise
            shutil.move(path, target)
    with self.output().open('w') as output:
        for path in iterfiles(target, fun=lambda p: p.endswith('nt.gz')):
            output.write_tsv(self.version, self.format, path)
def run(self):
    stopover = tempfile.mkdtemp(prefix='gluish-')
    oai_harvest(url=self.url, begin=self.begin, end=self.end,
                prefix=self.prefix, directory=stopover,
                collection=self.collection, delay=self.delay)
    with self.output().open('w') as output:
        output.write("""<collection xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
""")
        for path in iterfiles(stopover):
            with open(path) as handle:
                soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                for record in soup.findAll('record'):
                    output.write(str(record))  # or unicode?
        output.write('</collection>\n')
def run(self):
    line_count = sum(1 for line in open(self.filename))
    # ceiling division, so the file splits into at most self.chunks chunks;
    # e.g. 10 lines and 4 chunks gives 3 lines per chunk
    lines = (line_count + self.chunks - 1) // self.chunks
    taskdir = os.path.dirname(self.output().path)
    if not os.path.exists(taskdir):
        os.makedirs(taskdir)
    prefix = random_string()
    shellout("cd {taskdir} && split -l {lines} {input} {prefix}",
             taskdir=taskdir, lines=lines, input=self.filename, prefix=prefix)
    with self.output().open('w') as output:
        for path in sorted(iterfiles(taskdir)):
            if os.path.basename(path).startswith(prefix):
                output.write_tsv(path)
def run(self): """ Harvest files for a certain timeframe in a temporary directory, then combine all records into a single file. """ stopover = tempfile.mkdtemp(prefix='tasktree-') oai_harvest(url="http://oai.bnf.fr/oai2/OAIHandler", begin=self.begin, end=self.end, prefix=self.prefix, directory=stopover, collection=self.collection) with self.output().open('w') as output: output.write("""<collection xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">""") for path in iterfiles(stopover): with open(path) as handle: soup = BeautifulSoup.BeautifulStoneSoup(handle.read()) for record in soup.findAll('record'): output.write(str(record)) # or unicode? output.write('</collection>\n')
def run(self):
    base = "http://www.universitypressscholarship.com/"
    with self.input().open() as handle:
        for row in handle.iter_tsv(cols=('path',)):
            dirname, basename = row.path.split('/')[-2:]
            slugged = dirname.replace('%20', '-').lower()
            url = urlparse.urljoin(base, row.path)
            dst = os.path.join(self.taskdir(), '{0}-{1}'.format(slugged, basename))
            if os.path.exists(dst):
                continue
            output = shellout("""wget --retry-connrefused "{url}" -O {output}""",
                              url=url)
            luigi.File(output).move(dst)

    with self.output().open('w') as output:
        for path in iterfiles(self.taskdir()):
            if not path.endswith('mrc'):
                continue
            output.write_tsv(path)
def run(self):
    prefix = "{0}-".format(random_string())
    shellout("cd {tmp} && split -l {lines} -a 8 {input} {prefix} && cd -",
             lines=self.lines, tmp=tempfile.gettempdir(),
             input=self.input().path, prefix=prefix)
    target = self.taskdir()
    if not os.path.exists(target):
        os.makedirs(target)
    with self.output().open("w") as output:
        for path in iterfiles(tempfile.gettempdir()):
            filename = os.path.basename(path)
            if filename.startswith(prefix):
                dst = os.path.join(target, filename)
                shutil.move(path, dst)
                output.write_tsv(dst)
def run(self):
    # create target subdirectory
    target = os.path.join(os.path.dirname(self.output().path), str(self.closest()))
    if not os.path.exists(target):
        os.makedirs(target)
    size = wc(self.input().path)

    with self.input().open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=('url',)), start=1):
            name = os.path.join(target, row.url.split('/')[-2])
            destination = "{name}.xml".format(name=name)
            if not os.path.exists(destination):
                output = shellout("""wget -q --retry-connrefused {url} -O {output}""",
                                  url=row.url)
                luigi.File(output).move(destination)
            self.logger.debug("{0}/{1} {2}".format(i, size, row.url))

    # write "receipt"
    with self.output().open('w') as output:
        for path in iterfiles(target):
            if path.endswith('.xml'):
                output.write_tsv(path)
def run(self): shellout("rsync -avz {src} {dst}", src=config.get('pilsen', 'src'), dst=self.taskdir()) with self.output().open('w') as output: for path in iterfiles(self.taskdir(), fun=lambda p: '-luigi-tmp-' not in p): output.write_tsv(path)