def run(self):
    target = os.path.join(os.path.dirname(self.output().path), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    _, errorlog = tempfile.mkstemp(prefix="siskin-")
    stylesheet = self.input().get("stylesheet").path
    size = wc(self.input().get("filelist").path)

    with self.input().get("filelist").open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=("path",)), start=1):
            basename = os.path.basename(row.path)
            name = basename.replace(".xml", ".marcxml")
            destination = os.path.join(target, name)
            if not os.path.exists(destination):
                try:
                    output = shellout("xsltproc {xsl} {input} > {output}",
                                      input=row.path, xsl=stylesheet)
                    luigi.File(output).move(destination)
                except RuntimeError as err:
                    self.logger.error("{0}: {1}".format(row.path, err))
                    with open(errorlog, "a") as log:
                        log.write("%s\t%s\n" % (row.path, err))
            self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

    # write receipt
    with self.output().open("w") as output:
        for path in iterfiles(target):
            output.write_tsv(path)

    # this is just a temporary artefact for now
    self.logger.debug("Conversion errors logged at: {0}".format(errorlog))

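# This task and the download loop further down use a small `wc` helper to
# obtain a total line count for progress logging. The actual helper lives in
# siskin/gluish; the following is only a minimal sketch of the assumed
# behavior.
def wc(path):
    """Return the number of lines in the file at `path`, like `wc -l`."""
    with open(path, 'rb') as handle:
        return sum(1 for _ in handle)
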
def run(self):
    source = os.path.join(config.get('core', 'swb-mirror'), 'nationallizenzen')
    target = os.path.dirname(self.output().path)
    shellout("rsync -avz {source} {target}", source=source, target=target)
    with self.output().open('w') as output:
        for path in iterfiles(target):
            output.write_tsv(path)

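# The `write_tsv`/`iter_tsv` calls used throughout come from a TSV-flavored
# luigi target format (gluish), where they are methods on the opened target
# handle. The free-function sketch below only illustrates the assumed
# behavior: write tab-joined values, read rows back as named tuples.
import collections

def write_tsv(stream, *values):
    stream.write('\t'.join(str(v) for v in values) + '\n')

def iter_tsv(stream, cols=()):
    Row = collections.namedtuple('Row', cols)
    for line in stream:
        yield Row(*line.rstrip('\n').split('\t'))
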
def run(self):
    target = os.path.dirname(self.output().path)
    pattern = config.get('lfer', 'glob')
    shellout("rsync -avz {src} {target}", src=pattern, target=target)
    with self.output().open('w') as output:
        for path in sorted(iterfiles(target)):
            output.write_tsv(path)

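# Nearly every task here relies on `iterfiles` to enumerate a directory tree.
# A minimal sketch of the assumed contract (the real gluish implementation
# may differ in signature details): walk `directory` recursively and yield
# paths for which the predicate `fun` holds.
import os

def iterfiles(directory='.', fun=lambda path: True):
    for root, _, files in os.walk(directory):
        for name in files:
            path = os.path.join(root, name)
            if fun(path):
                yield path
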
def run(self): """ The indicator is always recreated, while the subdir for a given (host, username, base, pattern) is just synced. """ base = os.path.dirname(self.output().path) subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format( host=self.host, username=self.username, base=self.base, pattern=self.pattern)).hexdigest() # target is the root of the mirror target = os.path.join(base, subdir) if not os.path.exists(target): os.makedirs(target) command = """lftp -u {username},{password} -e "set net:max-retries {max_retries}; set net:timeout {timeout}; mirror --verbose=0 --only-newer -I {pattern} {base} {target}; exit" {host}""" shellout(command, host=self.host, username=pipes.quote(self.username), password=pipes.quote(self.password), pattern=pipes.quote(self.pattern), target=pipes.quote(target), base=pipes.quote(self.base), max_retries=self.max_retries, timeout=self.timeout) with self.output().open('w') as output: for path in iterfiles(target): self.logger.debug("Mirrored: %s" % path) output.write_tsv(path)
def run(self):
    # create target subdirectory
    target = os.path.join(os.path.dirname(self.output().path), str(self.closest()))
    if not os.path.exists(target):
        os.makedirs(target)

    size = wc(self.input().path)

    with self.input().open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=("url",)), start=1):
            name = os.path.join(target, row.url.split("/")[-2])
            destination = "{name}.xml".format(name=name)
            if not os.path.exists(destination):
                output = shellout("""wget -q --retry-connrefused {url} -O {output}""",
                                  url=row.url)
                luigi.File(output).move(destination)
            self.logger.debug("{0}/{1} {2}".format(i, size, row.url))

    # write "receipt"
    with self.output().open("w") as output:
        for path in iterfiles(target):
            if path.endswith(".xml"):
                output.write_tsv(path)

def run(self): stopover = tempfile.mkdtemp(prefix='siskin-') shellout("scp {origin} {stopover}".format(origin=config.get('pao', 'scp-src'), stopover=stopover)) _, combined = tempfile.mkstemp(prefix='siskin-') for path in iterfiles(directory=stopover, fun=lambda path: re.search(r'pao[\d].mrc', path)): shellout("cat {path} >> {output}", path=path, output=combined) luigi.File(combined).move(self.output().path)
def run(self):
    stopover = tempfile.mkdtemp(prefix='siskin-')
    origin = config.get('gbv', 'scp-src').format(tag=self.tag)
    shellout("scp {origin} {output}", origin=origin, output=stopover)
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for path in iterfiles(stopover):
        shellout("cat {input} >> {output}", input=path, output=combined)
    luigi.File(combined).move(self.output().fn)

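# `shellout` is the workhorse of all these tasks. The gluish original has
# more options (ignoremap, preserve_whitespace, and others); this sketch only
# illustrates the assumed contract: fill in the template, auto-create a
# temporary file for a missing {output} placeholder, run the command through
# a shell, raise RuntimeError on a nonzero exit, and return the output path.
import subprocess
import tempfile

def shellout(template, **kwargs):
    if 'output' not in kwargs:
        _, kwargs['output'] = tempfile.mkstemp(prefix='siskin-')
    command = template.format(**kwargs)
    code = subprocess.call(command, shell=True)
    if code != 0:
        raise RuntimeError('command failed with %s: %s' % (code, command))
    return kwargs['output']
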
def run(self):
    target = os.path.join(self.taskdir(), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    shellout("wget --retry-connrefused -O {stopover} '{url}' && unzip -o -d {dir} {stopover}",
             dir=target, stopover=stopover, url=self.url)
    files = list(iterfiles(target))
    if len(files) != 1:
        raise RuntimeError('expected exactly one file, got %d' % len(files))
    luigi.File(files[0]).move(self.output().path)

def run(self):
    directory = self.config.get('elsevierjournals', 'backlog-dir')
    _, output = tempfile.mkstemp(prefix='siskin-')
    for path in sorted(iterfiles(directory, fun=lambda p: p.endswith('.tar'))):
        shellout("span-import -i elsevier-tar {input} | pigz -c >> {output}",
                 input=path, output=output)
    luigi.LocalTarget(output).move(self.output().path)

def run(self):
    # gather files
    stopover = tempfile.mkdtemp(prefix='siskin-')
    shellout("scp {origin} {output}", origin=config.get('ksd', 'scp-src'), output=stopover)

    # combine files
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for path in sorted(iterfiles(stopover), reverse=True):
        shellout("cat {input} >> {output}", input=path, output=combined)

    # clean dups
    output = shellout("marcuniq {input} > {output}", input=combined)
    luigi.File(output).move(self.output().path)

def run(self): target = os.path.join(self.taskdir(), 'mirror') shellout("mkdir -p {target} && rsync {rsync_options} {src} {target}", rsync_options=self.config.get('gbi', 'rsync-options', '-avzP'), src=self.config.get('gbi', 'scp-src'), target=target) if not os.path.exists(self.taskdir()): os.makedirs(self.taskdir()) with self.output().open('w') as output: for path in iterfiles(target): output.write_tsv(path)
def run(self):
    filemap = {
        'all': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur.zip',
        'v1': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-1.zip',
        'v2': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-2.zip',
    }
    if self.corpus not in filemap:
        raise RuntimeError('available corpus ids: all, v1, v2')
    output = shellout("wget --retry-connrefused '{url}' -O {output}", url=filemap[self.corpus])
    shellout("unzip -d '{dir}' {input}", dir=self.input().get('dir').path, input=output)
    with self.output().open('w') as output:
        for path in iterfiles(self.input().get('dir').path):
            output.write_tsv(path)

def run(self): target = os.path.join(self.taskdir(), 'mirror') shellout("mkdir -p {target} && rsync {rsync_options} {src} {target}", rsync_options=self.config.get('cambridge', 'rsync-options', fallback='-avzP'), src=self.config.get('cambridge', 'scp-src'), target=target) if not os.path.exists(self.taskdir()): os.makedirs(self.taskdir()) with self.output().open('w') as output: for path in iterfiles(target): output.write_tsv(path)
def run(self):
    prefix = '{0}-'.format(random_string())
    output = shellout("cd {tmp} && split -l {lines} -a 8 {input} {prefix} && cd -",
                      lines=self.lines, tmp=tempfile.gettempdir(),
                      input=self.input().path, prefix=prefix)
    target = self.taskdir()
    if not os.path.exists(target):
        os.makedirs(target)
    with self.output().open('w') as output:
        for path in iterfiles(tempfile.gettempdir()):
            filename = os.path.basename(path)
            if filename.startswith(prefix):
                dst = os.path.join(target, filename)
                shutil.move(path, dst)
                output.write_tsv(dst)

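# `random_string` above only needs to produce a collision-unlikely prefix for
# the split chunks; a plausible sketch (the actual helper may differ):
import random
import string

def random_string(length=16):
    return ''.join(random.choice(string.ascii_letters) for _ in range(length))
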
def run(self):
    with self.input().open() as handle:
        doc = json.load(handle)
    tempdir = tempfile.mkdtemp(prefix='tmp-siskin-')
    for attachment in doc['issue']['attachments']:
        target = os.path.join(tempdir, os.path.basename(attachment["content_url"]))
        shellout("""curl -vL --fail -H "X-Redmine-API-Key:{apikey}" -o {target} "{url}" """,
                 url=attachment["content_url"],
                 apikey=self.config.get("redmine", "apikey"),
                 target=target)
    with self.output().open('w') as output:
        for path in iterfiles(tempdir):
            self.logger.debug("Downloaded: %s", path)
            output.write_tsv(path)

def run(self):
    target = os.path.join(self.taskdir(), self.version, self.format)
    if not os.path.exists(target):
        os.makedirs(target)
    url = os.path.join(self.base, self.version, "datasets", self.format)
    stopover = tempfile.mkdtemp(prefix='siskin-')
    shellout("""wget -q -nd -P {directory} -rc -np -A.{format}.gz '{url}'""",
             url=url, directory=stopover, format=self.format)
    for path in glob.glob(os.path.join(stopover, '*')):
        dst = os.path.join(target, os.path.basename(path))
        if not os.path.exists(dst):
            # this is atomic, given path and target are on the same device
            shutil.move(path, target)
    with self.output().open('w') as output:
        for path in iterfiles(target, fun=lambda p: p.endswith('nt.gz')):
            output.write_tsv(self.version, self.format, path)

def run(self): base = "http://www.universitypressscholarship.com/" with self.input().open() as handle: for row in handle.iter_tsv(cols=('path',)): dirname, basename = row.path.split('/')[-2:] slugged = dirname.replace('%20', '-').lower() url = urlparse.urljoin(base, row.path) dst = os.path.join(self.taskdir(), '{0}-{1}'.format(slugged, basename)) if os.path.exists(dst): continue output = shellout("""wget --retry-connrefused "{url}" -O {output} """, url=url) luigi.File(output).move(dst) with self.output().open('w') as output: for path in iterfiles(self.taskdir()): if not path.endswith('mrc'): continue output.write_tsv(path)
def run(self): """ The indicator is always recreated, while the subdir for a given (host, username, base, pattern) is just synced. """ base = os.path.dirname(self.output().path) subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format( host=self.host, username=self.username, base=self.base, pattern=self.pattern).encode('utf-8')).hexdigest() target = os.path.join(base, subdir) # target is the root of the mirror if not os.path.exists(target): os.makedirs(target) exclude_glob = "" if not self.exclude_glob == "": exclude_glob = "--exclude-glob %s" % self.exclude_glob command = """lftp -u {username},{password} -e " set sftp:auto-confirm yes; set net:max-retries {max_retries}; set net:timeout {timeout}; set mirror:parallel-directories 1; set ssl:verify-certificate no; set ftp:ssl-protect-data true; mirror --verbose=0 --only-newer {exclude_glob} -I {pattern} {base} {target}; exit" {host}""" shellout(command, host=self.host, username=pipes.quote(self.username), password=pipes.quote(self.password), pattern=pipes.quote(self.pattern), target=pipes.quote(target), base=pipes.quote(self.base), max_retries=self.max_retries, timeout=self.timeout, exclude_glob=exclude_glob) with self.output().open('w') as output: for path in iterfiles(target): self.logger.debug("Mirrored: %s", path) output.write_tsv(path)
def run(self): shellout("rsync -avz {src} {dst}", src=config.get('liberec', 'src'), dst=self.taskdir()) with self.output().open('w') as output: for path in iterfiles(self.taskdir(), fun=lambda p: '-luigi-tmp-' not in p): output.write_tsv(path)
def run(self): directory = self.config.get("elsevierjournals", "backlog-dir") _, output = tempfile.mkstemp(prefix="siskin-") for path in sorted(iterfiles(directory, fun=lambda p: p.endswith(".tar"))): shellout("span-import -i elsevier-tar {input} | pigz -c >> {output}", input=path, output=output) luigi.File(output).move(self.output().path)
def run(self): """ Iterate over all zipfiles in reverse, convert and concat binary marc into tempfile, then deduplicate. """ # Load all deletions into set. deleted = set() deldir = os.path.dirname(self.input().get('deletions').path) for path in sorted(iterfiles(deldir), reverse=True): with open(path) as handle: for i, line in enumerate(handle, start=1): line = line.strip() if len(line) > 20: self.logger.warn("suspicious id: %s", line) deleted.add(line) # Load updates. pattern = re.compile(r'^date-[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}.zip$') datadir = os.path.dirname(self.input().get('data').path) # Combine all binary MARC records in this file. _, combined = tempfile.mkstemp(prefix='siskin-') for path in sorted(iterfiles(datadir), reverse=True): filename = os.path.basename(path) if not pattern.match(filename): self.logger.warn("ignoring invalid filename: %s", path) continue if os.stat(path).st_size < 22: self.logger.warn("ignoring possibly empty zip file: %s", path) continue with zipfile.ZipFile(path) as zf: for name in zf.namelist(): with zf.open(name) as handle: with tempfile.NamedTemporaryFile(delete=False) as dst: shutil.copyfileobj(handle, dst) shellout( "yaz-marcdump -i marcxml -o marc {input} >> {output}", input=dst.name, output=combined, ignoremap={5: 'expected error from yaz'}) os.remove(dst.name) # Finally, concatenate initial dump. shellout("cat {input} >> {output}", input=self.input().get('dump').path, output=combined) # Already seen identifier. seen = set() with self.output().open('wb') as output: writer = pymarc.MARCWriter(output) # Iterate over MARC records (which are newest to oldest, keep track of seen identifiers). with open(combined) as handle: reader = pymarc.MARCReader(handle, force_utf8=True, to_unicode=True) for record in reader: field = record["001"] if not field: self.logger.debug("missing identifier") continue id = field.value() if id in seen: self.logger.debug("skipping duplicate: %s", id) continue if id in deleted: self.logger.debug("skipping deleted: %s", id) continue self.logger.debug("adding %s", id) writer.write(record) seen.add(id) self.logger.debug( "found %s unique records (deletion list contained %s ids)", len(seen), len(deleted)) os.remove(combined)
def run(self): """ Iterate over all zipfiles in reverse, convert and concat binary marc into tempfile, then deduplicate. """ # Load all deletions into set. deleted = set() deldir = os.path.dirname(self.input().get('deletions').path) for path in sorted(iterfiles(deldir), reverse=True): with open(path) as handle: for i, line in enumerate(handle, start=1): line = line.strip() if len(line) > 20: self.logger.warn("suspicious id: %s", line) deleted.add(line) # Load updates. pattern = re.compile(r'^date-[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}.zip$') datadir = os.path.dirname(self.input().get('data').path) # Combine all binary MARC records in this file. _, combined = tempfile.mkstemp(prefix='siskin-') for path in sorted(iterfiles(datadir), reverse=True): filename = os.path.basename(path) if not pattern.match(filename): self.logger.warn("ignoring invalid filename: %s", path) continue if os.stat(path).st_size < 22: self.logger.warn("ignoring possibly empty zip file: %s", path) continue with zipfile.ZipFile(path) as zf: for name in zf.namelist(): with zf.open(name) as handle: with tempfile.NamedTemporaryFile(delete=False) as dst: shutil.copyfileobj(handle, dst) shellout("yaz-marcdump -i marcxml -o marc {input} >> {output}", input=dst.name, output=combined, ignoremap={5: 'expected error from yaz'}) os.remove(dst.name) # Finally, concatenate initial dump. shellout("cat {input} >> {output}", input=self.input().get('dump').path, output=combined) # Already seen identifier. seen = set() with self.output().open('wb') as output: writer = pymarc.MARCWriter(output) # Iterate over MARC records (which are newest to oldest, keep track of seen identifiers). with open(combined) as handle: reader = pymarc.MARCReader(handle, force_utf8=True, to_unicode=True) for record in reader: field = record["001"] if not field: self.logger.debug("missing identifier") continue id = field.value() if id in seen: self.logger.debug("skipping duplicate: %s", id) continue if id in deleted: self.logger.debug("skipping deleted: %s", id) continue self.logger.debug("adding %s", id) writer.write(record) seen.add(id) self.logger.debug("found %s unique records (deletion list contained %s ids)", len(seen), len(deleted)) os.remove(combined)