def run(self):
    _, temp = tempfile.mkstemp(prefix='byoi-')
    with self.input().open() as handle:
        for path in map(str.strip, handle):
            print('processing: %s' % path)
            shellout("jq -r -c '.message.items[]' <(unpigz -c {input}) | pigz -c >> {output}",
                     input=path, output=temp)
    luigi.File(temp).move(self.output().path)
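# Every snippet in this collection leans on shellout() from the gluish
# library. As a rough mental model (a minimal sketch under assumptions, not
# gluish's actual implementation): the template is formatted with the keyword
# arguments, a temporary 'output' file is created when none is passed, the
# command runs through bash (which the <(...) process substitutions above
# require), a non-zero exit status raises RuntimeError, and the path of the
# output file is returned.
import re
import subprocess
import tempfile

def shellout_sketch(template, preserve_whitespace=False, **kwargs):
    if 'output' not in kwargs:
        _, kwargs['output'] = tempfile.mkstemp(prefix='gluish-')
    command = template.format(**kwargs)
    if not preserve_whitespace:
        # Collapse whitespace runs, so multi-line templates become one command.
        command = re.sub(r'\s+', ' ', command).strip()
    status = subprocess.call(command, shell=True, executable='/bin/bash')
    if status != 0:
        raise RuntimeError('command failed with %s: %s' % (status, command))
    return kwargs['output']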
def run(self):
    _, combined = tempfile.mkstemp(prefix='tasktree-')
    for target in self.input():
        shellout("cat {input} >> {output}", input=target.path, output=combined)
    output = shellout("LANG=C sort -k1,1 -k3,3 {input} > {output}", input=combined)
    luigi.File(output).move(self.output().fn)
def run(self):
    _, combined = tempfile.mkstemp(prefix='tasktree-')
    with self.input().open() as handle:
        for row in handle.iter_tsv(cols=('date', 'path')):
            shellout("unzip -p {input} >> {output}", input=row.path, output=combined)
    luigi.File(combined).move(self.output().path)
def run(self):
    diskdir = self.config.get('ceeol', 'disk-dir')
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    for path in glob.glob(os.path.join(diskdir, 'articles', 'articles_*xml')):
        shellout("span-import -i ceeol {input} | pigz -c >> {output}",
                 input=path, output=stopover)
    shellout("cat {update} | pigz -c >> {output}",
             update=self.input().path, output=stopover)
    luigi.LocalTarget(stopover).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    with self.input().open() as handle:
        groups = itertools.groupby(handle.iter_tsv(cols=('archive', 'member')),
                                   lambda row: row.archive)
        for archive, items in groups:
            # Write members to extract to temporary file.
            _, memberfile = tempfile.mkstemp(prefix='siskin-')
            with open(memberfile, 'w') as output:
                for item in items:
                    output.write("%s\n" % item.member)
            self.logger.debug("for archive %s extract via: %s", archive, memberfile)
            if not isinstance(archive, six.string_types):
                archive = archive.decode(encoding='utf-8')
            # The unzippa will not exhaust ARG_MAX.
            shellout("""unzippa -v -m {memberfile} {archive} |
                     sed -e 's@<?xml version="1.0" encoding="UTF-8"?>@@g' |
                     pigz -c >> {output}""",
                     archive=archive, memberfile=memberfile, output=stopover)
            try:
                os.remove(memberfile)
            except OSError as err:
                self.logger.warn(err)
    luigi.LocalTarget(stopover).move(self.output().path)
def complete(self):
    path = "{date}-kxp".format(date=self.yesterday.strftime("%y%m%d"))
    ids = set()
    if not os.path.exists(path):
        return False
    for index in os.listdir(path):
        for f in os.listdir(path + "/" + index):
            with gzip.open("{fd}".format(fd=path + "/" + index + "/" + f), "rt") as inp:
                for line in inp:
                    ids.add(json.loads(line).get("identifier"))
            cmd = "zcat {fd} | jq -rc .identifier >> schemaorg-ids-{date}.txt".format(
                fd=path + "/" + index + "/" + f,
                date=self.yesterday.strftime("%y%m%d"))
            shellout(cmd)
    es_ids = set()
    for record in esidfilegenerator(
            host="{host}".format(**self.config).rsplit("/")[-1].rsplit(":")[0],
            port="{host}".format(**self.config).rsplit("/")[-1].rsplit(":")[1],
            index="slub-resources",
            type="schemaorg",
            idfile="schemaorg-ids-{date}.txt".format(date=self.yesterday.strftime("%y%m%d")),
            source=False):
        es_ids.add(record.get("_id"))
    if len(es_ids) == len(ids) and len(es_ids) > 0:
        return True
    return False
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    temp = shellout("unpigz -c {input} > {output}", input=self.input().get('input').path)
    output = shellout("""jq -r '[.doi?, .["rft.issn"][]?, .["rft.eissn"][]?] | @csv' {input} |
                      LC_ALL=C sort -S50% > {output}""",
                      input=temp, output=stopover)
    os.remove(temp)
    luigi.File(output).move(self.output().path)
def run(self):
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for target in self.input():
        shellout("cat {input} >> {output}", input=target.path, output=combined)
    output = shellout("marcuniq {input} > {output}", input=combined)
    luigi.File(output).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    shellout("""jq -r '.doi' <(unpigz -c {input}) | grep -v "null" | grep -o "10.*" 2> /dev/null > {output}""",
             input=self.input().get('input').path, output=stopover)
    output = shellout("sort -u {input} > {output}", input=stopover)
    luigi.LocalTarget(output).move(self.output().path)
def run(self): output = shellout("xsltproc {stylesheet} {input} > {output}", input=self.input().path, stylesheet=self.assets('OAIDCtoMARCXML.xsl')) output = shellout("yaz-marcdump -i marcxml -o marc {input} > {output}", input=output) luigi.File(output).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    with self.input().open() as handle:
        for row in handle.iter_tsv(cols=('path',)):
            shellout("""unzip -l {input} | grep "xml$" | awk '{{print "{input}\t"$4}}' >> {output}""",
                     preserve_whitespace=True, input=row.path, output=stopover)
    luigi.File(stopover).move(self.output().path)
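# For orientation (hypothetical data, not from the source): the task above
# emits a TSV pairing each zip archive with one of its XML members, one pair
# per line, e.g.
#
#   /data/updates/2018-01.zip	article-0001.xml
#   /data/updates/2018-01.zip	article-0002.xml
#
# which is the (archive, member) layout that the unzippa-based extraction
# tasks elsewhere in this collection group by archive.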
def run(self): """ Download, maybe unzip, combine with Gold List. """ key = "http://amsl.technology/discovery/metadata-usage/Dokument/KBART_FREEJOURNALS" link = "%s%s" % (self.config.get('amsl', 'uri-download-prefix'), key) downloaded = shellout("curl --fail {link} > {output} ", link=link) _, stopover = tempfile.mkstemp(prefix='siskin-') try: _ = zipfile.ZipFile(downloaded) output = shellout("unzip -p {input} >> {output}", input=downloaded) except zipfile.BadZipfile: # At least the file is not a zip. output = shellout("cat {input} >> {output}", input=downloaded) # Include OA list, refs #11579. shellout("""curl -s https://pub.uni-bielefeld.de/download/2913654/2913655 | cut -d, -f1,2 | tr -d '"' | grep -E '[0-9]{{4,4}}-[0-9]{{3,3}}[0-9xX]' | tr ',' '\n' | awk '{{ print "\t\t"$0 }}' >> {output}""", output=output, preserve_whitespace=True) luigi.LocalTarget(output).move(self.output().path)
def run(self):
    archive = None
    with self.input().get('dblist-fulltext').open() as handle:
        for row in handle.iter_tsv(cols=('kind', 'path', 'db')):
            if row.db == self.db:
                archive = row.path
    with self.input().get('dblist-references').open() as handle:
        for row in handle.iter_tsv(cols=('kind', 'path', 'db')):
            if row.db == self.db:
                archive = row.path
    if archive is None:
        # A non-existent database name amounts to an empty dump file.
        self.logger.debug('no such db: %s' % self.db)
        with self.output().open('w') as output:
            pass
        return
    dbzip = shellout("7z x -so {archive} {db}.zip 2> /dev/null > {output}",
                     archive=archive, db=self.db)
    output = shellout("""unzip -p {dbzip} \*.xml 2> /dev/null |
                      iconv -f iso-8859-1 -t utf-8 |
                      LC_ALL=C grep -v "^<\!DOCTYPE GENIOS PUBLIC" |
                      LC_ALL=C sed -e 's@<?xml version="1.0" encoding="ISO-8859-1" ?>@@g' |
                      LC_ALL=C sed -e 's@</Document>@<x-origin>{origin}</x-origin><x-issue>{issue}</x-issue></Document>@' |
                      pigz -c >> {output}""",
                      dbzip=dbzip, origin=archive, issue=self.issue)
    luigi.File(output).move(self.output().path)
def complete(self):
    if not self.output().exists():
        return False
    with chdir(str(self.output().path)):
        output = shellout("""git fetch origin {branch} > {output} 2>&1""", branch=self.branch)
        result = True
        with open(output, 'rb') as fh:
            content = fh.readlines()
        if len(content) >= 3:
            result = False
        revparseoutput = shellout("""git rev-parse {branch} > {output} 2>&1""",
                                  branch=self.branch)
        originrevparseoutput = shellout("""git rev-parse origin/{branch} > {output} 2>&1""",
                                        branch=self.branch)
        revparse = getfirstline(revparseoutput, "0")
        originrevparse = getfirstline(originrevparseoutput, "1")
        if revparse != originrevparse:
            result = False
    return result
def run(self): """ @IHBhY2thZ2UgbWFpbgoK@IGltcG9ydCAoCiAg@@ImVuY29kaW5nL2pzb24iCiAg@@ImVuY29kaW5nL3htbCIK@@IC AibG9nIgog@@ICJvcyIKCiAg@@InN0cmluZ3MiCgog@@ICJnaXRodWIuY29tL21pa3UveG1sc3RyZWFtIgog@KQoK@ IHR5cGUgRGVzY3JpcHRpb24gc3RydWN0IHsK@@ICBYTUxOYW1l@IHhtbC5OYW1lIGB4bWw6IkRlc2NyaXB0aW9uImA K@@ICBJc3Nu@@IFtdc3RyaW5nIGB4bWw6Imlzc24iYAog@@IFNob3J0VGl0bGUgW11zdHJpbmcgYHhtbDoic2hvcnR UaXRsZSJgCiAg@@VGl0bGUg@ICBbXXN0cmluZyBgeG1sOiJ0aXRsZSJgCiAgICB9Cgog@ZnVuYyBtYWluKCkgewog@ @IHNtIDo9IG1ha2UobWFwW3N0cmluZ11zdHJpbmcpCiAg@@c2Nhbm5lciA6PSB4bWxzdHJlYW0uTmV3U2Nhbm5lcih vcy5TdGRpbiwgbmV3KERlc2NyaXB0aW9uKSkK@@ICBmb3Igc2Nhbm5lci5TY2FuKCkgewog@@@ICB0YWcgOj0gc2Nh bm5lci5FbGVtZW50KCkK@@@@aWYgdiwgb2sgOj0gdGFnLigqRGVzY3JpcHRpb24pOyBvayB7CiAg@@@@ICBpZiBsZW 4odi5TaG9ydFRpdGxlKSA9PSAwIHsK@@@@@@ICBjb250aW51ZQog@@@@@fQog@@@@@Zm9yIF8sIHMgOj0gcmFuZ2Ug di5TaG9ydFRpdGxlIHsK@@@@@@ICBmb3IgXywgdCA6PSByYW5nZSB2LlRpdGxlIHsK@@@@@@@@c21bc10gPSB0CiAg @@@@@@@IHNtW3N0cmluZ3MuVG9Mb3dlcihzKV0gPSB0CiAg@@@@@@fQog@@@@@fQog@@@ICB9CiAg@@fQog@@IGlmI GVyciA6PSBzY2FubmVyLkVycigpOyBlcnIgIT0gbmlsIHsK@@@@bG9nLkZhdGFsKGVycikK@@ICB9CiAg@@aWYgZXJ yIDo9IGpzb24uTmV3RW5jb2Rlcihvcy5TdGRvdXQpLkVuY29kZShzbSk7IGVyciAhPSBuaWwgewog@@@ICBsb2cuRm F0YWwoZXJyKQog@@IH0K@IH0K """ source = self.run.__doc__.replace("\n", "").replace(" ", "").replace("@", "ICAg") tempcode = shellout("""echo '{code}' | base64 -d > {output}.go """, code=source, preserve_whitespace=True) output = shellout( """ unpigz -c {input} | go run {code}.go > {output} """, code=tempcode, input=self.input().path) os.remove(tempcode) luigi.LocalTarget(output).move(self.output().path)
def run(self):
    _, tmpfile = tempfile.mkstemp(prefix='byoi-')
    for target in self.input():
        shellout("cat {input} >> {output}", input=target.path, output=tmpfile)
    luigi.File(tmpfile).move(self.output().path)
def run(self):
    _, output = tempfile.mkstemp(prefix='siskin-')
    for target in self.input():
        shellout("cat {input} >> {output}", input=target.path, output=output)
    luigi.LocalTarget(output).move(self.output().path)
def run(self):
    target = os.path.dirname(self.output().path)
    pattern = os.path.join(config.get('lfer', 'glob'))
    shellout("rsync -avz {src} {target}", src=pattern, target=target)
    with self.output().open('w') as output:
        for path in sorted(iterfiles(target)):
            output.write_tsv(path)
def run(self): """ Download, maybe unzip, combine with Gold List. """ key = "http://amsl.technology/discovery/metadata-usage/Dokument/KBART_FREEJOURNALS" link = "%s%s" % (self.config.get('amsl', 'uri-download-prefix'), key) downloaded = shellout("curl --fail {link} > {output} ", link=link) _, stopover = tempfile.mkstemp(prefix='siskin-') try: _ = zipfile.ZipFile(downloaded) output = shellout("unzip -p {input} >> {output}", input=downloaded) except zipfile.BadZipfile: # At least the file is not a zip. output = shellout("cat {input} >> {output}", input=downloaded) # Include OA list, refs #11579. shellout( """curl -s https://pub.uni-bielefeld.de/download/2913654/2913655 | cut -d, -f1,2 | tr -d '"' | grep -E '[0-9]{{4,4}}-[0-9]{{3,3}}[0-9xX]' | tr ',' '\n' | awk '{{ print "\t\t"$0 }}' >> {output}""", output=output, preserve_whitespace=True) luigi.LocalTarget(output).move(self.output().path)
def run(self): """ The indicator is always recreated, while the subdir for a given (host, username, base, pattern) is just synced. """ base = os.path.dirname(self.output().path) subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format( host=self.host, username=self.username, base=self.base, pattern=self.pattern)).hexdigest() # target is the root of the mirror target = os.path.join(base, subdir) if not os.path.exists(target): os.makedirs(target) command = """lftp -u {username},{password} -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0 --only-newer -I {pattern} {base} {target}; exit" {host}""" shellout(command, host=self.host, username=pipes.quote(self.username), password=pipes.quote(self.password), pattern=pipes.quote(self.pattern), target=pipes.quote(target), base=pipes.quote(self.base)) with self.output().open('w') as output: for path in iterfiles(target): logger.debug("Mirrored: %s" % path) output.write_tsv(path)
def run(self):
    source = os.path.join(config.get('core', 'swb-mirror'), 'nationallizenzen')
    target = os.path.dirname(self.output().path)
    shellout("rsync -avz {source} {target}", source=source, target=target)
    with self.output().open('w') as output:
        for path in iterfiles(target):
            output.write_tsv(path)
def run(self):
    paths = [p.strip() for p in self.config.get("ceeol", "updates").split(",") if p.strip()]
    _, stopover = tempfile.mkstemp(prefix="siskin-")
    self.logger.debug("found %d updates", len(paths))
    for p in paths:
        shellout("span-import -i ceeol-marcxml {input} >> {output}", input=p, output=stopover)
    luigi.LocalTarget(stopover).move(self.output().path)
def run(self):
    with self.input().open() as handle:
        holdings = json.load(handle)
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    # The property which contains the URI of the holding file. Might change.
    urikey = 'DokumentURI'
    for holding in holdings:
        if holding["ISIL"] == self.isil:
            if urikey not in holding:
                raise RuntimeError('possible AMSL API change, expected: %s, available keys: %s' % (
                    urikey, holding.keys()))
            # refs. #7142
            if 'kbart' not in holding[urikey].lower():
                self.logger.debug("skipping non-KBART holding URI: %s" % holding[urikey])
                continue
            link = "%s%s" % (self.config.get('amsl', 'uri-download-prefix'), holding[urikey])
            downloaded = shellout("curl --fail {link} > {output} ", link=link)
            try:
                _ = zipfile.ZipFile(downloaded)
                output = shellout("unzip -p {input} >> {output}", input=downloaded, output=stopover)
            except zipfile.BadZipfile:
                # At least the file is not a zip.
                output = shellout("cat {input} >> {output}", input=downloaded, output=stopover)
    luigi.File(stopover).move(self.output().path)
def run(self):
    paths = [p.strip() for p in self.config.get("ceeol", "updates").split(",") if p.strip()]
    _, stopover = tempfile.mkstemp(prefix="siskin-")
    self.logger.debug("found %d updates", len(paths))
    for p in paths:
        shellout("span-import -i ceeol-marcxml {input} >> {output}", input=p, output=stopover)
    # Append MARC updates. Note: stopover is re-created here, so only the
    # converted MARC updates end up in the final output.
    paths = [p.strip() for p in self.config.get("ceeol", "updates-marc").split(",") if p.strip()]
    _, stopover = tempfile.mkstemp(prefix="siskin-")
    self.logger.debug("found %d updates (MARC)", len(paths))
    with open(stopover, 'a') as output:
        for p in paths:
            self.logger.debug("converting: %s", p)
            for doc in convert_ceeol_to_intermediate_schema(p):
                output.write(json.dumps(doc) + "\n")
    luigi.LocalTarget(stopover).move(self.output().path)
def run(self): output = shellout("""python {script} {input} {output}""", script=self.assets('30/30_marcbinary.py'), input=self.input().path) output = shellout("""yaz-marcdump -o marcxml {input} > {output}""", input=output) luigi.LocalTarget(output).move(self.output().path)
def run(self): """ itereates over the in LODProcessFromRdi generated JSON-Linked-Data enriches them with identifier from entityfacts enriches them with subjects from the GND enriches them with identifier from wikidata enriches them with identifier from geonames, if its a geographic Place ingests them into a elasticsearch node """ path = "{date}-aut-data".format(date=self.yesterday.strftime("%y%m%d")) for index in os.listdir(path): # doing several enrichment things before indexing the data for f in os.listdir(path + "/" + index): cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && zcat {fd} | ".format( fd=path + "/" + index + "/" + f ) # with -pipeline, all the data get's thru, not only enriched docs # cmd+="~/git/efre-lod-elasticsearch-tools/enrichment/sameAs2id.py -pipeline -stdin -searchserver {host} | ".format(**self.config) cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/entityfacts-bot.py -pipeline -stdin -searchserver {host} | ".format( **self.config) cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/gnd-sachgruppen.py -pipeline -stdin -searchserver {host} | ".format( **self.config) cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/wikidata.py -pipeline -stdin | " if index == "geo": cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/geonames.py -pipeline -stdin -searchserver {geonames_host} | ".format( **self.config) cmd += "esbulk -verbose -server {host} -w 1 -size 20 -index {index} -type schemaorg -id identifier".format( **self.config, index=index) shellout(cmd) put_dict("{host}/date/actual/1".format(**self.config), {"date": str(self.yesterday.strftime("%Y-%m-%d"))})
def run(self): """ TODO(miku): This contains things, that would better be factored out in separate tasks. """ titles = {} output = shellout("span-gh-dump {hfile} > {output}", hfile=self.hfile) with luigi.File(output, format=TSV).open() as handle: for row in handle.iter_tsv(cols=('issn', 'title')): titles[row.issn] = row.title issns_held = set() output = shellout("xmlstarlet sel -t -v '//issn' {hfile} | sort | uniq > {output}", hfile=self.hfile) with luigi.File(output, format=TSV).open() as handle: for row in handle.iter_tsv(cols=('issn',)): issns_held.add(row.issn) issns_crossref = set() with self.input().open() as handle: for row in handle.iter_tsv(cols=('issn',)): issns_crossref.add(row.issn) covered = issns_held.intersection(issns_crossref) with self.output().open('w') as output: for issn in issns_held: if issn in covered: output.write_tsv("COVERED", issn, titles[issn]) else: output.write_tsv("NOT_COVERED", issn, titles[issn])
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    shellout(r"""cat {input} | awk '{{ print $0"\treferences"}}' >> {output}""",
             input=self.input().get('references').path, output=stopover)
    shellout(r"""cat {input} | awk '{{ print $0"\tfulltext"}}' >> {output}""",
             input=self.input().get('fulltext').path, output=stopover)
    luigi.File(stopover).move(self.output().path)
def pypi_build(name, target='deb'):
    """
    Take a package name and return the filename of the target.
    """
    cache_key = hashlib.sha1(('%s:%s' % (name, target)).encode('utf-8')).hexdigest()
    cache = shelve.open(CACHE)
    if cache_key not in cache:
        logger.debug('Building %s for %s...' % (target, name))
        stopover = tempfile.mkdtemp(prefix='pkpy-')
        try:
            shellout("""cd {stopover} && fpm --verbose -s python -t {target} {name}""",
                     stopover=stopover, name=name, target=target)
            src = next(iterfiles(stopover))
            basename = os.path.basename(src)
            dst = os.path.join(PACKAGE_CACHE, basename)
            shutil.copyfile(src, dst)
            shutil.rmtree(stopover)
            cache[cache_key] = basename
        except RuntimeError as err:
            logger.error(err)
            return abort(404)
    else:
        logger.debug('Cache hit...')
    filename = cache[cache_key]
    cache.close()
    return filename
def github_clone_and_build(username, repo, target='deb'):
    """
    Clone a repo (username, repo) and build the `target` package.
    Returns the filename, which is placed directly under `static`.
    """
    repo_url = 'git@github.com:%s/%s.git' % (username, repo)
    cache_key = hashlib.sha1(('%s:%s' % (repo_url, target)).encode('utf-8')).hexdigest()
    cache = shelve.open(CACHE)
    if cache_key not in cache:
        logger.debug('Building (%s, %s) ...' % (repo_url, target))
        stopover = tempfile.mkdtemp(prefix='pkpy-')
        shellout("""cd {stopover} && git clone {repo_url} && cd {repo} &&
                 fpm --verbose -s python -t {target} .""",
                 stopover=stopover, repo_url=repo_url, repo=repo, target=target)
        src = next(iterfiles(stopover, fun=lambda fn: fn.endswith(target)))
        basename = os.path.basename(src)
        dst = os.path.join(PACKAGE_CACHE, basename)
        shutil.copyfile(src, dst)
        shutil.rmtree(stopover)
        cache[cache_key] = basename
    else:
        logger.debug('Cache hit...')
    filename = cache[cache_key]
    cache.close()
    return filename
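# A hypothetical usage sketch (route name and app setup assumed, not from the
# original source): both build helpers above return a filename inside
# PACKAGE_CACHE, so serving the built artifact is a matter of handing that
# file to the client.
from flask import Flask, send_from_directory

app = Flask(__name__)

@app.route('/pypi/<name>/<target>')
def serve_pypi_package(name, target):
    filename = pypi_build(name, target=target)
    return send_from_directory(PACKAGE_CACHE, filename)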
def run(self):
    _, dbpath = tempfile.mkstemp(prefix='siskin-')
    shellout("cayley init -alsologtostderr -config {config} -dbpath={dbpath}",
             config=self.assets('cayley.bolt.conf'), dbpath=dbpath)
    shellout("GOMAXPROCS={gomaxprocs} cayley load -config {config} -alsologtostderr -dbpath={dbpath} --triples {input}",
             gomaxprocs=self.gomaxprocs, config=self.assets('cayley.bolt.conf'),
             dbpath=dbpath, input=self.input().get('ntriples').path)
    shutil.move(dbpath, self.output().path)
def run(self):
    grobid_dir = "work/{}/{}/grobid_tei".format(self.crawl, self.item)
    json_dir = "work/{}/{}/grobid_json".format(self.crawl, self.item)
    shellout("mkdir -p {json_dir}", json_dir=json_dir)
    # Generate fulltext json (one file per PDF).
    for tei_path in glob.glob(grobid_dir + "/*.tei.xml"):
        json_path = tei_path.replace('/grobid_tei/', '/grobid_json/').replace('.tei.xml', '.json')
        # jq in the pipeline validates JSON.
        shellout("bin/grobid2json.py {tei_path} | jq -c . > {json_path}",
                 tei_path=tei_path, json_path=json_path)
    # Just the header info (one json file for the whole item);
    # jq in the pipeline validates JSON.
    output = shellout("""cd {grobid_dir} &&
                      find . -name "*.tei.xml" |
                      parallel -j 4 ../../../../bin/grobid2json.py --no-encumbered {{}} |
                      jq -c . > {output}""",
                      grobid_dir=grobid_dir)
    luigi.LocalTarget(output).move(self.output().path)
def run(self): """ transforms the geonames TSV Dump to line-delimited JSON """ cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/helperscripts/tsv2json.py {file}.txt | gzip > {file}.ldj.gz".format( **self.config) shellout(cmd)
def run(self):
    url = self.config.get('thieme', 'oai')
    shellout("METHA_DIR={dir} metha-sync -set {set} -format {prefix} {url}",
             set=self.set, prefix=self.prefix, url=url,
             dir=self.config.get('core', 'metha-dir'))
    output = shellout("METHA_DIR={dir} metha-cat -format {prefix} {url} | pigz -c > {output}",
                      prefix=self.prefix, url=url,
                      dir=self.config.get('core', 'metha-dir'))
    luigi.File(output).move(self.output().path)
def run(self):
    pdf_dir = "work/{}/{}/pdfs".format(self.crawl, self.item)
    grobid_dir = "work/{}/{}/grobid_tei".format(self.crawl, self.item)
    shellout("mkdir -p {grobid_dir}", grobid_dir=grobid_dir)
    output = shellout("""/usr/bin/time -v -o {output} java -Xmx6G -jar {GROBID_JAR}
                      -gH {GROBID_HOME} -dIn {pdf_dir} -r -dOut {grobid_dir}
                      -exe processFullText""",
                      pdf_dir=pdf_dir, grobid_dir=grobid_dir,
                      GROBID_JAR=GROBID_JAR, GROBID_HOME=GROBID_HOME)
    # java/GROBID doesn't error when it runs. One way to check that it ran
    # is that there should be a TEI file for every file in the PDF dir.
    # assert(glob_count(pdf_dir + "/*.pdf") == glob_count(grobid_dir + "/*.tei.xml"))
    luigi.LocalTarget(output).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    for i in range(27):
        url = self.template.format(part=i)
        shellout("""curl --retry 1 --compress "{url}" >> {output}""", url=url, output=stopover)
    luigi.File(stopover).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    with self.input().open() as handle:
        groups = itertools.groupby(handle.iter_tsv(cols=('archive', 'member')),
                                   lambda row: row.archive)
        for archive, items in groups:
            # Write members to extract to temporary file.
            _, memberfile = tempfile.mkstemp(prefix='siskin-')
            with open(memberfile, 'w') as output:
                for item in items:
                    output.write("%s\n" % item.member)
            self.logger.debug("for archive %s extract via: %s", archive, memberfile)
            # The unzippa will not exhaust ARG_MAX.
            shellout("""unzippa -v -m {memberfile} {archive} |
                     sed -e 's@<?xml version="1.0" encoding="UTF-8"?>@@g' |
                     pigz -c >> {output}""",
                     archive=archive.decode(encoding='utf-8'), memberfile=memberfile,
                     output=stopover)
            try:
                os.remove(memberfile)
            except OSError as err:
                self.logger.warn(err)
    luigi.LocalTarget(stopover).move(self.output().path)
def run(self): shellout( """metha-sync -rm -format cmdi -set dta https://clarin.bbaw.de/oai-dta/""" ) output = shellout( """metha-cat -format cmdi -set dta https://clarin.bbaw.de/oai-dta/ > {output}""" ) luigi.LocalTarget(output).move(self.output().path)
def run(self):
    _, combined = tempfile.mkstemp(prefix='siskin-')
    for target in self.input():
        tmp = shellout("yaz-marcdump -f utf-8 -t utf-8 -i marcxml -o marc {input} > {output}",
                       input=target.fn, ignoremap={5: 'TODO: fix this'})
        shellout("cat {input} >> {output}", input=tmp, output=combined)
    luigi.File(combined).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    with self.input().open() as handle:
        for row in handle.iter_tsv(cols=('path',)):
            shellout("""unzip -p {input} | grep -o 'DB="[^"]*' | sed -e 's/DB="//g' >> {output}""",
                     input=row.path, output=stopover)
    output = shellout("sort -u {input} > {output} && rm {input}", input=stopover)
    luigi.File(output).move(self.output().path)
def merge(targets): """ Helper function to concatenate the outputs for a number of targets. """ _, tf = tempfile.mkstemp(prefix='lab-') for target in targets: shellout("cat {input} >> {output}", input=target.path, output=tf) return tf
def run(self):
    _, output = tempfile.mkstemp(prefix='siskin-')
    with self.input().open() as handle:
        for row in sorted(handle.iter_tsv(cols=('path',))):
            if not str(row.path).endswith('.tar'):
                continue
            shellout("span-import -i elsevier-tar {input} | pigz -c >> {output}",
                     input=row.path, output=output)
    luigi.LocalTarget(output).move(self.output().path)
def run(self):
    _, stopover = tempfile.mkstemp(prefix='siskin-')
    shellout("""LC_ALL=C zgrep -E "http(s)?://.*.crossref.org" {input} >> {output}""",
             input=self.input().path, output=stopover)
    shellout("""LC_ALL=C zgrep -v "^200" {input} >> {output}""",
             input=self.input().path, output=stopover)
    output = shellout("sort -S50% -u {input} | cut -f4 | sed s@http://doi.org/api/handles/@@g > {output}",
                      input=stopover)
    luigi.File(output).move(self.output().path)
def run(self):
    # XXX: Does this really work?
    output = shellout("iconv -f utf-8 -t utf-8 -c {input} > {output}",
                      input=self.input().path)
    output = shellout("flux.sh {flux} in={input} > {output}",
                      flux=self.assets("127/127.flux"), input=output)
    luigi.LocalTarget(output).move(self.output().path)
def run(self): output = shellout(""" span-tag -c {config} {input} > {output} """, config=self.input().get('config').path, input=self.input().get('file').path) output = shellout(""" span-export -o {format} {input} > {output} """, input=output, format=self.format) luigi.LocalTarget(output).move(self.output().path)
def run(self): output = shellout("span-tag -c {config} <(unpigz -c {input}) | pigz -c > {output}", config=self.input().get('config').path, input=self.input().get('file').path) output = shellout("span-export -o {format} <(unpigz -c {input}) | pigz -c > {output}", format=self.format, input=output) luigi.LocalTarget(output).move(self.output().path)
def run(self): """ iterates over the id-file from LODKXPTransform2ldj, searches for the right titledata (de-14) and merges them with the merge_lok_with_tit script. finally, the data gets loaded into the kxp-de14 index """ shellout( """. ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/helperscripts/merge_lok_with_tit.py -selectbody \'{{\"query\": {{\"match\": {{\"852.__.a.keyword\": \"DE-14\"}}}}}}\' -title_server {rawdata_host}/kxp-tit/mrc -local_server {rawdata_host}/kxp-lok/mrc -idfile {date}-lok-ppns.txt | tee data.ldj | esbulk -server {rawdata_host} -index kxp-de14 -type mrc -id 001 -w 1 -verbose && jq -rc \'.\"001\"' data.ldj > ids.txt && rm data.ldj""", rawdata_host=self.config.get("rawdata_host"), date=self.yesterday.strftime("%y%m%d"))
def run(self): output = shellout( """sed -e 's/'$(echo "\o001")'/ /g' < {input} > {output}""", input=self.config.get('khm', 'dump')) # TODO(miku): maybe check, if cleanup is still required. output = shellout("python {script} {input} {output}", script=self.assets("109/109_marcbinary.py"), input=output) luigi.LocalTarget(output).move(self.output().path)
def run(self): """ calls esmarc with the idfile created in LODTITTransform2ldj """ if os.stat("{date}.mrc.bz2".format(date=self.date)).st_size > 0: cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/processing/esmarc.py -z -server {rawdata_host}/finc-main-k10plus/mrc -idfile {date}-ppns.txt -prefix {date}-data".format( **self.config, date=self.date) shellout(cmd) sleep(5)
def run(self):
    endpoint, set = "https://monami.hs-mittweida.de/oai", "institutes:medien"
    shellout("metha-sync -set {set} {endpoint}", set=set, endpoint=endpoint)
    output = shellout("""metha-cat -set "institutes:medien" {endpoint} > {output}""",
                      endpoint=endpoint)
    luigi.LocalTarget(output).move(self.output().path)
def run(self): shellout("""metha-sync -rm -format {format} {endpoint} """, endpoint=self.endpoint, format=self.format) output = shellout( """metha-cat -format {format} {endpoint} > {output}""", endpoint=self.endpoint, format=self.format) luigi.LocalTarget(output).move(self.output().path)
def run(self): shellout( "metha-sync -format {format} http://digi.ub.uni-heidelberg.de/cgi-bin/digioai.cgi", format=self.format) output = shellout( "metha-cat -format {format} -root Records {url} > {output}", format=self.format, url=self.url) luigi.LocalTarget(output).move(self.output().path)
def run(self): """ takes the in LODTransform2ldj created TXT file and gets those records from the elasticsearch node transforms them to JSON-Linked Data with esmarc.py, gzips the files """ cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/processing/esmarc.py -z -server {host}/swb-aut/mrc -idfile {date}-norm-aut-ppns.txt -prefix {date}-aut-data".format( **self.config, date=self.yesterday.strftime("%y%m%d")) shellout(cmd) sleep(5)
def run(self): shellout("""METHA_DIR={dir} metha-sync "{endpoint}" """, dir=self.config.get('core', 'metha-dir'), endpoint=self.endpoint) output = shellout( """METHA_DIR={dir} metha-cat -root Records "{endpoint}" > {output}""", dir=self.config.get('core', 'metha-dir'), endpoint=self.endpoint) luigi.LocalTarget(output).move(self.output().path)
def run(self): """ iterates over the id-file from LODKXPTransform2ldj, gets this set of records from the kxp-de14 index, transforms them to JSON-Linked-Data """ # delete("{rawdata_host}/kxp-tit-{date}".format(**self.config,date=self.yesterday.strftime("%y%m%d"))) # delete("{rawdata_host}/kxp-lok-{date}".format(**self.config,date=self.yesterday.strftime("%y%m%d"))) cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/processing/esmarc.py -z -server {rawdata_host}/kxp-de14/mrc -idfile ids.txt -prefix {date}-kxp".format( **self.config, date=self.yesterday.strftime("%y%m%d")) shellout(cmd) sleep(5)
def run(self): """ fills the local data index and the source title data index with the data transformed in LODKXPTransform2ldj """ for typ in ["tit", "lok"]: # put_dict("{rawdata_host}/kxp-{typ}".format(**self.config,typ=typ,date=self.yesterday.strftime("%y%m%d")),{"mappings":{"mrc":{"date_detection":False}}}) # put_dict("{rawdata_host}/kxp-{typ}/_settings".format(**self.config,typ=typ,date=self.yesterday.strftime("%y%m%d")),{"index.mapping.total_fields.limit":5000}) cmd = "esbulk -z -verbose -server {rawdata_host} -w {workers} -index kxp-{typ} -type mrc -id 001 {date}-{typ}.ldj.gz" "".format( **self.config, typ=typ, date=self.yesterday.strftime("%y%m%d")) shellout(cmd)
def run(self):
    directory = self.config.get('elsevierjournals', 'backlog-dir')
    _, output = tempfile.mkstemp(prefix='siskin-')
    for path in sorted(iterfiles(directory, fun=lambda p: p.endswith('.tar'))):
        shellout("span-import -i elsevier-tar {input} | pigz -c >> {output}",
                 input=path, output=output)
    luigi.LocalTarget(output).move(self.output().path)