Example #1
0
 def run(self):
     """
     Collect `.message.items[]` from every listed file into a single output.

     Every line of the input target is treated as a path; the file behind it
     is decompressed with unpigz, its items are extracted with jq and the
     recompressed result is appended to a temporary file, which is finally
     moved to the output path.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as tmp:
         temp = tmp.name
     with self.input().open() as handle:
         for path in map(str.strip, handle):
             print('processing: %s' % path)
             shellout("jq -r -c '.message.items[]' <(unpigz -c {input}) | pigz -c >> {output}", input=path, output=temp)
     luigi.File(temp).move(self.output().path)
Example #2
0
 def run(self):
     """
     Concatenate all input targets and sort the result.

     The inputs are appended to one temporary file, which is then sorted
     with `sort -k1,1 -k3,3` under `LANG=C`; the sorted file becomes the
     task output.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='tasktree-', delete=False) as tmp:
         combined = tmp.name
     for target in self.input():
         shellout("cat {input} >> {output}", input=target.path,
                  output=combined)
     output = shellout("LANG=C sort -k1,1 -k3,3 {input} > {output}", input=combined)
     luigi.File(output).move(self.output().fn)
Example #3
0
 def run(self):
     """
     Unzip every archive listed in the input into one combined file.

     The input is read as TSV rows of (date, path); only the path column is
     used. The contents of each archive are appended via `unzip -p`.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='tasktree-', delete=False) as tmp:
         combined = tmp.name
     with self.input().open() as handle:
         for row in handle.iter_tsv(cols=('date', 'path')):
             shellout("unzip -p {input} >> {output}", input=row.path,
                      output=combined)
     luigi.File(combined).move(self.output().path)
Example #4
0
File: ceeol.py Project: miku/siskin
 def run(self):
     """
     Convert the article dumps below the configured disk directory and
     append the update from the input target.

     Every `articles/articles_*xml` file under the `ceeol.disk-dir` setting
     is piped through `span-import -i ceeol`; the input file is appended
     as is. Both parts are compressed with pigz into the same file.
     """
     diskdir = self.config.get('ceeol', 'disk-dir')
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     for path in glob.glob(os.path.join(diskdir, 'articles', 'articles_*xml')):
         shellout("span-import -i ceeol {input} | pigz -c >> {output}", input=path, output=stopover)
     shellout("cat {update} | pigz -c >> {output}", update=self.input().path, output=stopover)
     luigi.LocalTarget(stopover).move(self.output().path)
Example #5
0
File: jstor.py Project: zazi/siskin
    def run(self):
        """
        Extract the listed members of each archive into one compressed file.

        The input is read as TSV rows of (archive, member). Consecutive rows
        with the same archive are grouped; for every group the member names
        are written to a temporary file that is handed to `unzippa`. XML
        declarations are stripped from the extracted data before it is
        appended, compressed, to the result.
        """
        # Only the name of the temporary file is needed. Closing the handle
        # avoids leaking the descriptor that tempfile.mkstemp left open.
        with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
            stopover = tmp.name
        with self.input().open() as handle:
            groups = itertools.groupby(
                handle.iter_tsv(cols=('archive', 'member')),
                lambda row: row.archive)

            for archive, items in groups:
                # Write members to extract to temporary file. Creating and
                # writing it through one handle means no descriptor is left
                # open per archive.
                with tempfile.NamedTemporaryFile(mode='w', prefix='siskin-',
                                                 delete=False) as output:
                    memberfile = output.name
                    for item in items:
                        output.write("%s\n" % item.member)

                self.logger.debug("for archive %s extract via: %s", archive,
                                  memberfile)

                if not isinstance(archive, six.string_types):
                    archive = archive.decode(encoding='utf-8')

                try:
                    # The unzippa will not exhaust ARG_MAX.
                    shellout("""unzippa -v -m {memberfile} {archive} |
                            sed -e 's@<?xml version="1.0" encoding="UTF-8"?>@@g' | pigz -c >> {output}""",
                             archive=archive,
                             memberfile=memberfile,
                             output=stopover)
                finally:
                    # Remove the member list even when the extraction fails.
                    try:
                        os.remove(memberfile)
                    except OSError as err:
                        self.logger.warning(err)

            luigi.LocalTarget(stopover).move(self.output().path)
Example #6
0
 def complete(self):
     """
     Compare the identifiers found on disk with those returned for the index.

     Walks `<yymmdd>-kxp/<index>/<file>`, collecting the `identifier` of
     every JSON line and appending the same identifiers (via zcat and jq) to
     `schemaorg-ids-<yymmdd>.txt`. That id file is then passed to
     `esidfilegenerator`; the task counts as complete when both id sets have
     the same, non-zero size.
     """
     stamp = self.yesterday.strftime("%y%m%d")
     path = "{date}-kxp".format(date=stamp)
     if not os.path.exists(path):
         return False

     ids = set()
     for index in os.listdir(path):
         for f in os.listdir(path + "/" + index):
             member = path + "/" + index + "/" + f
             with gzip.open(member, "rt") as inp:
                 for line in inp:
                     ids.add(json.loads(line).get("identifier"))
             shellout(
                 "zcat {fd} | jq -rc .identifier >> schemaorg-ids-{date}.txt".format(
                     fd=member, date=stamp))

     # The configured host value is reduced to the part after the last "/"
     # and split at ":" into host name and port.
     hostport = "{host}".format(**self.config).rsplit("/")[-1].rsplit(":")
     records = esidfilegenerator(
         host=hostport[0],
         port=hostport[1],
         index="slub-resources",
         type="schemaorg",
         idfile="schemaorg-ids-{date}.txt".format(date=stamp),
         source=False)
     es_ids = {record.get("_id") for record in records}
     return len(es_ids) > 0 and len(es_ids) == len(ids)
Example #7
0
 def run(self):
     """
     Write a sorted CSV of DOI and ISSN values taken from the input.

     The 'input' target is decompressed into an intermediate file, from
     which jq emits `.doi`, `rft.issn` and `rft.eissn` values as CSV; the
     result is sorted under `LC_ALL=C`.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     temp = shellout("unpigz -c {input} > {output}", input=self.input().get('input').path)
     try:
         output = shellout("""jq -r '[.doi?, .["rft.issn"][]?, .["rft.eissn"][]?] | @csv' {input} | LC_ALL=C sort -S50% > {output} """,
                           input=temp, output=stopover)
     finally:
         # Drop the decompressed copy even if the jq/sort pipeline fails.
         os.remove(temp)
     luigi.File(output).move(self.output().path)
Example #8
0
 def run(self):
     """
     Concatenate all input targets and pass the result through `marcuniq`.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         combined = tmp.name
     for target in self.input():
         shellout("cat {input} >> {output}", input=target.path,
                  output=combined)
     output = shellout("marcuniq {input} > {output}", input=combined)
     luigi.File(output).move(self.output().path)
Example #9
0
 def run(self):
     """
     Write the unique `.doi` values of the 'input' target, one per line.

     Lines containing "null" are dropped and only the part of each line
     matching `10.*` is kept before the list is deduplicated with `sort -u`.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     shellout("""jq -r '.doi' <(unpigz -c {input}) | grep -v "null" | grep -o "10.*" 2> /dev/null > {output} """,
              input=self.input().get('input').path,
              output=stopover)
     output = shellout("""sort -u {input} > {output} """, input=stopover)
     luigi.LocalTarget(output).move(self.output().path)
Example #10
0
 def run(self):
     """
     Transform the input with the `OAIDCtoMARCXML.xsl` stylesheet and convert
     the resulting MARCXML to MARC with yaz-marcdump.
     """
     source = self.input().path
     stylesheet = self.assets('OAIDCtoMARCXML.xsl')
     # Step 1: apply the stylesheet.
     marcxml = shellout("xsltproc {stylesheet} {input} > {output}",
                        input=source, stylesheet=stylesheet)
     # Step 2: serialize the MARCXML as MARC.
     marc = shellout("yaz-marcdump -i marcxml -o marc {input} > {output}",
                     input=marcxml)
     luigi.File(marc).move(self.output().path)
Example #11
0
 def run(self):
     """
     List the XML members of every archive named in the input.

     For each path read from the input (TSV, single column) the `unzip -l`
     listing is filtered for lines ending in "xml" and written out as
     tab-separated (archive path, fourth listing column) rows.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     with self.input().open() as handle:
         for row in handle.iter_tsv(cols=('path',)):
             shellout(""" unzip -l {input} | grep "xml$" | awk '{{print "{input}\t"$4}}' >> {output} """,
                      preserve_whitespace=True, input=row.path, output=stopover)
     luigi.File(stopover).move(self.output().path)
Example #12
0
File: amsl.py Project: miku/siskin
    def run(self):
        """
        Download, maybe unzip, combine with Gold List.

        The KBART_FREEJOURNALS document is fetched through the configured
        `amsl.uri-download-prefix`. If the download is a zip archive its
        contents are extracted, otherwise it is copied verbatim. Afterwards
        ISSN-like values from the Bielefeld download are appended, each on
        its own line prefixed with two tabs.
        """
        key = "http://amsl.technology/discovery/metadata-usage/Dokument/KBART_FREEJOURNALS"
        link = "%s%s" % (self.config.get('amsl', 'uri-download-prefix'), key)

        downloaded = shellout("curl --fail {link} > {output} ", link=link)

        # A temporary "stopover" file used to be created here but was never
        # used; it only leaked a descriptor and left an empty file behind.

        try:
            # Only probe whether the download is a zip archive and close the
            # archive handle again right away instead of leaking it.
            zipfile.ZipFile(downloaded).close()
        except zipfile.BadZipfile:
            # At least the file is not a zip.
            output = shellout("cat {input} >> {output}", input=downloaded)
        else:
            output = shellout("unzip -p {input} >> {output}", input=downloaded)

        # Include OA list, refs #11579.
        shellout("""curl -s https://pub.uni-bielefeld.de/download/2913654/2913655 | cut -d, -f1,2 | tr -d '"' |
                    grep -E '[0-9]{{4,4}}-[0-9]{{3,3}}[0-9xX]' | tr ',' '\n' |
                    awk '{{ print "\t\t"$0 }}' >> {output}""",
                 output=output,
                 preserve_whitespace=True)

        luigi.LocalTarget(output).move(self.output().path)
Example #13
0
    def run(self):
        """
        Extract and normalize the XML of one database from its archive.

        The archive containing `self.db` is looked up in the fulltext and
        references lists (a match in the references list takes precedence,
        and within a list the last match wins). If the database is not
        listed, an empty output is written. Otherwise `<db>.zip` is pulled
        out of the archive, its XML members are converted from ISO-8859-1 to
        UTF-8, DOCTYPE lines and XML declarations are removed, and origin
        and issue elements are inserted before each closing `</Document>`.
        """
        archive = None

        with self.input().get('dblist-fulltext').open() as handle:
            for row in handle.iter_tsv(cols=('kind', 'path', 'db')):
                if row.db == self.db:
                    archive = row.path

        with self.input().get('dblist-references').open() as handle:
            for row in handle.iter_tsv(cols=('kind', 'path', 'db')):
                if row.db == self.db:
                    archive = row.path

        if archive is None:
            # A non-existent database name amounts to an empty dump file.
            self.logger.debug('no such db: %s', self.db)
            with self.output().open('w'):
                pass
            return

        dbzip = shellout("7z x -so {archive} {db}.zip 2> /dev/null > {output}", archive=archive, db=self.db)
        # Raw string: "\*" and "\!" are meant for the shell. In a normal
        # literal they are invalid Python escape sequences, which newer
        # interpreters warn about. The resulting command text is unchanged.
        output = shellout(r"""unzip -p {dbzip} \*.xml 2> /dev/null |
                             iconv -f iso-8859-1 -t utf-8 |
                             LC_ALL=C grep -v "^<\!DOCTYPE GENIOS PUBLIC" |
                             LC_ALL=C sed -e 's@<?xml version="1.0" encoding="ISO-8859-1" ?>@@g' |
                             LC_ALL=C sed -e 's@</Document>@<x-origin>{origin}</x-origin><x-issue>{issue}</x-issue></Document>@' |
                             pigz -c >> {output} """, dbzip=dbzip, origin=archive, issue=self.issue)

        luigi.File(output).move(self.output().path)
Example #14
0
    def complete(self):
        """
        Report whether the local checkout matches `origin/<branch>`.

        Returns False when the output does not exist, when `git fetch`
        printed three or more lines, or when `git rev-parse` yields
        different first lines for the local and the remote branch.
        """
        if not self.output().exists():
            return False

        with chdir(str(self.output().path)):
            fetchlog = shellout("""git fetch origin {branch} > {output} 2>&1""",
                                branch=self.branch)

            # NOTE(review): three or more lines of fetch output are taken as
            # "something was fetched" - confirm against the git version used.
            with open(fetchlog, 'rb') as fh:
                uptodate = len(fh.readlines()) < 3

            localrev = shellout(
                """git rev-parse {branch} > {output} 2>&1""",
                branch=self.branch)
            remoterev = shellout(
                """git rev-parse origin/{branch} > {output} 2>&1""",
                branch=self.branch)

            # The fallback values differ ("0" vs "1"), so the comparison
            # fails whenever both fallbacks are returned.
            if getfirstline(localrev, "0") != getfirstline(remoterev, "1"):
                uptodate = False

        return uptodate
Example #15
0
File: zdb.py Project: zazi/siskin
 def run(self):
     """
     @IHBhY2thZ2UgbWFpbgoK@IGltcG9ydCAoCiAg@@ImVuY29kaW5nL2pzb24iCiAg@@ImVuY29kaW5nL3htbCIK@@IC
     AibG9nIgog@@ICJvcyIKCiAg@@InN0cmluZ3MiCgog@@ICJnaXRodWIuY29tL21pa3UveG1sc3RyZWFtIgog@KQoK@
     IHR5cGUgRGVzY3JpcHRpb24gc3RydWN0IHsK@@ICBYTUxOYW1l@IHhtbC5OYW1lIGB4bWw6IkRlc2NyaXB0aW9uImA
     K@@ICBJc3Nu@@IFtdc3RyaW5nIGB4bWw6Imlzc24iYAog@@IFNob3J0VGl0bGUgW11zdHJpbmcgYHhtbDoic2hvcnR
     UaXRsZSJgCiAg@@VGl0bGUg@ICBbXXN0cmluZyBgeG1sOiJ0aXRsZSJgCiAgICB9Cgog@ZnVuYyBtYWluKCkgewog@
     @IHNtIDo9IG1ha2UobWFwW3N0cmluZ11zdHJpbmcpCiAg@@c2Nhbm5lciA6PSB4bWxzdHJlYW0uTmV3U2Nhbm5lcih
     vcy5TdGRpbiwgbmV3KERlc2NyaXB0aW9uKSkK@@ICBmb3Igc2Nhbm5lci5TY2FuKCkgewog@@@ICB0YWcgOj0gc2Nh
     bm5lci5FbGVtZW50KCkK@@@@aWYgdiwgb2sgOj0gdGFnLigqRGVzY3JpcHRpb24pOyBvayB7CiAg@@@@ICBpZiBsZW
     4odi5TaG9ydFRpdGxlKSA9PSAwIHsK@@@@@@ICBjb250aW51ZQog@@@@@fQog@@@@@Zm9yIF8sIHMgOj0gcmFuZ2Ug
     di5TaG9ydFRpdGxlIHsK@@@@@@ICBmb3IgXywgdCA6PSByYW5nZSB2LlRpdGxlIHsK@@@@@@@@c21bc10gPSB0CiAg
     @@@@@@@IHNtW3N0cmluZ3MuVG9Mb3dlcihzKV0gPSB0CiAg@@@@@@fQog@@@@@fQog@@@ICB9CiAg@@fQog@@IGlmI
     GVyciA6PSBzY2FubmVyLkVycigpOyBlcnIgIT0gbmlsIHsK@@@@bG9nLkZhdGFsKGVycikK@@ICB9CiAg@@aWYgZXJ
     yIDo9IGpzb24uTmV3RW5jb2Rlcihvcy5TdGRvdXQpLkVuY29kZShzbSk7IGVyciAhPSBuaWwgewog@@@ICBsb2cuRm
     F0YWwoZXJyKQog@@IH0K@IH0K
     """
     # The docstring above is data, not documentation: it is a base64
     # payload in which "@" abbreviates "ICAg". It must stay byte-identical.
     # Newlines and spaces are stripped and "@" is expanded before the text
     # is decoded into a `.go` file, which is then executed with `go run`
     # on the decompressed input.
     source = self.run.__doc__.replace("\n",
                                       "").replace(" ",
                                                   "").replace("@", "ICAg")
     tempcode = shellout("""echo '{code}' | base64 -d > {output}.go """,
                         code=source,
                         preserve_whitespace=True)
     try:
         output = shellout(
             """ unpigz -c {input} | go run {code}.go > {output} """,
             code=tempcode,
             input=self.input().path)
     finally:
         # `tempcode` names the placeholder file; the program itself was
         # written to `tempcode + ".go"` and used to be left behind.
         os.remove(tempcode)
         os.remove(tempcode + ".go")
     luigi.LocalTarget(output).move(self.output().path)
Example #16
0
 def run(self):
     """
     Concatenate all input targets into the output file.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as tmp:
         tmpfile = tmp.name
     for target in self.input():
         shellout("cat {input} >> {output}",
                  input=target.path,
                  output=tmpfile)
     luigi.File(tmpfile).move(self.output().path)
Example #17
0
 def run(self):
     """
     Concatenate all input targets into the output file.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         output = tmp.name
     for target in self.input():
         shellout("cat {input} >> {output}",
                  input=target.path,
                  output=output)
     luigi.LocalTarget(output).move(self.output().path)
Example #18
0
 def run(self):
     """
     Sync the files matching the `lfer.glob` setting into the output
     directory and write the sorted list of files found there as TSV.
     """
     destination = os.path.dirname(self.output().path)
     # os.path.join with a single argument returns the configured value.
     source = os.path.join(config.get('lfer', 'glob'))
     shellout("rsync -avz {src} {target}", src=source, target=destination)
     with self.output().open('w') as listing:
         for found in sorted(iterfiles(destination)):
             listing.write_tsv(found)
Example #19
0
    def run(self):
        """
        Download, maybe unzip, combine with Gold List.

        The KBART_FREEJOURNALS document is fetched through the configured
        `amsl.uri-download-prefix`. If the download is a zip archive its
        contents are extracted, otherwise it is copied verbatim. Afterwards
        ISSN-like values from the Bielefeld download are appended, each on
        its own line prefixed with two tabs.
        """
        key = "http://amsl.technology/discovery/metadata-usage/Dokument/KBART_FREEJOURNALS"
        link = "%s%s" % (self.config.get('amsl', 'uri-download-prefix'), key)

        downloaded = shellout("curl --fail {link} > {output} ", link=link)

        # A temporary "stopover" file used to be created here but was never
        # used; it only leaked a descriptor and left an empty file behind.

        try:
            # Only probe whether the download is a zip archive and close the
            # archive handle again right away instead of leaking it.
            zipfile.ZipFile(downloaded).close()
        except zipfile.BadZipfile:
            # At least the file is not a zip.
            output = shellout("cat {input} >> {output}", input=downloaded)
        else:
            output = shellout("unzip -p {input} >> {output}", input=downloaded)

        # Include OA list, refs #11579.
        shellout(
            """curl -s https://pub.uni-bielefeld.de/download/2913654/2913655 | cut -d, -f1,2 | tr -d '"' |
                    grep -E '[0-9]{{4,4}}-[0-9]{{3,3}}[0-9xX]' | tr ',' '\n' |
                    awk '{{ print "\t\t"$0 }}' >> {output}""",
            output=output,
            preserve_whitespace=True)

        luigi.LocalTarget(output).move(self.output().path)
Example #20
0
    def run(self):
        """ The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced.

        The mirror lives in a directory next to the output file whose name
        is the SHA-1 hex digest of "host:username:base:pattern". After lftp
        has mirrored into it, every file found there is written to the
        output, one path per row.
        """
        base = os.path.dirname(self.output().path)
        key = '{host}:{username}:{base}:{pattern}'.format(
            host=self.host,
            username=self.username,
            base=self.base,
            pattern=self.pattern)
        # hashlib only accepts bytes on Python 3. Encoding text keeps the
        # digest - and thereby existing mirror directories - unchanged for
        # values that hashed successfully before.
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        subdir = hashlib.sha1(key).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        command = """lftp -u {username},{password}
        -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0
        --only-newer -I {pattern} {base} {target}; exit" {host}"""

        shellout(command,
                 host=self.host,
                 username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base))

        with self.output().open('w') as output:
            for path in iterfiles(target):
                logger.debug("Mirrored: %s", path)
                output.write_tsv(path)
Example #21
0
 def run(self):
     """
     Sync the `nationallizenzen` directory below the configured
     `core.swb-mirror` into the output directory and list the files found
     there, one per row.
     """
     mirror_root = config.get('core', 'swb-mirror')
     origin = os.path.join(mirror_root, 'nationallizenzen')
     destination = os.path.dirname(self.output().path)
     shellout("rsync -avz {source} {target}", source=origin, target=destination)
     with self.output().open('w') as listing:
         for found in iterfiles(destination):
             listing.write_tsv(found)
Example #22
0
File: ceeol.py Project: miku/siskin
 def run(self):
     """
     Convert every file listed in the `ceeol.updates` setting.

     The setting is a comma-separated list of paths; blank entries are
     skipped. Each file is run through `span-import -i ceeol-marcxml` and
     the results are appended to one output file.
     """
     paths = [p.strip() for p in self.config.get("ceeol", "updates").split(",") if p.strip()]
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix="siskin-", delete=False) as tmp:
         stopover = tmp.name
     self.logger.debug("found %d updates", len(paths))
     for p in paths:
         shellout("span-import -i ceeol-marcxml {input} >> {output}", input=p, output=stopover)
     luigi.LocalTarget(stopover).move(self.output().path)
Example #23
0
    def run(self):
        """
        Download and concatenate the KBART holding files for `self.isil`.

        The input is a JSON list of holdings. For every holding whose ISIL
        matches and whose document URI contains "kbart" (case-insensitive),
        the file is downloaded through the configured
        `amsl.uri-download-prefix` and appended to the result - extracted
        first if it is a zip archive.

        Raises:
            RuntimeError: if a matching holding lacks the `DokumentURI` key.
        """
        with self.input().open() as handle:
            holdings = json.load(handle)

        # Only the name of the temporary file is needed. Closing the handle
        # avoids leaking the descriptor that tempfile.mkstemp left open.
        with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
            stopover = tmp.name

        # The property which contains the URI of the holding file. Might change.
        urikey = 'DokumentURI'

        for holding in holdings:
            if holding["ISIL"] != self.isil:
                continue

            if urikey not in holding:
                raise RuntimeError('possible AMSL API change, expected: %s, available keys: %s' % (urikey, holding.keys()))

            # refs. #7142
            if 'kbart' not in holding[urikey].lower():
                self.logger.debug("skipping non-KBART holding URI: %s", holding[urikey])
                continue

            link = "%s%s" % (self.config.get('amsl', 'uri-download-prefix'), holding[urikey])
            downloaded = shellout("curl --fail {link} > {output} ", link=link)
            try:
                # Only probe whether the download is a zip archive and close
                # the archive handle again instead of leaking it.
                zipfile.ZipFile(downloaded).close()
            except zipfile.BadZipfile:
                # at least the file is not a zip.
                shellout("cat {input} >> {output}", input=downloaded, output=stopover)
            else:
                shellout("unzip -p {input} >> {output}", input=downloaded, output=stopover)

        luigi.File(stopover).move(self.output().path)
Example #24
0
File: ceeol.py Project: zazi/siskin
    def run(self):
        """
        Convert the configured MARCXML and MARC updates into one file.

        Files listed in `ceeol.updates` are run through
        `span-import -i ceeol-marcxml`; files listed in `ceeol.updates-marc`
        are converted with `convert_ceeol_to_intermediate_schema` and
        appended as one JSON document per line. Both settings are
        comma-separated lists in which blank entries are skipped.
        """
        paths = [
            p.strip() for p in self.config.get("ceeol", "updates").split(",")
            if p.strip()
        ]
        # Only the name of the temporary file is needed. Closing the handle
        # avoids leaking the descriptor that tempfile.mkstemp left open.
        with tempfile.NamedTemporaryFile(prefix="siskin-", delete=False) as tmp:
            stopover = tmp.name
        self.logger.debug("found %d updates", len(paths))
        for p in paths:
            shellout("span-import -i ceeol-marcxml {input} >> {output}",
                     input=p,
                     output=stopover)

        # Append MARC updates. The same file is reused on purpose: creating
        # a second temporary file here used to discard everything written
        # by the loop above.
        paths = [
            p.strip()
            for p in self.config.get("ceeol", "updates-marc").split(",")
            if p.strip()
        ]
        self.logger.debug("found %d updates (MARC)", len(paths))
        with open(stopover, 'a') as output:
            for p in paths:
                self.logger.debug("converting: %s", p)
                for doc in convert_ceeol_to_intermediate_schema(p):
                    output.write(json.dumps(doc) + "\n")
        luigi.LocalTarget(stopover).move(self.output().path)
Example #25
0
 def run(self):
     """
     Run the `30/30_marcbinary.py` asset on the input and convert its
     output to MARCXML with yaz-marcdump.
     """
     script = self.assets('30/30_marcbinary.py')
     source = self.input().path
     # Step 1: the script receives input and output path as arguments.
     binary = shellout("""python {script} {input} {output}""",
                       script=script, input=source)
     # Step 2: serialize as MARCXML.
     marcxml = shellout("""yaz-marcdump -o marcxml {input} > {output}""",
                        input=binary)
     luigi.LocalTarget(marcxml).move(self.output().path)
Example #26
0
 def run(self):
     """
     Enrich and index every file below `<yymmdd>-aut-data/<index>/`.

     Each file is piped through the entityfacts, GND subject and wikidata
     enrichment scripts - plus the geonames script when the index is
     "geo" - and is then bulk-indexed with esbulk into the index named
     after its directory. Finally the processed date is stored via
     `put_dict`.
     """
     tools = "~/git/efre-lod-elasticsearch-tools"
     path = "{date}-aut-data".format(date=self.yesterday.strftime("%y%m%d"))
     for index in os.listdir(path):
         # doing several enrichment things before indexing the data
         for f in os.listdir(path + "/" + index):
             # with -pipeline, all the data gets through, not only
             # enriched docs
             stages = [
                 ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && zcat {fd} | ".format(
                     fd=path + "/" + index + "/" + f),
                 (tools + "/enrichment/entityfacts-bot.py   -pipeline -stdin -searchserver {host} | ").format(
                     **self.config),
                 (tools + "/enrichment/gnd-sachgruppen.py   -pipeline -stdin -searchserver {host} | ").format(
                     **self.config),
                 tools + "/enrichment/wikidata.py          -pipeline -stdin | ",
             ]
             if index == "geo":
                 stages.append(
                     (tools + "/enrichment/geonames.py       -pipeline -stdin -searchserver {geonames_host} | ").format(
                         **self.config))
             stages.append(
                 "esbulk -verbose -server {host} -w 1 -size 20 -index {index} -type schemaorg -id identifier".format(
                     **self.config, index=index))
             shellout("".join(stages))
     put_dict("{host}/date/actual/1".format(**self.config),
              {"date": str(self.yesterday.strftime("%Y-%m-%d"))})
Example #27
0
    def run(self):
        """
        TODO(miku): This contains things, that would better be factored out in separate tasks.

        Writes one row (status, issn, title) per ISSN found in the holdings
        file, where status is COVERED if the ISSN also appears in the input
        and NOT_COVERED otherwise.
        """
        # ISSN -> title, as dumped by span-gh-dump.
        dump = shellout("span-gh-dump {hfile} > {output}", hfile=self.hfile)
        with luigi.File(dump, format=TSV).open() as handle:
            titles = {row.issn: row.title
                      for row in handle.iter_tsv(cols=('issn', 'title'))}

        # All ISSN values found in the holdings file.
        listing = shellout("xmlstarlet sel -t -v '//issn' {hfile} | sort | uniq > {output}", hfile=self.hfile)
        with luigi.File(listing, format=TSV).open() as handle:
            issns_held = {row.issn for row in handle.iter_tsv(cols=('issn',))}

        with self.input().open() as handle:
            issns_crossref = {row.issn for row in handle.iter_tsv(cols=('issn',))}

        covered = issns_held & issns_crossref

        with self.output().open('w') as report:
            for issn in issns_held:
                status = "COVERED" if issn in covered else "NOT_COVERED"
                report.write_tsv(status, issn, titles[issn])
Example #28
0
    def run(self):
        """ The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced.

        The mirror lives in a directory next to the output file whose name
        is the SHA-1 hex digest of "host:username:base:pattern". After lftp
        has mirrored into it, every file found there is written to the
        output, one path per row.
        """
        base = os.path.dirname(self.output().path)
        key = '{host}:{username}:{base}:{pattern}'.format(
            host=self.host, username=self.username, base=self.base,
            pattern=self.pattern)
        # hashlib only accepts bytes on Python 3. Encoding text keeps the
        # digest - and thereby existing mirror directories - unchanged for
        # values that hashed successfully before.
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        subdir = hashlib.sha1(key).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        command = """lftp -u {username},{password}
        -e "set net:max-retries 5; set net:timeout 10; mirror --verbose=0
        --only-newer -I {pattern} {base} {target}; exit" {host}"""

        shellout(command, host=self.host, username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base))

        with self.output().open('w') as output:
            for path in iterfiles(target):
                logger.debug("Mirrored: %s", path)
                output.write_tsv(path)
Example #29
0
 def run(self):
     """
     Combine the 'references' and 'fulltext' inputs into one file, adding a
     trailing tab-separated column that names the list each line came from.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     shellout(r"""cat {input} | awk '{{ print $0"\treferences"}}' >> {output}""",
              input=self.input().get('references').path, output=stopover)
     shellout(r"""cat {input} | awk '{{ print $0"\tfulltext"}}' >> {output}""",
              input=self.input().get('fulltext').path, output=stopover)
     luigi.File(stopover).move(self.output().path)
Example #30
0
File: app.py Project: miku/pkpy
def pypi_build(name, target='deb'):
    """ Take a package name and return the filename of the target.

    The package is built with fpm in a temporary directory, copied to
    PACKAGE_CACHE and remembered in the CACHE shelf under a key derived from
    (name, target); later calls with the same arguments return the cached
    filename. If the build command fails with RuntimeError, the error is
    logged and the request is aborted with 404.
    """
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    raw_key = '%s:%s' % (name, target)
    # hashlib only accepts bytes on Python 3.
    if not isinstance(raw_key, bytes):
        raw_key = raw_key.encode('utf-8')
    cache_key = hashlib.sha1(raw_key).hexdigest()

    cache = shelve.open(CACHE)
    try:
        if cache_key not in cache:
            logger.debug('Building %s for %s...', target, name)
            stopover = tempfile.mkdtemp(prefix='pkpy-')
            try:
                # name and target end up in a shell command line, so they
                # are quoted to keep them from being interpreted as shell
                # syntax.
                shellout("""cd {stopover} &&
                            fpm --verbose -s python -t {target} {name}""",
                         stopover=stopover, name=quote(name),
                         target=quote(target))
                # next() works on Python 2 and 3, unlike the .next() method.
                src = next(iterfiles(stopover))
                basename = os.path.basename(src)
                dst = os.path.join(PACKAGE_CACHE, basename)
                shutil.copyfile(src, dst)
                cache[cache_key] = basename
            except RuntimeError as err:
                logger.error(err)
                return abort(404)
            finally:
                # Remove the build directory on success and on failure.
                shutil.rmtree(stopover, ignore_errors=True)
        else:
            logger.debug('Cache hit...')

        return cache[cache_key]
    finally:
        # Close the shelf on every exit path, including errors.
        cache.close()
Example #31
0
File: app.py Project: miku/pkpy
def github_clone_and_build(username, repo, target='deb'):
    """ Clone a repo (username, repo) and build the `target` package.
    Returns the filename, that is placed directly under `static`.

    The result is remembered in the CACHE shelf under a key derived from the
    repository URL and target, so repeated calls return the cached filename
    without rebuilding.
    """
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    # The address had been mangled into "[email protected]" by e-mail
    # obfuscation; this is the SSH clone URL form used by GitHub.
    repo_url = 'git@github.com:%s/%s.git' % (username, repo)

    raw_key = '%s:%s' % (repo_url, target)
    # hashlib only accepts bytes on Python 3.
    if not isinstance(raw_key, bytes):
        raw_key = raw_key.encode('utf-8')
    cache_key = hashlib.sha1(raw_key).hexdigest()

    cache = shelve.open(CACHE)
    try:
        if cache_key not in cache:
            logger.debug('Building (%s, %s) ...', repo_url, target)
            stopover = tempfile.mkdtemp(prefix='pkpy-')
            try:
                # Caller-supplied values are quoted before they are placed
                # on the shell command line.
                shellout("""
            cd {stopover} && git clone {repo_url} &&
            cd {repo} && fpm --verbose -s python -t {target} .""",
                         stopover=stopover, repo_url=quote(repo_url),
                         repo=quote(repo), target=quote(target))
                # next() works on Python 2 and 3, unlike the .next() method.
                src = next(iterfiles(stopover,
                                     fun=lambda fn: fn.endswith(target)))
                basename = os.path.basename(src)
                dst = os.path.join(PACKAGE_CACHE, basename)
                shutil.copyfile(src, dst)
                cache[cache_key] = basename
            finally:
                # Remove the build directory on success and on failure.
                shutil.rmtree(stopover, ignore_errors=True)
        else:
            logger.debug('Cache hit...')
        return cache[cache_key]
    finally:
        # Close the shelf on every exit path, including errors.
        cache.close()
Example #32
0
 def run(self):
     """
     Initialize a cayley database at a temporary path, load the 'ntriples'
     input into it and move the database file to the output path.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         dbpath = tmp.name
     shellout("cayley init -alsologtostderr -config {config} -dbpath={dbpath}",
              config=self.assets('cayley.bolt.conf'), dbpath=dbpath)
     shellout("GOMAXPROCS={gomaxprocs} cayley load -config {config} -alsologtostderr -dbpath={dbpath} --triples {input}",
              gomaxprocs=self.gomaxprocs, config=self.assets('cayley.bolt.conf'), dbpath=dbpath, input=self.input().get('ntriples').path)
     shutil.move(dbpath, self.output().path)
Example #33
0
    def run(self):
        """
        Convert the GROBID TEI files of one crawl item to JSON.

        Writes one JSON file per `*.tei.xml` into the item's `grobid_json`
        directory, then produces a single file of header-only records
        (`--no-encumbered`) for the whole item, which becomes the output.
        """
        tei_dir = "work/{}/{}/grobid_tei".format(self.crawl, self.item)
        out_dir = "work/{}/{}/grobid_json".format(self.crawl, self.item)
        shellout("mkdir -p {json_dir}", json_dir=out_dir)

        # Generate fulltext json (one file per PDF)
        for tei_path in glob.glob(tei_dir + "/*.tei.xml"):
            json_path = tei_path.replace('/grobid_tei/', '/grobid_json/')
            json_path = json_path.replace('.tei.xml', '.json')
            # jq in the pipeline validates JSON
            shellout("""
                bin/grobid2json.py {tei_path}
                    | jq -c .
                    > {json_path}""",
                tei_path=tei_path,
                json_path=json_path)

        # Just the header info (one json file for the whole item)
        # jq in the pipeline validates JSON
        headers = shellout("""
            cd {grobid_dir}
            && find . -name "*.tei.xml"
                | parallel -j 4 ../../../../bin/grobid2json.py --no-encumbered {{}}
                | jq -c .
                > {output}""",
            grobid_dir=tei_dir)

        luigi.LocalTarget(headers).move(self.output().path)
 def run(self):
     """
     Transform the geonames TSV dump to gzip-compressed, line-delimited JSON.

     Reads `<file>.txt` and writes `<file>.ldj.gz`, where `file` comes from
     the task configuration.
     """
     template = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/helperscripts/tsv2json.py {file}.txt | gzip > {file}.ldj.gz"
     shellout(template.format(**self.config))
Example #35
0
 def run(self):
     """
     Sync the configured `thieme.oai` endpoint with metha-sync and write
     the records returned by metha-cat, compressed with pigz, to the output.
     """
     url = self.config.get('thieme', 'oai')
     metha_dir = self.config.get('core', 'metha-dir')
     # Harvest the set in the requested format.
     shellout("METHA_DIR={dir} metha-sync -set {set} -format {prefix} {url}",
              dir=metha_dir, set=self.set, prefix=self.prefix, url=url)
     # Dump the harvested records.
     dump = shellout("METHA_DIR={dir} metha-cat -format {prefix} {url} | pigz -c > {output}",
                     dir=metha_dir, prefix=self.prefix, url=url)
     luigi.File(dump).move(self.output().path)
Example #36
0
    def run(self):
        """
        Run GROBID `processFullText` over the PDFs of one crawl item.

        TEI output goes to the item's `grobid_tei` directory; the task
        output is the report written by `/usr/bin/time -v`.
        """
        item_root = "work/{}/{}".format(self.crawl, self.item)
        pdf_dir = item_root + "/pdfs"
        grobid_dir = item_root + "/grobid_tei"
        shellout("mkdir -p {grobid_dir}", grobid_dir=grobid_dir)

        params = dict(pdf_dir=pdf_dir,
                      grobid_dir=grobid_dir,
                      GROBID_JAR=GROBID_JAR,
                      GROBID_HOME=GROBID_HOME)
        report = shellout("""
            /usr/bin/time -v -o {output}
            java
                -Xmx6G
                -jar {GROBID_JAR}
                -gH {GROBID_HOME}
                -dIn {pdf_dir}
                -r
                -dOut {grobid_dir}
                -exe processFullText""", **params)

        # java/GROBID doesn't error when it runs. One way to check that it ran
        # is that there should be a TEI file for every file in PDF dir.
        #assert(glob_count(pdf_dir + "/*.pdf") == glob_count(grobid_dir + "/*.tei.xml"))

        luigi.LocalTarget(report).move(self.output().path)
Example #37
0
 def run(self):
     """
     Download parts 0 to 26 of `self.template` and concatenate them.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     for i in range(27):
         url = self.template.format(part=i)
         shellout("""curl --retry 1 --compress "{url}" >> {output}""",
                  url=url, output=stopover)
     luigi.File(stopover).move(self.output().path)
Example #38
0
File: jstor.py Project: miku/siskin
    def run(self):
        """
        Extract the listed members of each archive into one compressed file.

        The input is read as TSV rows of (archive, member). Consecutive rows
        with the same archive are grouped; for every group the member names
        are written to a temporary file that is handed to `unzippa`. XML
        declarations are stripped from the extracted data before it is
        appended, compressed, to the result.
        """
        # Only the name of the temporary file is needed. Closing the handle
        # avoids leaking the descriptor that tempfile.mkstemp left open.
        with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
            stopover = tmp.name
        with self.input().open() as handle:
            groups = itertools.groupby(handle.iter_tsv(cols=('archive', 'member')), lambda row: row.archive)

            for archive, items in groups:
                # Write members to extract to temporary file. Creating and
                # writing it through one handle means no descriptor is left
                # open per archive.
                with tempfile.NamedTemporaryFile(mode='w', prefix='siskin-',
                                                 delete=False) as output:
                    memberfile = output.name
                    for item in items:
                        output.write("%s\n" % item.member)

                self.logger.debug("for archive %s extract via: %s", archive, memberfile)

                # Only byte strings have to be decoded; calling .decode on a
                # Python 3 str raised AttributeError.
                if isinstance(archive, bytes):
                    archive = archive.decode(encoding='utf-8')

                try:
                    # The unzippa will not exhaust ARG_MAX.
                    shellout("""unzippa -v -m {memberfile} {archive} |
                            sed -e 's@<?xml version="1.0" encoding="UTF-8"?>@@g' | pigz -c >> {output}""",
                             archive=archive,
                             memberfile=memberfile,
                             output=stopover)
                finally:
                    # Remove the member list even when the extraction fails.
                    try:
                        os.remove(memberfile)
                    except OSError as err:
                        self.logger.warning(err)

            luigi.LocalTarget(stopover).move(self.output().path)
Example #39
0
File: dta.py Project: zazi/siskin
 def run(self):
     """
     Sync the `dta` set of the CLARIN OAI endpoint in `cmdi` format and
     write the records returned by metha-cat to the output.
     """
     sync_command = """metha-sync -rm -format cmdi -set dta https://clarin.bbaw.de/oai-dta/"""
     cat_command = """metha-cat -format cmdi -set dta https://clarin.bbaw.de/oai-dta/ > {output}"""
     shellout(sync_command)
     dump = shellout(cat_command)
     luigi.LocalTarget(dump).move(self.output().path)
Example #40
0
 def run(self):
     """
     Convert every MARCXML input to MARC with yaz-marcdump and concatenate
     the converted files.
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as handle:
         combined = handle.name
     for target in self.input():
         tmp = shellout("""yaz-marcdump -f utf-8 -t utf-8 -i
                           marcxml -o marc {input} > {output}""",
                           input=target.fn, ignoremap={5: 'TODO: fix this'})
         shellout("cat {input} >> {output}", input=tmp, output=combined)
     luigi.File(combined).move(self.output().path)
Example #41
0
 def run(self):
     """
     Collect the distinct `DB="..."` attribute values from all archives
     listed in the input (TSV, single path column).
     """
     # Only the name of the temporary file is needed. Closing the handle here
     # avoids leaking the descriptor that tempfile.mkstemp used to leave open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
         stopover = tmp.name
     with self.input().open() as handle:
         for row in handle.iter_tsv(cols=('path',)):
             shellout(""" unzip -p {input} | grep -o 'DB="[^"]*' | sed -e 's/DB="//g' >> {output} """,
                      input=row.path, output=stopover)
     output = shellout("sort -u {input} > {output} && rm {input}", input=stopover)
     luigi.File(output).move(self.output().path)
Example #42
0
File: s06.py Project: miku/siskin
def merge(targets):
    """
    Helper function to concatenate the outputs for a number of targets.

    Every target must expose a ``path`` attribute. Returns the path of a
    newly created temporary file holding the concatenation; the file is
    empty when ``targets`` is empty. The caller is responsible for moving or
    removing the returned file.
    """
    # Only the path is needed. NamedTemporaryFile closes its descriptor when
    # the with block exits, whereas a bare mkstemp() leaves it open.
    with tempfile.NamedTemporaryFile(prefix='lab-', delete=False) as tmpfile:
        tf = tmpfile.name
    for target in targets:
        shellout("cat {input} >> {output}", input=target.path, output=tf)
    return tf
Example #43
0
 def run(self):
     """
     Run span-import (elsevier-tar) on every path from the input list that
     ends in .tar, in sorted order, appending the compressed results to a
     single file that is finally moved to the task output.
     """
     # Only the path is needed. NamedTemporaryFile closes its descriptor when
     # the with block exits, whereas a bare mkstemp() leaves it open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmpfile:
         output = tmpfile.name
     with self.input().open() as handle:
         for row in sorted(handle.iter_tsv(cols=('path', ))):
             if not str(row.path).endswith('.tar'):
                 continue
             shellout("span-import -i elsevier-tar {input} | pigz -c >> {output}", input=row.path, output=output)
     luigi.LocalTarget(output).move(self.output().path)
Example #44
0
 def run(self):
     """
     Collect two groups of lines from the compressed input: lines matching a
     crossref.org URL and lines not starting with "200". The combined lines
     are deduplicated, reduced to their fourth column, stripped of the
     http://doi.org/api/handles/ prefix and moved to the task output.
     """
     # Only the path is needed. NamedTemporaryFile closes its descriptor when
     # the with block exits, whereas a bare mkstemp() leaves it open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmpfile:
         stopover = tmpfile.name
     shellout("""LC_ALL=C zgrep -E "http(s)?://.*.crossref.org" {input} >> {output}""",
              input=self.input().path, output=stopover)
     shellout("""LC_ALL=C zgrep -v "^200" {input} >> {output}""",
              input=self.input().path, output=stopover)
     output = shellout("sort -S50% -u {input} | cut -f4 | sed s@http://doi.org/api/handles/@@g > {output}", input=stopover)
     # luigi.File is a deprecated alias of luigi.LocalTarget.
     luigi.LocalTarget(output).move(self.output().path)
Example #45
0
 def run(self):
     """
     Pass the input through iconv (utf-8 to utf-8 with -c), run flux.sh with
     the 127/127.flux asset on the result and move the final file to the
     task output.
     """
     # XXX: Does this really work?
     cleaned = shellout("""iconv -f utf-8 -t utf-8 -c {input} > {output}""",
                        input=self.input().path)

     flux_file = self.assets("127/127.flux")
     converted = shellout("""flux.sh {flux} in={input} > {output}""",
                          flux=flux_file,
                          input=cleaned)

     luigi.LocalTarget(converted).move(self.output().path)
Example #46
0
 def run(self):
     """
     Run span-tag on the 'file' input using the 'config' input, then run
     span-export in the task's format on the tagged file and move the export
     to the task output.
     """
     inputs = self.input()
     config_path = inputs.get('config').path
     file_path = inputs.get('file').path

     tagged = shellout(""" span-tag -c {config} {input} > {output} """,
                       config=config_path,
                       input=file_path)
     exported = shellout(""" span-export -o {format} {input} > {output} """,
                         input=tagged,
                         format=self.format)

     luigi.LocalTarget(exported).move(self.output().path)
Example #47
0
File: x06.py Project: zazi/siskin
def merge(targets):
    """
    Helper function to concatenate the outputs for a number of targets.

    Every target must expose a ``path`` attribute. Returns the path of a
    newly created temporary file holding the concatenation; the file is
    empty when ``targets`` is empty. The caller is responsible for moving or
    removing the returned file.
    """
    # Only the path is needed. NamedTemporaryFile closes its descriptor when
    # the with block exits, whereas a bare mkstemp() leaves it open.
    with tempfile.NamedTemporaryFile(prefix='lab-', delete=False) as tmpfile:
        tf = tmpfile.name
    for target in targets:
        shellout("cat {input} >> {output}", input=target.path, output=tf)
    return tf
Example #48
0
 def run(self):
     """
     Same two-step pipeline as span-tag followed by span-export, but both
     the intermediate and the final file are pigz-compressed: inputs are
     decompressed on the fly with unpigz and results piped through pigz.
     """
     inputs = self.input()
     config_path = inputs.get('config').path
     file_path = inputs.get('file').path

     tagged = shellout("span-tag -c {config} <(unpigz -c {input}) | pigz -c > {output}",
                       config=config_path,
                       input=file_path)
     exported = shellout("span-export -o {format} <(unpigz -c {input}) | pigz -c > {output}",
                         format=self.format,
                         input=tagged)

     luigi.LocalTarget(exported).move(self.output().path)
Example #49
0
 def run(self):
     """
     Run merge_lok_with_tit.py over the {date}-lok-ppns.txt id file
     (selecting DE-14 records), bulk-load the merged records into the
     kxp-de14 index with esbulk, write the "001" field of every record to
     ids.txt and remove the intermediate data.ldj file.
     """
     command = """. ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/helperscripts/merge_lok_with_tit.py -selectbody \'{{\"query\": {{\"match\": {{\"852.__.a.keyword\": \"DE-14\"}}}}}}\' -title_server {rawdata_host}/kxp-tit/mrc -local_server {rawdata_host}/kxp-lok/mrc -idfile {date}-lok-ppns.txt | tee data.ldj | esbulk -server {rawdata_host} -index kxp-de14 -type mrc -id 001 -w 1 -verbose && jq -rc \'.\"001\"' data.ldj > ids.txt && rm data.ldj"""
     host = self.config.get("rawdata_host")
     stamp = self.yesterday.strftime("%y%m%d")
     shellout(command, rawdata_host=host, date=stamp)
Example #50
0
 def run(self):
     """
     Extract the .doi field of every record in the compressed 'input'
     target, drop "null" values, keep the part starting at "10" and write
     the sorted, distinct values to the task output.
     """
     # Only the path is needed. NamedTemporaryFile closes its descriptor when
     # the with block exits, whereas a bare mkstemp() leaves it open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmpfile:
         stopover = tmpfile.name
     shellout(
         """jq -r '.doi' <(unpigz -c {input}) | grep -v "null" | grep -o "10.*" 2> /dev/null > {output} """,
         input=self.input().get('input').path,
         output=stopover)
     output = shellout("""sort -u {input} > {output} """, input=stopover)
     luigi.LocalTarget(output).move(self.output().path)
Example #51
0
File: khm.py Project: zazi/siskin
 def run(self):
     """
     Run sed over the dump configured under khm/dump to replace a control
     sequence with a space, convert the cleaned file with the
     109/109_marcbinary.py asset script and move the result to the task
     output.
     """
     # Raw string: "\o" is not a valid Python escape sequence, so a plain
     # literal triggers an invalid-escape warning. The backslash must reach
     # the shell unchanged, which the raw literal guarantees.
     output = shellout(
         r"""sed -e 's/'$(echo "\o001")'/ /g' < {input} > {output}""",
         input=self.config.get('khm', 'dump'))
     # TODO(miku): maybe check, if cleanup is still required.
     output = shellout("python {script} {input} {output}",
                       script=self.assets("109/109_marcbinary.py"),
                       input=output)
     luigi.LocalTarget(output).move(self.output().path)
 def run(self):
     """
     Call esmarc.py with the {date}-ppns.txt id file (created in
     LODTITTransform2ldj), but only if {date}.mrc.bz2 is not empty.
     """
     dump = "{date}.mrc.bz2".format(date=self.date)
     if not os.stat(dump).st_size > 0:
         # Nothing to process for this date.
         return

     template = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/processing/esmarc.py -z -server {rawdata_host}/finc-main-k10plus/mrc -idfile {date}-ppns.txt -prefix {date}-data"
     shellout(template.format(**self.config, date=self.date))
     sleep(5)
Example #53
0
File: hsmw.py Project: zazi/siskin
 def run(self):
     """
     Run metha-sync for the "institutes:medien" set of the HS Mittweida OAI
     endpoint, write the metha-cat result for the same set to a file and
     move it to the task output.
     """
     # Named oai_set so the builtin ``set`` is not shadowed. The set name is
     # defined once and used by both commands so they cannot drift apart.
     endpoint, oai_set = "https://monami.hs-mittweida.de/oai", "institutes:medien"
     shellout("""metha-sync -set {set} {endpoint}""",
              set=oai_set,
              endpoint=endpoint)
     output = shellout(
         """metha-cat -set "{set}" {endpoint} > {output} """,
         set=oai_set,
         endpoint=endpoint)
     luigi.LocalTarget(output).move(self.output().path)
Example #54
0
 def run(self):
     """
     Run metha-sync (with -rm) for the task's endpoint and format, then
     write the metha-cat result to a file and move it to the task output.
     """
     params = {'endpoint': self.endpoint, 'format': self.format}

     shellout("""metha-sync -rm -format {format} {endpoint} """, **params)
     harvested = shellout("""metha-cat -format {format} {endpoint} > {output}""", **params)

     luigi.LocalTarget(harvested).move(self.output().path)
Example #55
0
File: hhbd.py Project: zazi/siskin
 def run(self):
     """
     Run metha-sync for the task's URL and format, then write the metha-cat
     result (wrapped in a Records root element) to a file and move it to
     the task output.
     """
     # Both commands use self.url. Previously the sync step used a
     # hard-coded endpoint while the cat step read self.url, so a
     # non-default URL would have been read without ever being synced.
     shellout(
         "metha-sync -format {format} {url}",
         format=self.format,
         url=self.url)
     output = shellout(
         "metha-cat -format {format} -root Records {url} > {output}",
         format=self.format,
         url=self.url)
     luigi.LocalTarget(output).move(self.output().path)
Example #56
0
 def run(self):
     """
     Take the TXT id file created in LODTransform2ldj, fetch those records
     from the elasticsearch node and transform them to JSON linked data
     with esmarc.py (-z, gzipped output).
     """
     stamp = self.yesterday.strftime("%y%m%d")
     template = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/processing/esmarc.py -z -server {host}/swb-aut/mrc -idfile {date}-norm-aut-ppns.txt -prefix {date}-aut-data"
     shellout(template.format(**self.config, date=stamp))
     sleep(5)
Example #57
0
File: ijoc.py Project: zazi/siskin
 def run(self):
     """
     Run metha-sync and metha-cat for the task's endpoint with METHA_DIR set
     to the core/metha-dir config value, and move the metha-cat result
     (wrapped in a Records root element) to the task output.
     """
     metha_dir = self.config.get('core', 'metha-dir')
     shellout("""METHA_DIR={dir} metha-sync "{endpoint}" """,
              dir=metha_dir,
              endpoint=self.endpoint)

     metha_dir = self.config.get('core', 'metha-dir')
     harvested = shellout(
         """METHA_DIR={dir} metha-cat -root Records "{endpoint}" > {output}""",
         dir=metha_dir,
         endpoint=self.endpoint)

     luigi.LocalTarget(harvested).move(self.output().path)
Example #58
0
 def run(self):
     """
     Fetch the records listed in ids.txt (written by the preceding merge
     step) from the kxp-de14 index and transform them to JSON linked data
     with esmarc.py.
     """
     # NOTE(review): the index deletion below is disabled; confirm whether
     # it is still needed.
     # delete("{rawdata_host}/kxp-tit-{date}".format(**self.config,date=self.yesterday.strftime("%y%m%d")))
     # delete("{rawdata_host}/kxp-lok-{date}".format(**self.config,date=self.yesterday.strftime("%y%m%d")))
     stamp = self.yesterday.strftime("%y%m%d")
     template = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && ~/git/efre-lod-elasticsearch-tools/processing/esmarc.py  -z -server {rawdata_host}/kxp-de14/mrc -idfile ids.txt -prefix {date}-kxp"
     shellout(template.format(**self.config, date=stamp))
     sleep(5)
Example #59
0
 def run(self):
     """
     Bulk-load the {date}-tit.ldj.gz and {date}-lok.ldj.gz files (produced
     in LODKXPTransform2ldj) into the kxp-tit and kxp-lok indices with
     esbulk.
     """
     stamp = self.yesterday.strftime("%y%m%d")
     template = "esbulk -z -verbose -server {rawdata_host} -w {workers} -index kxp-{typ} -type mrc -id 001 {date}-{typ}.ldj.gz"
     for typ in ("tit", "lok"):
         # NOTE(review): index mapping/settings setup below is disabled;
         # confirm whether it is still needed.
         # put_dict("{rawdata_host}/kxp-{typ}".format(**self.config,typ=typ,date=self.yesterday.strftime("%y%m%d")),{"mappings":{"mrc":{"date_detection":False}}})
         # put_dict("{rawdata_host}/kxp-{typ}/_settings".format(**self.config,typ=typ,date=self.yesterday.strftime("%y%m%d")),{"index.mapping.total_fields.limit":5000})
         shellout(template.format(**self.config, typ=typ, date=stamp))
Example #60
0
 def run(self):
     """
     Run span-import (elsevier-tar) on every .tar file found under the
     elsevierjournals/backlog-dir directory, in sorted order, appending the
     compressed results to a single file that is finally moved to the task
     output.
     """
     directory = self.config.get('elsevierjournals', 'backlog-dir')
     # Only the path is needed. NamedTemporaryFile closes its descriptor when
     # the with block exits, whereas a bare mkstemp() leaves it open.
     with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmpfile:
         output = tmpfile.name
     for path in sorted(
             iterfiles(directory, fun=lambda p: p.endswith('.tar'))):
         shellout(
             "span-import -i elsevier-tar {input} | pigz -c >> {output}",
             input=path,
             output=output)
     luigi.LocalTarget(output).move(self.output().path)