Esempio n. 1
0
 def run(self):
     """
     Concatenate all input files into a single output file.

     Every input target is appended, in iteration order, to a temporary
     file, which is finally moved to the output path.
     """
     # mkstemp returns an already-open OS-level descriptor that the original
     # code dropped without closing. NamedTemporaryFile closes it on leaving
     # the `with` block and, with delete=False, keeps the (empty) file on
     # disk so the shell can append to it.
     with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as scratch:
         tmpfile = scratch.name
     for target in self.input():
         shellout("cat {input} >> {output}",
                  input=target.path,
                  output=tmpfile)
     luigi.File(tmpfile).move(self.output().path)
Esempio n. 2
0
 def run(self):
     """
     Record the first `*.ldj.gz` path found below the `crossref` input
     directory.

     Only one file is kept so that downstream steps run faster; remove the
     `head -1` stage from the pipeline to list every matching file.
     """
     crossref_dir = os.path.join(self.inputdir(), 'crossref')
     template = "find {directory} -name '*.ldj.gz' | head -1 > {output}"
     listing = shellout(template, directory=crossref_dir)
     luigi.File(listing).move(self.output().path)
Esempio n. 3
0
 def run(self):
     """
     TODO: Concatenate all input files.

     As written, an empty temporary file is created and moved to the output
     path; the concatenation itself is still to be filled in.
     """
     # Use NamedTemporaryFile instead of mkstemp so the OS-level descriptor
     # is closed right away rather than leaked; delete=False keeps the file.
     with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as scratch:
         tmpfile = scratch.name
     # TODO: loop over inputs and run `cat`
     luigi.File(tmpfile).move(self.output().path)
Esempio n. 4
0
 def run(self):
     """
     Write one (id, term) row per term found in the `dump` input.

     `marctotsv` produces a two-column file (fields 001 and 653.a). The
     second column is split on `|` and then on `--`; every resulting piece
     is stripped of surrounding whitespace and written next to its id.
     """
     extracted = shellout('marctotsv -k -s "|" {input} 001 653.a > {output}',
                          input=self.input().get('dump').path)
     source = luigi.File(extracted, format=TSV)
     with source.open() as reader:
         with self.output().open('w') as writer:
             for record in reader.iter_tsv(cols=('id', 'terms')):
                 # Flatten the two-level split into one stream of pieces.
                 pieces = (piece
                           for chunk in record.terms.split('|')
                           for piece in chunk.split('--'))
                 for piece in pieces:
                     writer.write_tsv(record.id, piece.strip())
Esempio n. 5
0
    def run(self):
        """
        TODO: For each file, we want to run a jq command.

        As written, the input is opened but not processed, and an empty
        temporary file is moved to the output path.
        """
        # Use NamedTemporaryFile instead of mkstemp so the OS-level
        # descriptor is closed right away rather than leaked; delete=False
        # keeps the file on disk.
        with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as scratch:
            temp = scratch.name
        with self.input().open() as handle:
            # TODO: insert code here
            pass

        luigi.File(temp).move(self.output().path)
Esempio n. 6
0
 def run(self):
     """
     Extract `.message.items[]` from every file listed in the input.

     The input is read line by line, each line being the path of a
     compressed file. Every file is decompressed with `unpigz`, filtered
     through `jq`, recompressed with `pigz` and appended to a temporary
     file, which is finally moved to the output path.
     """
     # Use NamedTemporaryFile instead of mkstemp so the OS-level descriptor
     # is closed right away rather than leaked; delete=False keeps the file
     # so the shell can append to it.
     with tempfile.NamedTemporaryFile(prefix='byoi-', delete=False) as scratch:
         temp = scratch.name
     with self.input().open() as handle:
         for path in map(str.strip, handle):
             if not path:
                 # A blank line carries no path; passing an empty argument
                 # to the pipeline would not process any file.
                 continue
             print('processing: %s' % path)
             shellout(
                 "jq -r -c '.message.items[]' <(unpigz -c {input}) | pigz -c >> {output}",
                 input=path,
                 output=temp)
     luigi.File(temp).move(self.output().path)
Esempio n. 7
0
    def run(self):
        """
        Download `self.filepath` from `self.host` with lftp.

        The lftp invocation sets a retry limit and a network timeout and
        resumes partial downloads (`get -c`). The downloaded file is moved
        to the output path.
        """
        command = """lftp -u {username},{password}
        -e "set net:max-retries 5; set net:timeout 10; get -c
        {filepath} -o {output}; exit" {host}"""

        # Every interpolated value is shell-quoted, including the host, so
        # that none of them can be interpreted as shell syntax. Quoting
        # leaves ordinary host names unchanged.
        output = shellout(command,
                          host=pipes.quote(self.host),
                          username=pipes.quote(self.username),
                          password=pipes.quote(self.password),
                          filepath=pipes.quote(self.filepath))
        luigi.File(output).move(self.output().path)
Esempio n. 8
0
def shellout(template, **kwargs):
    """
    Takes a shell command template and executes it. The template must use
    the new (2.6+) format mini language. `kwargs` must contain any defined
    placeholder, only `output` is optional.

    If `output` is not given, a temporary `luigi.File` is created and its
    path is substituted for `{output}`. If `output` is given, the returned
    target points at that path.

    Pass `preserve_spaces=True` to keep runs of spaces in the rendered
    command; by default they are collapsed into a single space.

    Returns the `luigi.File` the command wrote to.
    Raises RuntimeError on nonzero exit codes.

    Simple template:

    wc -l < {input} > {output}

    Quoted curly braces:

    ps ax|awk '{{print $1}}' > {output}

    Usage with luigi:

    ...
    tmp = shellout('wc -l < {input} > {output}', input=self.input().fn)
    tmp.move(self.output().fn)
    ....

    """
    preserve_spaces = kwargs.get('preserve_spaces', False)
    if 'output' in kwargs:
        # Previously the unused temporary target was returned even when the
        # caller chose the output location, so the result pointed at a path
        # the command never wrote to.
        target = luigi.File(kwargs['output'])
    else:
        # Should yield a random path string, e.g. /tmp/as3as8d90a8s9f8d
        target = luigi.File(is_tmp=True)
        kwargs['output'] = target.fn
    command = template.format(**kwargs)
    if not preserve_spaces:
        command = re.sub(' +', ' ', command)
    # With shell=True the command is passed as a plain string, as the
    # subprocess documentation recommends, not as a one-element list.
    code = subprocess.call(command, shell=True)
    if code != 0:
        raise RuntimeError('%s exitcode: %s' % (command, code))
    return target
Esempio n. 9
0
 def run(self):
     """
     Download `GND.rdf.gz` from the DNB data service with wget and move it
     to the output location.
     """
     query = urllib.urlencode({
         'cmd': 'fetch',
         'userID': 'opendata',
         'pass': '******',
         'mabheft': 'GND.rdf.gz'
     })
     url = "http://{server}{path}?{params}".format(
         server="datendienst.dnb.de",
         path="/cgi-bin/mabit.pl",
         params=query)
     downloaded = shellout("""wget --retry-connrefused "{url}" -O {output}""",
                           url=url)
     luigi.File(downloaded).move(self.output().fn)
Esempio n. 10
0
    def run(self):
        """
        Load GND records from the input into a `gnd` table.

        The input is split into records at whitespace-only lines. A record
        is stored only if its first line contains an `rdf:about` GND
        identifier; the identifier becomes the row id and the stripped
        lines, joined by newlines, become the content. The finished
        database file is moved to the output location.
        """
        stopover = random_tmp_path()
        pattern = re.compile(
            """rdf:about="http://d-nb.info/gnd/([0-9X-]+)">""")

        with dbopen(stopover) as cursor:
            cursor.execute("""CREATE TABLE gnd
                              (id text  PRIMARY KEY, content blob)""")
            cursor.execute("""CREATE INDEX IF NOT EXISTS
                              idx_gnd_id ON gnd (id)""")

            with self.input().open() as handle:
                # groupby yields alternating runs of blank and non-blank
                # lines; each non-blank run is one record.
                groups = itertools.groupby(handle, key=str.isspace)
                for is_blank, lines in groups:
                    if is_blank:
                        continue
                    # A list comprehension (rather than map + string.strip)
                    # gives an indexable list on both Python 2 and 3.
                    lines = [line.strip() for line in lines]
                    match = pattern.search(lines[0])
                    if match:
                        row = (match.group(1), '\n'.join(lines))
                        cursor.execute("INSERT INTO gnd VALUES (?, ?)", row)

        luigi.File(path=stopover).move(self.output().fn)
Esempio n. 11
0
    def run(self):
        """
        Load (id, successor) pairs from the input into a `successor` table.

        Each input line is split on whitespace into exactly two values.
        Pairs whose two values are equal are skipped. Indexes on both
        columns are created once all rows are inserted, and the database
        file is then moved to the output location.
        """
        stopover = random_tmp_path()

        with self.input().open() as handle, dbopen(stopover) as cursor:
            cursor.execute(
                """CREATE TABLE IF NOT EXISTS successor (id text,
                    successor text, PRIMARY KEY (id, successor))""")

            for line in handle:
                ident, follower = line.strip().split()
                if ident != follower:
                    cursor.execute("INSERT INTO successor VALUES (?, ?)",
                                   (ident, follower))

            cursor.execute("""CREATE INDEX IF NOT EXISTS
                                  idx_successor_id ON successor (id)""")
            cursor.execute("""CREATE INDEX IF NOT EXISTS
                                  idx_successor_successor
                                  ON successor (successor)""")

        luigi.File(stopover).move(self.output().fn)
Esempio n. 12
0
 def run(self):
     """ Copy the fixture file to the output path, so some output exists. """
     fixture = luigi.File(path=self.fixture)
     destination = self.output().path
     fixture.copy(destination)
Esempio n. 13
0
 def run(self):
     """ Fetch `self.url` with a quiet wget and move the result to the output path. """
     downloaded = shellout('wget -q "{url}" -O {output}', url=self.url)
     target = luigi.File(downloaded)
     target.move(self.output().path)
Esempio n. 14
0
 def run(self):
     """
     TODO: convert input to intermediate schema via span-import.

     NOTE(review): `output` is not assigned anywhere in this method, so the
     final line presumably raises a NameError until the conversion step is
     filled in -- unless a module-level `output` exists; verify.
     """
     # Placeholder: relies on the missing conversion step to define `output`.
     luigi.File(output).move(self.output().path)
Esempio n. 15
0
 def run(self):
     """ Download the VIAF links file with wget and move it to the output location. """
     template = """wget --retry-connrefused {url} -O {output}"""
     downloaded = shellout(
         template,
         url="http://viaf.org/viaf/data/viaf-20131014-links.txt.gz")
     luigi.File(downloaded).move(self.output().fn)
Esempio n. 16
0
 def run(self):
     """ Decompress the input with `gunzip -c` and move the result to the output location. """
     source = self.input().fn
     unpacked = shellout("gunzip -c {input} > {output}", input=source)
     luigi.File(unpacked).move(self.output().fn)
Esempio n. 17
0
 def run(self):
     """
     Download the bzip2-compressed MARC catalog and store it decompressed
     at the output path.
     """
     catalog_url = "http://gutenberg.readingroo.ms/cache/generated/feeds/catalog.marc.bz2"
     archive = shellout('wget -q "{url}" -O {output}', url=catalog_url)
     catalog = shellout('bunzip2 {input} -c > {output}', input=archive)
     luigi.File(catalog).move(self.output().path)
Esempio n. 18
0
 def run(self):
     """
     Pipe the decompressed input through `span-import` and store the
     recompressed result at the output path.
     """
     template = "span-import -w 2 -i crossref <(unpigz -c {input}) | pigz -c > {output}"
     converted = shellout(template, input=self.input().path)
     luigi.File(converted).move(self.output().path)
Esempio n. 19
0
 def run(self):
     """
     Pipe the decompressed input through `span-export` and store the
     recompressed result at the output path.
     """
     template = "span-export <(unpigz -c {input}) | pigz -c > {output}"
     exported = shellout(template, input=self.input().path)
     luigi.File(exported).move(self.output().path)
Esempio n. 20
0
 def run(self):
     """ Count the lines of the input with `wc -l` and store the count. """
     source = self.input().fn
     counted = shellout("wc -l < {input} > {output}", input=source)
     luigi.File(counted).move(self.output().fn)
Esempio n. 21
0
 def run(self):
     """ Simulate touch: open the output path for writing and write nothing. """
     # The handle was previously left open and never closed. Closing it
     # explicitly releases the descriptor and lets the target finish the
     # write deterministically instead of relying on garbage collection.
     with luigi.File(path=self.output().path).open('w'):
         pass
Esempio n. 22
0
 def run(self):
     """ Run `pagerank` on the `data` input and store what it prints. """
     source = self.input().get('data').fn
     ranked = shellout("pagerank {input} > {output}", input=source)
     luigi.File(ranked).move(self.output().fn)
Esempio n. 23
0
 def run(self):
     """
     Pipe the decompressed `input` through `span-tag`, configured by the
     `config` input, and store the recompressed result at the output path.
     """
     template = 'span-tag -c {config} <(unpigz -c {input}) | pigz -c > {output}'
     config_path = self.input().get('config').path
     data_path = self.input().get('input').path
     tagged = shellout(template, config=config_path, input=data_path)
     luigi.File(tagged).move(self.output().path)
Esempio n. 24
0
 def run(self):
     """
     Count how often each value occurs, ignoring the first column, and
     store the counts sorted from most to least frequent.
     """
     pipeline = "cut -f 2- {input}| sort | uniq -c | sort -nr > {output}"
     counted = shellout(pipeline, input=self.input().path)
     luigi.File(counted).move(self.output().path)