def run(self):
    """
    Extract fields 001 and 653.a from the 'dump' input with marctotsv and
    write one (id, term) row per term to the task output.

    The 653.a column is split on '|' (the separator handed to marctotsv
    via -s) and each piece is split again on '--'; every resulting term
    is stripped of surrounding whitespace before being written.
    """
    # shellout is assumed to run the command and return the path that was
    # substituted for {output} -- the result is used as a file path below.
    extracted = shellout('marctotsv -k -s "|" {input} 001 653.a > {output}',
                         input=self.input().get('dump').path)
    with luigi.File(extracted, format=TSV).open() as source, \
            self.output().open('w') as sink:
        # Lazily flatten row -> subfield -> term into (id, term) pairs.
        pairs = ((row.id, term.strip())
                 for row in source.iter_tsv(cols=('id', 'terms'))
                 for subfield in row.terms.split('|')
                 for term in subfield.split('--'))
        for record_id, term in pairs:
            sink.write_tsv(record_id, term)
def run(self):
    """
    Fetch ``self.url`` with wget in quiet mode and move the downloaded
    file to this task's output path.
    """
    command = 'wget -q "{url}" -O {output}'
    # The value returned by shellout is used as the path of the download.
    downloaded = shellout(command, url=self.url)
    luigi.File(downloaded).move(self.output().path)
def run(self):
    """
    Count how often each distinct value occurs in the input, ignoring
    the first tab-separated column, and store the counts sorted from
    most to least frequent.
    """
    # cut drops column 1; sort | uniq -c counts duplicates; sort -nr
    # orders the result by count, descending.
    pipeline = "cut -f 2- {input}| sort | uniq -c | sort -nr > {output}"
    source_path = self.input().path
    counted = shellout(pipeline, input=source_path)
    luigi.File(counted).move(self.output().path)
def run(self):
    """
    Download the bzip2-compressed Gutenberg MARC catalog, decompress it
    and move the decompressed file to this task's output path.
    """
    url = "http://gutenberg.readingroo.ms/cache/generated/feeds/catalog.marc.bz2"
    # Step 1: quiet download of the compressed catalog.
    archive = shellout('wget -q "{url}" -O {output}', url=url)
    # Step 2: decompress to stdout (-c), redirected into a new file.
    unpacked = shellout('bunzip2 {input} -c > {output}', input=archive)
    luigi.File(unpacked).move(self.output().path)