Example #1
0
 def run(self):
     """Collect seekmap regions for every surface row and copy them.

     Reads the 'surface' input as TSV with the columns ``id`` and ``date``,
     looks up ``offset`` and ``length`` for each id in the 'seekmap' SQLite
     database and hands the collected rows, together with the opened 'file'
     input and the task output, to ``copyregions``.

     NOTE(review): ``fetchone()`` presumably yields ``None`` for an id that
     is missing from the seekmap table; such a value is appended unchanged,
     exactly as before -- confirm that ``copyregions`` never sees this case.
     """
     query = "SELECT offset, length FROM seekmap where id = ?"

     def locate(cursor, identifier):
         # One lookup per id; returns whatever row the cursor produces.
         cursor.execute(query, (identifier,))
         return cursor.fetchone()

     # Resources are acquired in the same order as the nested form did:
     # surface, output, file, seekmap database.
     with self.input().get('surface').open() as surface, \
             self.output().open('w') as sink, \
             self.input().get('file').open() as source, \
             sqlitedb(self.input().get('seekmap').path) as cursor:
         rows = surface.iter_tsv(cols=('id', 'date'))
         regions = [locate(cursor, row.id) for row in rows]
         copyregions(source, sink, regions)
Example #2
0
    def run(self):
        """Concatenate, per date, the regions listed in the input into the output.

        Steps, as performed by the code below:

        1. Column 3 of the input file is extracted, sorted and de-duplicated
           via the shell; each resulting line is treated as a date of the
           form ``Y-M-D``.
        2. For every date a ``NEPCombine`` task is built with the local
           scheduler and its output is opened.
        3. Lines of the input file matching the date are reduced to columns
           4 and 5, which are read as ``(offset, length)`` integer pairs and
           passed to ``copyregions`` together with the combined file and
           this task's output.

        Raises:
            ValueError: if a date line cannot be split into integers, or an
                offset/length column is not an integer.
        """
        datefile = shellout("cut -f3 {input} | LANG=C sort | LANG=C uniq > {output}", input=self.input().path)
        with open(datefile) as handle:
            # Use the str method rather than the ``string.strip`` module
            # function: the latter is deprecated in Python 2 and was removed
            # in Python 3. The list is built eagerly, so the file can be
            # closed before the dates are consumed.
            dates = [line.strip() for line in handle]

        with self.output().open('w') as output:
            for date in dates:
                task = NEPCombine(date=datetime.date(*(int(v) for v in date.split('-'))))
                # Make sure the per-date combined file exists before reading it.
                luigi.build([task], local_scheduler=True)
                with task.output().open() as fh:
                    seekfile = shellout("""LANG=C grep "{date}" "{input}" | cut -f 4,5 > {output}""", date=str(date), input=self.input().path)
                    with luigi.File(seekfile, format=TSV).open() as seekhandle:
                        # Lazy generator: consumed by copyregions while the
                        # seek file is still open.
                        seekmap = ((int(offset), int(length)) for offset, length in seekhandle.iter_tsv())
                        copyregions(fh, output, seekmap)
Example #3
0
    def run(self):
        """Build a filtered, UTF-8 encoded MARC file from a zipped input.

        Steps, as performed by the code below:

        1. All ``*.mrc`` members of the input zip are streamed into a single
           combined file.
        2. ``marcmap`` is run over the combined file; the line starting with
           ``ebr10661760`` (a known broken record) is dropped and fields two
           and three of each remaining line are kept as ``offset:length``.
        3. The listed regions are copied from the combined file into a
           temporary file via ``copyregions``.
        4. ``yaz-marcdump`` converts the temporary file from ``marc8s`` to
           ``utf8`` and the result is moved to this task's output path.

        Raises:
            ValueError: if a seekmap line does not consist of exactly two
                integers separated by ``:``.
        """
        import os  # local import: only needed to wrap the mkstemp descriptor

        combined = shellout(r"unzip -p {input} \*.mrc > {output}",
                            input=self.input().path)

        # there is a broken record inside!
        marcmap_path = shellout("""marcmap {input} | LANG=C grep -v ^ebr10661760 \
                           | awk '{{print $2":"$3}}' > {output} """,
                           input=combined)
        # prepare seekmap
        seekmap = []
        with open(marcmap_path) as handle:
            for line in handle:
                # Unpacking enforces exactly two integer fields per line.
                offset, length = map(int, line.strip().split(':'))
                seekmap.append((offset, length))

        # create the filtered file
        # mkstemp returns an already-open OS-level descriptor; wrap it with
        # os.fdopen so it is closed when the block exits instead of leaking.
        fd, tmp = tempfile.mkstemp(prefix='gluish-')
        # Regions are addressed by byte offset and length, so both files are
        # opened in binary mode to avoid any newline or encoding translation.
        with os.fdopen(fd, 'wb') as filtered:
            with open(combined, 'rb') as handle:
                copyregions(handle, filtered, seekmap)

        converted = shellout("""yaz-marcdump -f marc8s -t utf8 -o marc
                          -l 9=97 {input} > {output}""", input=tmp)
        luigi.File(converted).move(self.output().path)