Ejemplo n.º 1
0
    def run(self):
        """Collect the records referenced by the input into a single file.

        The distinct values of the second input column are read as dates of
        the form ``Y-M-D``. For each date the corresponding MARC dump and
        seekmap database tasks are built, the (offset, length) pairs of the
        ids belonging to that date are looked up in the seekmap, and those
        regions are copied from the dump into this task's output.
        """
        output = shellout("cut -f2 {input} | LANG=C sort | LANG=C uniq > {output}", input=self.input().path)
        with open(output) as handle:
            # str.strip instead of string.strip: the latter exists on Python 2 only.
            dates = [line.strip() for line in handle]

        with self.output().open('w') as output:
            for date in dates:
                dateobj = datetime.date(*map(int, date.split('-')))
                marc = SWBOpenDataMarc(date=dateobj)
                sdb = SWBOpenDataSeekMapDB(date=dateobj)
                luigi.build([marc, sdb], local_scheduler=True)
                with open(marc.output().path) as marcfile:
                    with sqlite3db(sdb.output().path) as cursor:
                        # NOTE(review): `df` is not defined inside this method; it
                        # has to come from an enclosing scope - confirm.
                        idset = df[df.date == date].id.values.tolist()
                        limit, offset = self.limit, 0
                        while True:
                            # Stop when the ids are exhausted, not when a batch
                            # happens to match nothing: otherwise one batch of
                            # unknown ids would drop all remaining ids.
                            batch = idset[offset:offset + limit]
                            if not batch:
                                break
                            # Ids are inlined (in batches of `limit`) rather than
                            # bound as parameters; embedded single quotes are
                            # doubled so they cannot break out of the literal.
                            quoted = ','.join(
                                "'%s'" % ('%s' % ident).replace("'", "''")
                                for ident in batch)
                            cursor.execute("""
                                SELECT offset, length
                                FROM seekmap WHERE id IN (%s)""" % quoted)
                            rows = cursor.fetchall()
                            if rows:
                                copyregions(marcfile, output, rows)
                            offset += limit
Ejemplo n.º 2
0
 def run(self):
     """Copy the regions of all ids listed in the 'surface' input to the output.

     For every row of the 'surface' TSV (columns: id, date) the seekmap
     database is queried for the (offset, length) of that id; the collected
     results are then handed to copyregions in one call, reading from the
     'file' input and writing to this task's output.
     """
     query = "SELECT offset, length FROM seekmap where id = ?"
     with self.input().get('surface').open() as surface:
         with self.output().open('w') as sink:
             with self.input().get('file').open() as records:
                 with sqlite3db(self.input().get('seekmap').path) as cursor:

                     def lookup(ident):
                         # One query per id; keeps whatever fetchone() yields.
                         cursor.execute(query, (ident,))
                         return cursor.fetchone()

                     spans = [lookup(row.id)
                              for row in surface.iter_tsv(cols=('id', 'date'))]
                     copyregions(records, sink, spans)
Ejemplo n.º 3
0
    def test_copyregions(self):
        """copyregions copies exactly the requested (offset, length) span.

        A source file holding ``0123456789`` is created and the region at
        offset 3 with length 2 is copied; the destination must contain ``34``.
        """
        import os

        # Bytes throughout: NamedTemporaryFile is opened in binary mode by
        # default, so a bytes literal works on both Python 2 and 3.
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            handle.write(b'0123456789\n')

        dst = None
        try:
            with open(handle.name, 'rb') as src:
                with tempfile.NamedTemporaryFile(delete=False) as dst:
                    copyregions(src, dst, [(3, 2)])

            with open(dst.name, 'rb') as result:
                # assertEqual: assertEquals is a deprecated alias.
                self.assertEqual(b"34", result.read())
        finally:
            # delete=False leaves the files on disk; clean up explicitly.
            os.remove(handle.name)
            if dst is not None:
                os.remove(dst.name)
Ejemplo n.º 4
0
    def run(self):
        """Assemble the output from per-date combined files.

        The distinct values of the third input column are read as dates of
        the form ``Y-M-D``. For each date the NEPCombine task is built, the
        input lines containing that date are reduced to columns 4 and 5
        (taken as offset and length), and those regions are copied from the
        combined file into this task's output.
        """
        datefile = shellout("cut -f3 {input} | LANG=C sort | LANG=C uniq > {output}", input=self.input().path)
        with open(datefile) as handle:
            # str.strip instead of string.strip: the latter exists on Python 2 only.
            dates = [line.strip() for line in handle]

        with self.output().open('w') as output:
            for date in dates:
                task = NEPCombine(date=datetime.date(*(int(v) for v in date.split('-'))))
                luigi.build([task], local_scheduler=True)
                with task.output().open() as fh:
                    # NOTE(review): grep matches the date anywhere on the line,
                    # not only in column 3 - confirm this cannot over-select.
                    seekfile = shellout("""LANG=C grep "{date}" "{input}" | cut -f 4,5 > {output}""", date=str(date), input=self.input().path)
                    with luigi.File(seekfile, format=TSV).open() as regions:
                        # Lazy generator: consumed by copyregions while the
                        # seekfile handle is still open.
                        seekmap = ((int(offset), int(length)) for offset, length in regions.iter_tsv())
                        copyregions(fh, output, seekmap)
Ejemplo n.º 5
0
    def run(self):
        """Extract the MARC records from the zipped input, drop a broken
        record and convert the remainder with yaz-marcdump.

        Steps: unzip all ``*.mrc`` members into one file, build a seekmap of
        (offset, length) pairs from ``marcmap`` output while filtering out
        record ``ebr10661760``, copy the remaining regions into a temporary
        file, run ``yaz-marcdump`` over it and move the result to this
        task's output path.
        """
        import os

        combined = shellout(r"unzip -p {input} \*.mrc > {output}",
                            input=self.input().path)

        # there is a broken record inside!
        mmap = shellout("""marcmap {input} | LANG=C grep -v ^ebr10661760 \
                           | awk '{{print $2":"$3}}' > {output} """,
                           input=combined)
        # prepare seekmap
        seekmap = []
        with open(mmap) as handle:
            for line in handle:
                offset, length = map(int, line.strip().split(':'))
                seekmap.append((offset, length))

        # create the filtered file; mkstemp returns an already open file
        # descriptor, so wrap it instead of discarding it (which leaked the fd)
        fd, tmp = tempfile.mkstemp(prefix='gluish-')
        with os.fdopen(fd, 'w') as filtered:
            with open(combined) as handle:
                copyregions(handle, filtered, seekmap)

        output = shellout("""yaz-marcdump -f marc8s -t utf8 -o marc
                          -l 9=97 {input} > {output}""", input=tmp)
        luigi.File(output).move(self.output().path)