Example #1
0
    def run(self):
        """
        Convert every file named in the 'filelist' input with xsltproc.

        Each path read from the file list is transformed with the
        'stylesheet' input. The result is stored in a directory named after
        ``self.date`` next to the task output, with ``.xml`` in the basename
        replaced by ``.marcxml``. Paths whose destination already exists are
        skipped. A RuntimeError raised while converting or moving a file is
        logged and appended to a temporary error log instead of aborting the
        run. Afterwards every path yielded by ``iterfiles(target)`` is
        written to the task output as a receipt.
        """
        target = os.path.join(os.path.dirname(self.output().path), str(self.date))
        if not os.path.exists(target):
            os.makedirs(target)

        # mkstemp returns an *open* descriptor together with the path. Only
        # the path is used below (the log is reopened by name), so close the
        # descriptor right away instead of leaking it.
        fd, errorlog = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)

        inputs = self.input()
        stylesheet = inputs.get('stylesheet').path
        # total number of entries, only used for the progress message
        size = wc(inputs.get('filelist').path)

        with inputs.get('filelist').open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=('path',)), start=1):
                basename = os.path.basename(row.path)
                name = basename.replace(".xml", ".marcxml")
                destination = os.path.join(target, name)
                if not os.path.exists(destination):
                    try:
                        output = shellout("xsltproc {xsl} {input} > {output}",
                                          input=row.path, xsl=stylesheet)
                        luigi.File(output).move(destination)
                    except RuntimeError as err:
                        # keep going: one broken file must not stop the batch
                        self.logger.error("{0}: {1}".format(row.path, err))
                        with open(errorlog, 'a') as log:
                            log.write('%s\t%s\n' % (row.path, err))
                self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

        # write receipt
        with self.output().open('w') as output:
            for path in iterfiles(target):
                output.write_tsv(path)

        # this is just a temporary artefact for now
        self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
Example #2
0
 def test_fx(self):
     """Build FXRates and check its output against the expected currencies.

     The output must have one line per expected currency, every symbol
     must be one of the expected currencies, and every rate must parse as
     a decimal number.
     """
     task = FXRates()
     luigi.build([task], local_scheduler=True)
     # assertEqual / assertIn replace the deprecated assertEquals alias and
     # the opaque assertTrue(x in y); both give clearer failure messages.
     self.assertEqual(wc(task.output().path), len(ECBFXTest.EXPECTED_CURRENCIES))
     with task.output().open() as handle:
         for row in handle.iter_tsv(cols=('symbol', 'rate')):
             self.assertIn(row.symbol, ECBFXTest.EXPECTED_CURRENCIES)
             try:
                 decimal.Decimal(row.rate)
             except decimal.InvalidOperation as err:
                 self.fail(err)
Example #3
0
 def test_fx(self):
     """Build FXRates and check its output against the expected currencies.

     The output must have one line per expected currency, every symbol
     must be one of the expected currencies, and every rate must parse as
     a decimal number.
     """
     task = FXRates()
     luigi.build([task], local_scheduler=True)
     # assertEqual / assertIn replace the deprecated assertEquals alias and
     # the opaque assertTrue(x in y); both give clearer failure messages.
     self.assertEqual(wc(task.output().path),
                      len(ECBFXTest.EXPECTED_CURRENCIES))
     with task.output().open() as handle:
         for row in handle.iter_tsv(cols=('symbol', 'rate')):
             self.assertIn(row.symbol, ECBFXTest.EXPECTED_CURRENCIES)
             try:
                 decimal.Decimal(row.rate)
             except decimal.InvalidOperation as err:
                 self.fail(err)
Example #4
0
    def run(self):
        """
        Repair the MARCXML files listed in the input and convert them to MARC.

        For every path in the input list the document is parsed, datafields
        nested directly inside another datafield are moved up to the
        enclosing record, and the cleaned document is written to a temporary
        file. That file is passed to ``yaz-marcdump``, whose output is
        appended to a single combined file. Finally the combined file is
        moved to the task output path.
        """
        target = os.path.join(os.path.dirname(self.output().path), str(self.date))
        if not os.path.exists(target):
            os.makedirs(target)

        size = wc(self.input().path)
        # mkstemp returns an *open* descriptor plus the path. The file is
        # only ever addressed by name below, so close the descriptor now
        # instead of leaking it.
        fd, combined = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)

        marc_ns = {'marc': 'http://www.loc.gov/MARC21/slim'}

        with self.input().open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=('path',)), start=1):

                # Cleanup wrongly nested data fields, see:
                # https://gist.github.com/miku/ea779a221d00b5524fcd
                # in 2014-05, this corrects 673 errors, while 31 are not yet
                # recoverable!
                # (a distinct name avoids shadowing the file-list handle)
                with open(row.path) as source:
                    doc = etree.parse(cStringIO.StringIO(source.read()))

                result = doc.xpath('/marc:record/marc:datafield/marc:datafield',
                                   namespaces=marc_ns)
                if result:
                    self.logger.debug("Fixing broken MARCXML in: {0}".format(row.path))
                for misplaced in result:
                    # lift the nested datafield up to the record level
                    parent = misplaced.getparent()
                    record = parent.getparent()
                    parent.remove(misplaced)
                    record.append(misplaced)

                # One temporary file per row: reuse the descriptor from
                # mkstemp for writing (so it gets closed) and remove the
                # file once it has been converted.
                fd, cleaned = tempfile.mkstemp(prefix='siskin-')
                try:
                    with os.fdopen(fd, 'w') as output:
                        output.write(etree.tostring(doc, pretty_print=True))

                    # actually do the conversion ...
                    basename = os.path.basename(row.path)
                    name = basename.replace(".marcxml", ".mrc")
                    destination = os.path.join(target, name)
                    # NOTE(review): nothing in this method creates
                    # `destination`; presumably this only skips files left
                    # there by something else -- confirm.
                    if not os.path.exists(destination):
                        # exit(5) for serious decoding errors
                        # see: http://www.indexdata.com/yaz/doc/NEWS
                        # (whitespace inside the literal is part of the
                        # command template and is kept unchanged)
                        shellout("""yaz-marcdump -i marcxml -o marc
                             {input} >> {output}""", input=cleaned,
                                 output=combined, ignoremap={5: 'FIXME'})
                finally:
                    os.remove(cleaned)
                self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

        luigi.File(combined).move(self.output().path)
Example #5
0
    def run(self):
        """
        Fetch every URL from the input list into a subdirectory.

        The subdirectory sits next to the task output and is named after
        ``self.closest()``. Each file is named after the second-to-last
        ``/``-separated segment of its URL plus ``.xml``; URLs whose
        destination already exists are not fetched again. The paths ending
        in ``.xml`` that ``iterfiles`` yields for the subdirectory are then
        written to the task output.
        """
        # directory collecting the downloads, next to the task output
        base = os.path.dirname(self.output().path)
        target = os.path.join(base, str(self.closest()))
        if not os.path.exists(target):
            os.makedirs(target)
        # number of entries, used for the progress message only
        size = wc(self.input().path)

        with self.input().open() as handle:
            rows = handle.iter_tsv(cols=('url',))
            for i, row in enumerate(rows, start=1):
                stem = row.url.split('/')[-2]
                destination = "{name}.xml".format(name=os.path.join(target, stem))
                if not os.path.exists(destination):
                    pulled = shellout("""wget -q --retry-connrefused
                                      {url} -O {output}""", url=row.url)
                    luigi.File(pulled).move(destination)
                self.logger.debug("{0}/{1} {2}".format(i, size, row.url))

        # the "receipt" lists the XML files present in the subdirectory
        with self.output().open('w') as receipt:
            for path in iterfiles(target):
                if not path.endswith('.xml'):
                    continue
                receipt.write_tsv(path)
Example #6
0
 def test_wc(self):
     """wc must report three lines for a three-line file."""
     import os  # local import: only needed to remove the fixture file

     # delete=False keeps the file after the with-block so wc can open it by
     # name; it therefore has to be removed explicitly afterwards.
     with tempfile.NamedTemporaryFile(delete=False) as handle:
         # NamedTemporaryFile opens in binary mode by default, so write
         # bytes (a plain str raises TypeError on Python 3).
         handle.write(b'Line 1\n')
         handle.write(b'Line 2\n')
         handle.write(b'Line 3\n')
     try:
         self.assertEqual(3, wc(handle.name))
     finally:
         os.remove(handle.name)