def run(self):
    """Convert each XML file named in the ``filelist`` input via XSLT.

    Every path read from the ``filelist`` input is passed to ``xsltproc``
    together with the ``stylesheet`` input. The result is moved into a
    directory named after ``self.date`` that sits next to the task output,
    with the ``.xml`` part of the file name replaced by ``.marcxml``.
    Destinations that already exist are skipped. A conversion that raises
    ``RuntimeError`` is logged and appended to a temporary error log; it
    does not abort the run. Finally, every path yielded by
    ``iterfiles(target)`` is written to the task output as a receipt.
    """
    target = os.path.join(os.path.dirname(self.output().path), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)

    # mkstemp hands back an already-open descriptor; only the path is
    # needed here, so close the descriptor right away instead of leaking it.
    fd, errorlog = tempfile.mkstemp(prefix='siskin-')
    os.close(fd)

    inputs = self.input()
    stylesheet = inputs.get('stylesheet').path
    filelist = inputs.get('filelist')
    size = wc(filelist.path)

    with filelist.open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=('path',)), start=1):
            basename = os.path.basename(row.path)
            name = basename.replace(".xml", ".marcxml")
            destination = os.path.join(target, name)
            if not os.path.exists(destination):
                try:
                    output = shellout("xsltproc {xsl} {input} > {output}",
                                      input=row.path, xsl=stylesheet)
                    luigi.File(output).move(destination)
                except RuntimeError as err:
                    # Best effort: record the failure and carry on with
                    # the remaining files.
                    self.logger.error("{0}: {1}".format(row.path, err))
                    with open(errorlog, 'a') as log:
                        log.write('%s\t%s\n' % (row.path, err))
            self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

    # write receipt
    with self.output().open('w') as output:
        for path in iterfiles(target):
            output.write_tsv(path)

    # this is just a temporary artefact for now
    self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
def test_fx(self):
    """Build ``FXRates`` and validate its output rows.

    Checks that the output has as many lines as there are expected
    currencies, that every ``symbol`` is one of the expected currencies,
    and that every ``rate`` parses as a ``decimal.Decimal``.
    """
    task = FXRates()
    luigi.build([task], local_scheduler=True)
    # assertEqual/assertIn replace the deprecated assertEquals alias and
    # the less informative assertTrue(x in y).
    self.assertEqual(wc(task.output().path),
                     len(ECBFXTest.EXPECTED_CURRENCIES))
    with task.output().open() as handle:
        for row in handle.iter_tsv(cols=('symbol', 'rate')):
            self.assertIn(row.symbol, ECBFXTest.EXPECTED_CURRENCIES)
            try:
                decimal.Decimal(row.rate)
            except decimal.InvalidOperation as err:
                self.fail(err)
def run(self):
    """Repair nested datafields in each MARCXML file and convert to MARC.

    For every path listed in the input, the document is parsed and any
    ``marc:datafield`` found directly inside another ``marc:datafield`` is
    moved up to the enclosing record. The (possibly repaired) document is
    written to a temporary file and appended, via ``yaz-marcdump``, to one
    combined temporary file, which is finally moved to the task output.
    """
    target = os.path.join(os.path.dirname(self.output().path), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    size = wc(self.input().path)

    # Only the path of the combined file is needed (the shell appends to
    # it), so close the descriptor mkstemp opened instead of leaking it.
    fd, combined = tempfile.mkstemp(prefix='siskin-')
    os.close(fd)

    namespaces = {'marc': 'http://www.loc.gov/MARC21/slim'}

    with self.input().open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=('path',)), start=1):
            # Cleanup wrongly nested data fields, see:
            # https://gist.github.com/miku/ea779a221d00b5524fcd
            # in 2014-05, this corrects 673 errors, while 31 are not yet
            # recoverable!
            # A distinct name keeps the outer ``handle`` (still being
            # iterated) from being shadowed.
            with open(row.path) as source:
                doc = etree.parse(cStringIO.StringIO(source.read()))

            result = doc.xpath('/marc:record/marc:datafield/marc:datafield',
                               namespaces=namespaces)
            if result:
                self.logger.debug("Fixing broken MARCXML in: {0}".format(row.path))
                for misplaced in result:
                    parent = misplaced.getparent()
                    record = parent.getparent()
                    parent.remove(misplaced)
                    record.append(misplaced)

            fd, cleaned = tempfile.mkstemp(prefix='siskin-')
            try:
                # Write through the descriptor mkstemp returned so that it
                # is closed when the block exits.
                with os.fdopen(fd, 'w') as output:
                    output.write(etree.tostring(doc, pretty_print=True))

                # actually do the conversion ...
                basename = os.path.basename(row.path)
                name = basename.replace(".marcxml", ".mrc")
                destination = os.path.join(target, name)
                # NOTE(review): nothing in this method creates
                # ``destination``; the check only skips rows for which
                # such a file already exists from elsewhere -- confirm
                # this is intended.
                if not os.path.exists(destination):
                    # exit(5) for serious decoding errors
                    # see: http://www.indexdata.com/yaz/doc/NEWS
                    shellout("""yaz-marcdump -i marcxml -o marc {input} >> {output}""",
                             input=cleaned, output=combined,
                             ignoremap={5: 'FIXME'})
            finally:
                # The cleaned copy is only an intermediate; remove it so a
                # large run does not fill the temp directory.
                os.remove(cleaned)

            self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

    luigi.File(combined).move(self.output().path)
def run(self):
    """Fetch every URL listed in the input into a per-date subdirectory.

    Each URL is downloaded with ``wget`` and stored as ``<segment>.xml``,
    where ``<segment>`` is the second-to-last ``/``-separated part of the
    URL. Files that already exist are not downloaded again. Afterwards the
    ``.xml`` paths yielded by ``iterfiles`` for the subdirectory are
    written to the task output.
    """
    # The download directory sits next to the task output and is named
    # after the value returned by closest().
    outdir = os.path.join(os.path.dirname(self.output().path),
                          str(self.closest()))
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    total = wc(self.input().path)

    with self.input().open() as listing:
        rows = listing.iter_tsv(cols=('url',))
        for position, row in enumerate(rows, start=1):
            segment = row.url.split('/')[-2]
            destination = "{name}.xml".format(name=os.path.join(outdir, segment))
            already_present = os.path.exists(destination)
            if not already_present:
                downloaded = shellout(
                    """wget -q --retry-connrefused {url} -O {output}""",
                    url=row.url)
                luigi.File(downloaded).move(destination)
            self.logger.debug("{0}/{1} {2}".format(position, total, row.url))

    # The "receipt": one line per downloaded XML file.
    with self.output().open('w') as receipt:
        xml_paths = (p for p in iterfiles(outdir) if p.endswith('.xml'))
        for xml_path in xml_paths:
            receipt.write_tsv(xml_path)
def test_wc(self):
    """``wc`` reports the number of lines in a three-line file."""
    import os  # local import: needed only to clean up the temporary file

    # Text mode so that writing str works on Python 2 and 3 alike;
    # delete=False because the file is read by name after it is closed.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as handle:
        handle.write('Line 1\n')
        handle.write('Line 2\n')
        handle.write('Line 3\n')

    try:
        # Assert after the handle is closed so all lines are flushed.
        self.assertEqual(3, wc(handle.name))
    finally:
        # delete=False leaves the file behind; remove it explicitly.
        os.remove(handle.name)