Example #1
0
    def run(self):
        """
        Apply the "stylesheet" input to every file named in the "filelist" input.

        Each listed path is transformed with ``xsltproc`` and the result is
        moved into a directory named after ``self.date`` that sits next to the
        output file. The destination name is the source basename with ``.xml``
        replaced by ``.marcxml``; paths whose destination already exists are
        not converted again.

        A ``RuntimeError`` raised during a single conversion is logged and
        appended (tab-separated path and error) to a temporary error log, and
        processing continues with the next path. At the end, every path
        yielded by ``iterfiles(target)`` is written to the output as a receipt.
        """
        target = os.path.join(os.path.dirname(self.output().path), str(self.date))
        if not os.path.exists(target):
            os.makedirs(target)

        # mkstemp hands back an already open OS-level descriptor together with
        # the path. Only the path is used below (the log is reopened in append
        # mode on demand), so close the descriptor instead of leaking it.
        fd, errorlog = tempfile.mkstemp(prefix="siskin-")
        os.close(fd)

        stylesheet = self.input().get("stylesheet").path
        # total number of entries, used only for the progress message
        size = wc(self.input().get("filelist").path)

        with self.input().get("filelist").open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=("path",)), start=1):
                basename = os.path.basename(row.path)
                name = basename.replace(".xml", ".marcxml")
                destination = os.path.join(target, name)
                if not os.path.exists(destination):
                    try:
                        output = shellout("xsltproc {xsl} {input} > {output}", input=row.path, xsl=stylesheet)
                        luigi.File(output).move(destination)
                    except RuntimeError as err:
                        # a single failed conversion must not abort the run
                        self.logger.error("{0}: {1}".format(row.path, err))
                        with open(errorlog, "a") as log:
                            log.write("%s\t%s\n" % (row.path, err))
                self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

        # write receipt
        with self.output().open("w") as output:
            for path in iterfiles(target):
                output.write_tsv(path)

        # this is just a temporary artefact for now
        self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
Example #2
0
    def run(self):
        """
        Fetch every URL listed in the input into a subdirectory and write a receipt.

        The subdirectory is named after ``self.closest()`` and lives next to
        the output file. Each URL is downloaded with ``wget`` and stored as
        ``<segment>.xml``, where ``<segment>`` is the second-to-last
        "/"-separated component of the URL. URLs whose destination file
        already exists are not downloaded again. Afterwards, all paths from
        ``iterfiles(target)`` that end in ``.xml`` are written to the output.
        """
        # directory that collects the downloads
        parent = os.path.dirname(self.output().path)
        target = os.path.join(parent, str(self.closest()))
        if not os.path.exists(target):
            os.makedirs(target)

        # number of entries, only needed for the progress message
        total = wc(self.input().path)

        with self.input().open() as handle:
            rows = handle.iter_tsv(cols=("url",))
            for index, row in enumerate(rows, start=1):
                segment = row.url.split("/")[-2]
                stem = os.path.join(target, segment)
                destination = "{name}.xml".format(name=stem)

                already_there = os.path.exists(destination)
                if not already_there:
                    downloaded = shellout(
                        """wget -q --retry-connrefused
                                      {url} -O {output}""",
                        url=row.url,
                    )
                    luigi.File(downloaded).move(destination)

                progress = "{0}/{1} {2}".format(index, total, row.url)
                self.logger.debug(progress)

        # the "receipt" lists only the XML files found below the target
        with self.output().open("w") as receipt:
            for path in iterfiles(target):
                if not path.endswith(".xml"):
                    continue
                receipt.write_tsv(path)
Example #3
0
    def run(self):
        """
        Repair the MARCXML files listed in the input and combine them into one file.

        For every listed path the document is parsed, ``marc:datafield``
        elements that are nested inside another ``marc:datafield`` are moved
        up to the enclosing record, and the repaired document is written to a
        temporary file. That file is converted with
        ``yaz-marcdump -i marcxml -o marc`` and the result is appended to a
        combined temporary file, which is finally moved to the output path.

        Exit code 5 of ``yaz-marcdump`` is passed to ``shellout`` via
        ``ignoremap`` (see the comment at the call site).
        """
        target = os.path.join(os.path.dirname(self.output().path), str(self.date))
        if not os.path.exists(target):
            os.makedirs(target)

        # total number of entries, used only for the progress message
        size = wc(self.input().path)

        # mkstemp returns an open descriptor plus the path; the shell command
        # below appends to the path, so the descriptor itself is not needed.
        fd, combined = tempfile.mkstemp(prefix="siskin-")
        os.close(fd)

        with self.input().open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=("path",)), start=1):

                # Cleanup wrongly nested data fields, see:
                # https://gist.github.com/miku/ea779a221d00b5524fcd
                # in 2014-05, this corrects 673 errors, while 31 are not yet
                # recoverable!
                # (separate name, so the input handle being iterated is not shadowed)
                with open(row.path) as source:
                    f = cStringIO.StringIO(source.read())
                    doc = etree.parse(f)

                result = doc.xpath(
                    "/marc:record/marc:datafield/marc:datafield", namespaces={"marc": "http://www.loc.gov/MARC21/slim"}
                )
                if len(result) > 0:
                    self.logger.debug("Fixing broken MARCXML in: {0}".format(row.path))
                for misplaced in result:
                    parent = misplaced.getparent()
                    record = misplaced.getparent().getparent()
                    parent.remove(misplaced)
                    record.append(misplaced)

                # One temporary file per input row: write through the descriptor
                # returned by mkstemp (closing it with the file object) and
                # delete the file once it has been converted, so that neither
                # descriptors nor temporary files pile up over a long file list.
                fd, cleaned = tempfile.mkstemp(prefix="siskin-")
                try:
                    with os.fdopen(fd, "w") as output:
                        output.write(etree.tostring(doc, pretty_print=True))

                    # actually do the conversion ...
                    basename = os.path.basename(row.path)
                    name = basename.replace(".marcxml", ".mrc")
                    destination = os.path.join(target, name)
                    # NOTE(review): nothing in this method creates `destination`;
                    # the converted records are appended to `combined` instead.
                    # Presumably this check is a leftover -- confirm.
                    if not os.path.exists(destination):
                        # exit(5) for serious decoding errors
                        # see: http://www.indexdata.com/yaz/doc/NEWS
                        shellout(
                            """yaz-marcdump -i marcxml -o marc
                             {input} >> {output}""",
                            input=cleaned,
                            output=combined,
                            ignoremap={5: "FIXME"},
                        )
                finally:
                    os.remove(cleaned)
                self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

        luigi.File(combined).move(self.output().path)