Exemple #1
0
 def parse(self, basefile):
     """Write fixed parsed and distilled content for basefile ``"1"``.

     For basefile ``"1"`` three files are written through
     ``util.writefile``: the parsed file, a parsed attachment named
     ``attach.txt`` and the distilled file. Every other basefile is
     rejected.

     :param basefile: identifier of the document to parse
     :returns: ``True`` once all three files have been written
     :raises errors.ParseError: if ``basefile`` is anything but ``"1"``
     """
     # Reject everything we cannot handle up front.
     if basefile != "1":
         raise errors.ParseError("No can do!")

     store = self.store
     main_path = store.parsed_path("1")
     util.writefile(main_path, "basefile 1, parsed by b")

     attachment_path = store.parsed_path("1", attachment="attach.txt")
     util.writefile(attachment_path,
                    "attachment for basefile 1, parsed by b")

     metadata_path = store.distilled_path("1")
     util.writefile(metadata_path, "basefile 1, metadata from b")
     return True
Exemple #2
0
 def lazy_downloaded_to_intermediate(basefile):
     """Return a readable source for the intermediate form of ``basefile``.

     ``self`` is not a parameter here; it is taken from the enclosing
     scope.

     If the ``index.pdf`` attachment does not exist, fall back to (in
     order) the ``index.html`` attachment, the ``dokument/html``
     element of the downloaded XML file, or a placeholder HTML
     snippet. Otherwise the file is run through
     ``StreamingPDFReader.convert``; if that raises
     ``errors.PDFFileIsEmpty`` or ``errors.ExternalCommandError`` the
     conversion is retried once with ``ocr_lang="swe"``.

     :param basefile: identifier of the document to convert
     :returns: an open file object, a ``StringIO``, or whatever
               ``reader.convert`` returns
     :raises errors.ParseError: if the intermediate file is larger
                                than 20 * 1024 * 1024 bytes
     """
     downloaded_path = self.store.downloaded_path(
         basefile, attachment="index.pdf")
     downloaded_path_html = self.store.downloaded_path(
         basefile, attachment="index.html")
     if not os.path.exists(downloaded_path):
         if os.path.exists(downloaded_path_html):
             # attempt to parse HTML instead (the caller owns, and is
             # expected to close, the returned file object)
             return open(downloaded_path_html)
         # just grab the HTML from the XML file itself...
         tree = etree.parse(self.store.downloaded_path(basefile))
         # find() returns None when there is no match, so guard each
         # step instead of chaining and crashing with AttributeError
         # when the <dokument> element is missing.
         dokument = tree.getroot().find("dokument")
         html = dokument.find("html") if dokument is not None else None
         if html is not None:
             return StringIO(html.text)
         return StringIO("<html><h1>Dokumenttext saknas</h1></html>")

     intermediate_path = self.store.intermediate_path(basefile)
     # Arguments shared by the first conversion attempt and the OCR
     # retry.
     convert_args = {
         "filename": downloaded_path,
         "workdir": os.path.dirname(intermediate_path),
         "images": self.config.pdfimages,
         "convert_to_pdf": not downloaded_path.endswith(".pdf"),
         "keep_xml": "bz2" if self.config.compress == "bz2" else True,
     }
     reader = StreamingPDFReader()
     try:
         res = reader.convert(**convert_args)
     except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
         if isinstance(e, errors.ExternalCommandError):
             self.log.debug("%s: PDF file conversion failed: %s" %
                            (basefile, str(e).split("\n")[0]))
             # if PDF file conversion fails, it'll probably fail
             # again when we try OCR, but maybe there will
             # exist a cached intermediate file that allow us
             # to get data without even looking at the PDF file
             # again.
         elif isinstance(e, errors.PDFFileIsEmpty):
             self.log.debug("%s: PDF had no textcontent, trying OCR" %
                            basefile)
         res = reader.convert(ocr_lang="swe", **convert_args)

     # Stat the file once and reuse the size for both the check and
     # the error message.
     intermediate_size = os.path.getsize(intermediate_path)
     if intermediate_size > 20 * 1024 * 1024:
         raise errors.ParseError(
             "%s: %s (after conversion) is just too damn big (%s Mbytes)"
             % (basefile, intermediate_path,
                intermediate_size / (1024 * 1024)))
     return res
Exemple #3
0
 def parse(self, doc):
     """Fill in ``doc.meta`` and ``doc.body`` from the downloaded file.

     The body parser is chosen from the suffix of the downloaded
     path: ``.fmx4`` goes to ``parse_formex`` and ``.html`` to
     ``parse_html``. Afterwards ``parse_entry_update`` is called with
     the document.

     :param doc: document object; its ``basefile`` is read and its
                 ``meta`` and ``body`` attributes are assigned
     :returns: ``True`` to signal that everything is OK
     :raises errors.ParseError: if the downloaded path has neither
                                of the supported suffixes
     """
     doc.meta = self.metadata_from_basefile(doc)
     source = self.store.downloaded_path(doc.basefile)
     # maybe derive some metadata (type, year, number) from
     # basefile? It's probably not warranted to have a special
     # parse_metadata stage for these documents, we can extract
     # title, dates and other essential metadata from the body.
     body_parsers = ((".fmx4", "parse_formex"), (".html", "parse_html"))
     for suffix, method_name in body_parsers:
         if source.endswith(suffix):
             # Look the method up lazily so only the parser that is
             # actually needed has to exist.
             doc.body = getattr(self, method_name)(doc, source)
             break
     else:
         # No suffix matched.
         raise errors.ParseError("Can't yet parse %s" % source)
     self.parse_entry_update(doc)
     return True  # Signals that everything is OK
Exemple #4
0
    def parse(self, basefile):
        """Parse ``basefile`` using the first subrepo instance that succeeds.

        Unless ``config.force`` or ``config.parseforce`` is ``True``,
        the method returns ``True`` straight away when any subrepo
        reports that no parse is needed and this repo's own parsed
        file already exists.

        Otherwise each instance from ``get_preferred_instances`` is
        tried in order. An exception from an instance is logged at
        debug level and the next one is tried, unless
        ``config.failfast`` is set, in which case it is re-raised.

        :param basefile: identifier of the document to parse
        :returns: ``True`` when skipped, otherwise the truthy value
                  returned by the successful instance
        :raises errors.DocumentRenamedError: if the successful
            instance returned a basefile other than the requested one
        :raises errors.ParseError: if no instance could parse the
            basefile
        """
        # First see whether there is any work to do. This mainly
        # avoids the INFO log message below when nothing was done; it
        # does not noticeably affect performance.
        forced = (self.config.force is True
                  or self.config.parseforce is True)
        if not forced:
            for repo_cls in self.subrepos:
                subrepo = self.get_instance(repo_cls)
                if subrepo.store.needed(basefile, "parse"):
                    continue
                if os.path.exists(self.store.parsed_path(basefile)):
                    self.log.debug("%s: Skipped" % basefile)
                    return True  # signals everything OK

        started = time.time()
        outcome = False
        for inst in self.get_preferred_instances(basefile):
            try:
                outcome = inst.parse(basefile)
            except Exception as exc:
                # Any error (errors.ParseError or something else)
                # means we move on to the next subrepo -- unless we
                # want to fail fast with a nice stacktrace while
                # debugging.
                if self.config.failfast:
                    raise
                self.log.debug(
                    "%s: parse with %s failed: %s" %
                    (basefile, inst.qualified_class_name(), str(exc)))
                outcome = False
            if outcome:
                break

        if not outcome:
            # Only list those subrepos that actually had a chance of
            # parsing (basefile in self.store.basefiles[c]).
            candidates = ", ".join(
                self.get_instance(repo_cls).qualified_class_name()
                for repo_cls in self.subrepos
                if basefile in self.store.basefiles[repo_cls])
            if candidates:
                raise errors.ParseError(
                    "No instance of %s was able to parse %s" %
                    (candidates, basefile))
            raise errors.ParseError(
                "No available instance (out of %s) had basefile %s" %
                (len(self.subrepos), basefile))

        requested_basefile = basefile
        if outcome is not True and outcome != basefile:
            # Parse discovered that the basefile was adjusted. We
            # raise DocumentRenamedError at the very end so that
            # updateentry does the right thing.
            basefile = outcome
            # Also, touch the old parsed path so we don't
            # regenerate.
            with self.store.open_parsed(requested_basefile, "w"):
                pass

        self.copy_parsed(basefile, inst)
        self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)", {
            'basefile': basefile,
            'elapsed': time.time() - started
        })

        if basefile != requested_basefile:
            msg = "%s: In subrepo %s basefile turned out to really be %s" % (
                requested_basefile, inst.qualified_class_name(), basefile)
            raise errors.DocumentRenamedError(True, msg, requested_basefile,
                                              basefile)
        return outcome
Exemple #5
0
    def parse(self, basefile):
        """Parse ``basefile`` using the first eligible subrepo that succeeds.

        Unless ``config.force`` or ``config.parseforce`` is ``True``,
        the method returns ``True`` straight away when any subrepo's
        ``parseneeded`` is false and this repo's own parsed file
        already exists.

        Otherwise every subrepo is visited in order. A subrepo is
        tried when ``basefile`` is listed in
        ``self.store.basefiles`` for it or when its downloaded file
        exists. An exception from ``parse`` is logged at debug level
        and the next subrepo is tried, unless ``config.failfast`` is
        set, in which case it is re-raised.

        :param basefile: identifier of the document to parse
        :returns: ``True`` when skipped, otherwise the truthy value
                  returned by the successful subrepo
        :raises errors.ParseError: if no subrepo could parse the
            basefile
        """
        # First see whether there is any work to do. This mainly
        # avoids the INFO log message below when nothing was done; it
        # does not noticeably affect performance.
        forced = (self.config.force is True
                  or self.config.parseforce is True)
        if not forced:
            for repo_cls in self.subrepos:
                subrepo = self.get_instance(repo_cls)
                if subrepo.parseneeded(basefile):
                    continue
                if os.path.exists(self.store.parsed_path(basefile)):
                    self.log.debug("%s: Skipped" % basefile)
                    return True  # signals everything OK

        started = time.time()
        outcome = False
        for repo_cls in self.subrepos:
            inst = self.get_instance(repo_cls)
            if (basefile not in self.store.basefiles[repo_cls]
                    and not os.path.exists(
                        inst.store.downloaded_path(basefile))):
                # This subrepo has nothing to parse for the basefile.
                continue
            try:
                # each parse method should be smart about whether
                # to re-parse or not (i.e. use the @managedparsing
                # decorator).
                outcome = inst.parse(basefile)
            except Exception as exc:
                # Any error (errors.ParseError or something else)
                # means we move on to the next subrepo -- unless we
                # want to fail fast with a nice stacktrace while
                # debugging.
                if self.config.failfast:
                    raise
                self.log.debug(
                    "%s: parse with %s failed: %s" %
                    (basefile, inst.qualified_class_name(), str(exc)))
                outcome = False
            if outcome:
                break

        if not outcome:
            # Only list those subrepos that actually had a chance of
            # parsing (basefile in self.store.basefiles[c]).
            candidates = ", ".join(
                self.get_instance(repo_cls).qualified_class_name()
                for repo_cls in self.subrepos
                if basefile in self.store.basefiles[repo_cls])
            if candidates:
                raise errors.ParseError(
                    "No instance of %s was able to parse %s" %
                    (candidates, basefile))
            raise errors.ParseError(
                "No available instance (out of %s) had basefile %s" %
                (len(self.subrepos), basefile))

        if outcome is not True and outcome != basefile:
            # this is a signal that parse discovered
            # that the basefile was wrong
            basefile = outcome

        self.copy_parsed(basefile, inst)
        self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)", {
            'basefile': basefile,
            'elapsed': time.time() - started
        })
        return outcome