def parse(self, basefile):
    if basefile == "1":
        util.writefile(self.store.parsed_path("1"),
                       "basefile 1, parsed by b")
        util.writefile(
            self.store.parsed_path("1", attachment="attach.txt"),
            "attachment for basefile 1, parsed by b")
        util.writefile(self.store.distilled_path("1"),
                       "basefile 1, metadata from b")
        return True
    else:
        raise errors.ParseError("No can do!")
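
# A hedged sketch of how the stub above might behave when driven from a
# test. "SubrepoB" is a hypothetical name for the class defining it (the
# "parsed by b" strings suggest a subrepo fixture, but the source does not
# name the class):
#
#     repo = SubrepoB()
#     repo.parse("1")   # returns True; writes the parsed file, the
#                       # attach.txt attachment and the distilled metadata
#     repo.parse("2")   # raises errors.ParseError("No can do!")
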
def lazy_downloaded_to_intermediate(basefile):
    # Note: defined as a closure inside a method; ``self`` is captured
    # from the enclosing scope.
    downloaded_path = self.store.downloaded_path(
        basefile, attachment="index.pdf")
    downloaded_path_html = self.store.downloaded_path(
        basefile, attachment="index.html")
    if not os.path.exists(downloaded_path):
        if os.path.exists(downloaded_path_html):
            # attempt to parse HTML instead
            return open(downloaded_path_html)
        else:
            # just grab the HTML from the XML file itself...
            tree = etree.parse(self.store.downloaded_path(basefile))
            html = tree.getroot().find("dokument").find("html")
            if html is not None:
                return StringIO(html.text)
            else:
                # "Dokumenttext saknas" = "document text is missing"
                return StringIO(
                    "<html><h1>Dokumenttext saknas</h1></html>")
    intermediate_path = self.store.intermediate_path(basefile)
    intermediate_dir = os.path.dirname(intermediate_path)
    convert_to_pdf = not downloaded_path.endswith(".pdf")
    keep_xml = "bz2" if self.config.compress == "bz2" else True
    reader = StreamingPDFReader()
    try:
        res = reader.convert(filename=downloaded_path,
                             workdir=intermediate_dir,
                             images=self.config.pdfimages,
                             convert_to_pdf=convert_to_pdf,
                             keep_xml=keep_xml)
    except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
        if isinstance(e, errors.ExternalCommandError):
            self.log.debug("%s: PDF file conversion failed: %s" %
                           (basefile, str(e).split("\n")[0]))
            # If PDF file conversion fails, it'll probably fail again
            # when we try OCR, but maybe there will exist a cached
            # intermediate file that allows us to get data without
            # even looking at the PDF file again.
        elif isinstance(e, errors.PDFFileIsEmpty):
            self.log.debug("%s: PDF had no text content, trying OCR" %
                           basefile)
        res = reader.convert(filename=downloaded_path,
                             workdir=intermediate_dir,
                             images=self.config.pdfimages,
                             convert_to_pdf=convert_to_pdf,
                             keep_xml=keep_xml,
                             ocr_lang="swe")
    if os.path.getsize(intermediate_path) > 20 * 1024 * 1024:
        raise errors.ParseError(
            "%s: %s (after conversion) is just too damn big (%s Mbytes)" %
            (basefile, intermediate_path,
             os.path.getsize(intermediate_path) / (1024 * 1024)))
    return res
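
# Hedged sketch of the contract above, assuming (the "lazy_" prefix and
# closure form suggest it) that this function is handed to the parsing
# machinery as a lazily-evaluated intermediate provider: every branch
# returns something file-like, whether that is the fallback HTML file,
# HTML embedded in the downloaded XML, or the converter's result. The
# basefile below is made up:
#
#     fp = lazy_downloaded_to_intermediate("2014:15")
#     try:
#         head = fp.read(512)   # file-like in every branch
#     finally:
#         fp.close()
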
def parse(self, doc):
    doc.meta = self.metadata_from_basefile(doc)
    source = self.store.downloaded_path(doc.basefile)
    # Maybe derive some metadata (type, year, number) from the
    # basefile? It's probably not warranted to have a special
    # parse_metadata stage for these documents; we can extract title,
    # dates and other essential metadata from the body.
    if source.endswith(".fmx4"):
        doc.body = self.parse_formex(doc, source)
    elif source.endswith(".html"):
        doc.body = self.parse_html(doc, source)
    else:
        raise errors.ParseError("Can't yet parse %s" % source)
    self.parse_entry_update(doc)
    return True  # signals that everything is OK
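
# Hedged illustration of the dispatch above: the body parser is selected
# purely from the downloaded file's extension. The example paths are made
# up; only the extension matters:
#
#     downloaded/32012L0019.fmx4 -> self.parse_formex(doc, source)
#     downloaded/32012L0019.html -> self.parse_html(doc, source)
#     any other extension        -> errors.ParseError
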
def parse(self, basefile):
    # First, check whether we really need to parse. If any subrepo
    # reports that .store.needed(...., "parse") is false and we have a
    # parsed file in the main repo, then we're done. This is mainly to
    # avoid the log message below (to be in line with expected repo
    # behaviour of not logging anything at severity INFO if no real
    # work was done); it does not noticeably affect performance.
    force = (self.config.force is True or
             self.config.parseforce is True)
    if not force:
        for c in self.subrepos:
            inst = self.get_instance(c)
            needed = inst.store.needed(basefile, "parse")
            if not needed and os.path.exists(
                    self.store.parsed_path(basefile)):
                self.log.debug("%s: Skipped" % basefile)
                return True  # signals everything OK
    start = time.time()
    ret = False
    for inst in self.get_preferred_instances(basefile):
        try:
            ret = inst.parse(basefile)
        # Any error thrown (errors.ParseError or something else) means
        # we try the next subrepo -- unless we want to fail fast with a
        # nice stacktrace during debugging.
        except Exception as e:
            if self.config.failfast:
                raise
            else:
                self.log.debug("%s: parse with %s failed: %s" %
                               (basefile,
                                inst.qualified_class_name(),
                                str(e)))
                ret = False
        if ret:
            break
    if ret:
        oldbasefile = basefile
        if ret is not True and ret != basefile:
            # This is a signal that parse discovered that the basefile
            # was adjusted. We should raise DocumentRenamedError at the
            # very end to get updateentry to do the right thing.
            basefile = ret
            # Also, touch the old parsed path so we don't regenerate.
            with self.store.open_parsed(oldbasefile, "w"):
                pass
        self.copy_parsed(basefile, inst)
        self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
                      {'basefile': basefile,
                       'elapsed': time.time() - start})
        if basefile != oldbasefile:
            msg = ("%s: In subrepo %s basefile turned out to really be %s" %
                   (oldbasefile, inst.qualified_class_name(), basefile))
            raise errors.DocumentRenamedError(True, msg, oldbasefile,
                                              basefile)
        return ret
    else:
        # subrepos should only contain those repos that actually had a
        # chance of parsing (basefile in self.store.basefiles[c])
        subrepos_lbl = ", ".join(
            [self.get_instance(x).qualified_class_name()
             for x in self.subrepos
             if basefile in self.store.basefiles[x]])
        if subrepos_lbl:
            raise errors.ParseError(
                "No instance of %s was able to parse %s" %
                (subrepos_lbl, basefile))
        else:
            raise errors.ParseError(
                "No available instance (out of %s) had basefile %s" %
                (len(self.subrepos), basefile))
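
# Hedged sketch of the rename protocol the composite parse above relies
# on: a subrepo's parse may return a *different* basefile string instead
# of True, signalling that the document's real identity differs from what
# was asked for. SubrepoExample and its resolve_basefile helper are
# hypothetical stand-ins, not from the source:
#
#     class SubrepoExample(DocumentRepository):
#         def parse(self, basefile):
#             canonical = self.resolve_basefile(basefile)  # hypothetical
#             ...                                          # do the real work
#             return True if canonical == basefile else canonical
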
def parse(self, basefile):
    # First, check whether we really need to parse. If any subrepo
    # reports that parseneeded is false and we have a parsed file in
    # the main repo, then we're done. This is mainly to avoid the log
    # message below (to be in line with expected repo behaviour of not
    # logging anything at severity INFO if no real work was done); it
    # does not noticeably affect performance.
    force = (self.config.force is True or
             self.config.parseforce is True)
    if not force:
        for c in self.subrepos:
            inst = self.get_instance(c)
            needed = inst.parseneeded(basefile)
            if not needed and os.path.exists(
                    self.store.parsed_path(basefile)):
                self.log.debug("%s: Skipped" % basefile)
                return True  # signals everything OK
    start = time.time()
    ret = False
    for c in self.subrepos:
        inst = self.get_instance(c)
        if (basefile in self.store.basefiles[c] or
                os.path.exists(inst.store.downloaded_path(basefile))):
            try:
                # Each parse method should be smart about whether to
                # re-parse or not (i.e. use the @managedparsing
                # decorator).
                ret = inst.parse(basefile)
            # Any error thrown (errors.ParseError or something else)
            # means we try the next subrepo -- unless we want to fail
            # fast with a nice stacktrace during debugging.
            except Exception as e:
                if self.config.failfast:
                    raise
                else:
                    self.log.debug("%s: parse with %s failed: %s" %
                                   (basefile,
                                    inst.qualified_class_name(),
                                    str(e)))
                    ret = False
            if ret:
                break
    if ret:
        if ret is not True and ret != basefile:
            # This is a signal that parse discovered that the basefile
            # was wrong.
            basefile = ret
        self.copy_parsed(basefile, inst)
        self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
                      {'basefile': basefile,
                       'elapsed': time.time() - start})
        return ret
    else:
        # subrepos should only contain those repos that actually had a
        # chance of parsing (basefile in self.store.basefiles[c])
        subrepos_lbl = ", ".join(
            [self.get_instance(x).qualified_class_name()
             for x in self.subrepos
             if basefile in self.store.basefiles[x]])
        if subrepos_lbl:
            raise errors.ParseError(
                "No instance of %s was able to parse %s" %
                (subrepos_lbl, basefile))
        else:
            raise errors.ParseError(
                "No available instance (out of %s) had basefile %s" %
                (len(self.subrepos), basefile))
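
# Hedged usage sketch for the composite parse above; the repo class,
# basefile and config values are stand-ins, not from the source:
#
#     repo = MyCompositeRepository()
#     repo.config.failfast = False   # swallow subrepo errors, try the next one
#     try:
#         repo.parse("2014:15")
#     except errors.ParseError as e:
#         # no subrepo had the basefile, or none managed to parse it
#         repo.log.warning(str(e))
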