Example #1
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Convert the downloaded file for ``basefile`` with StreamingPDFReader.

     The reader works in the directory that holds the store's
     intermediate path for ``basefile``. If the first attempt raises
     ``PDFFileIsEmpty``, a warning is logged and the conversion is
     retried once with ``ocr_lang="swe"``.

     :param basefile: identifier handed to the store's path lookups
     :param attachment: optional attachment name forwarded to
                        ``self.store.downloaded_path``
     :returns: whatever ``StreamingPDFReader.convert`` returns
               (presumably an open file-like object -- confirm against
               the reader's documentation)
     """
     # force just the conversion part of the PDF handling
     source_path = self.store.downloaded_path(basefile,
                                              attachment=attachment)
     workdir = os.path.dirname(self.store.intermediate_path(basefile))
     # anything not already named *.pdf is flagged for PDF conversion
     needs_pdf_conversion = not source_path.endswith(".pdf")
     if self.config.compress == "bz2":
         keep_xml = "bz2"
     else:
         keep_xml = True
     reader = StreamingPDFReader()
     # arguments that are identical for the plain and the OCR attempt
     common = dict(filename=source_path,
                   workdir=workdir,
                   convert_to_pdf=needs_pdf_conversion,
                   keep_xml=keep_xml)
     try:
         return reader.convert(images=self.config.pdfimages,
                               ocr_lang=None,
                               **common)
     except PDFFileIsEmpty:
         self.log.warning("%s: %s was empty, attempting OCR" %
                          (basefile, source_path))
         # "swe" is a reasonable guess for the OCR language
         return reader.convert(images=self.config.pdfimages,
                               ocr_lang="swe",
                               **common)
Example #2
0
 def lazy_downloaded_to_intermediate(basefile):
     """Return something parseable for ``basefile``: HTML or converted PDF.

     ``self`` is not a parameter; it is picked up from the surrounding
     scope.

     If the store's ``index.pdf`` attachment does not exist on disk, the
     function returns, in order of preference:

     * the opened ``index.html`` attachment (the caller is responsible
       for closing it),
     * a ``StringIO`` over the text of the ``html`` element found under
       the root's ``dokument`` element in the main downloaded file, or
     * a ``StringIO`` holding a placeholder HTML document.

     Otherwise the PDF is run through ``StreamingPDFReader.convert``.
     If that raises ``errors.PDFFileIsEmpty`` or
     ``errors.ExternalCommandError``, the conversion is retried once
     with ``ocr_lang="swe"``.

     :param basefile: identifier handed to the store's path lookups
     :returns: an open file, a ``StringIO``, or the result of
               ``StreamingPDFReader.convert``
     :raises errors.ParseError: if the intermediate file is larger than
                                20 * 1024 * 1024 bytes
     """
     downloaded_path = self.store.downloaded_path(
         basefile, attachment="index.pdf")
     downloaded_path_html = self.store.downloaded_path(
         basefile, attachment="index.html")
     if not os.path.exists(downloaded_path):
         if os.path.exists(downloaded_path_html):
             # attempt to parse HTML instead
             return open(downloaded_path_html)
         # just grab the HTML from the XML file itself...
         tree = etree.parse(self.store.downloaded_path(basefile))
         html = tree.getroot().find("dokument").find("html")
         if html is not None:
             return StringIO(html.text)
         return StringIO(
             "<html><h1>Dokumenttext saknas</h1></html>")
     intermediate_path = self.store.intermediate_path(basefile)
     intermediate_dir = os.path.dirname(intermediate_path)
     convert_to_pdf = not downloaded_path.endswith(".pdf")
     keep_xml = "bz2" if self.config.compress == "bz2" else True
     reader = StreamingPDFReader()
     try:
         res = reader.convert(filename=downloaded_path,
                              workdir=intermediate_dir,
                              images=self.config.pdfimages,
                              convert_to_pdf=convert_to_pdf,
                              keep_xml=keep_xml)
     except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as e:
         if isinstance(e, errors.ExternalCommandError):
             self.log.debug("%s: PDF file conversion failed: %s" %
                            (basefile, str(e).split("\n")[0]))
             # if PDF file conversion fails, it'll probably fail
             # again when we try OCR, but maybe there will
             # exist a cached intermediate file that allows us
             # to get data without even looking at the PDF file
             # again.
         elif isinstance(e, errors.PDFFileIsEmpty):
             self.log.debug("%s: PDF had no textcontent, trying OCR" %
                            basefile)
         res = reader.convert(filename=downloaded_path,
                              workdir=intermediate_dir,
                              images=self.config.pdfimages,
                              convert_to_pdf=convert_to_pdf,
                              keep_xml=keep_xml,
                              ocr_lang="swe")
         # NOTE(review): the OCR pass appears to store its output under
         # a different file name than the plain conversion (e.g. a
         # .hocr.html.bz2 suffix instead of .xml.bz2), so the path
         # resolved before the retry may be stale. Ask the store again
         # so the size check below inspects the file that was written.
         intermediate_path = self.store.intermediate_path(basefile)
     # stat the file once and reuse the value for check and message
     intermediate_size = os.path.getsize(intermediate_path)
     if intermediate_size > 20 * 1024 * 1024:
         raise errors.ParseError(
             "%s: %s (after conversion) is just too damn big (%s Mbytes)"
             % (basefile, intermediate_path,
                intermediate_size / (1024 * 1024)))
     return res
Example #3
0
 def lazy_downloaded_to_intermediate(basefile):
     """Return something parseable for ``basefile``: HTML or converted PDF.

     ``self`` is not a parameter; it is picked up from the surrounding
     scope.

     When the ``index.pdf`` attachment is missing, fall back to the
     opened ``index.html`` attachment, then to the text of the ``html``
     element under the root's ``dokument`` element of the main
     downloaded file, and finally to a placeholder HTML document (the
     latter two wrapped in ``StringIO``).

     When the PDF exists it is converted with ``StreamingPDFReader``;
     on ``errors.PDFFileIsEmpty`` or ``errors.ExternalCommandError`` the
     conversion is retried once with ``ocr_lang="swe"``.

     :param basefile: identifier handed to the store's path lookups
     :returns: an open file, a ``StringIO``, or the result of
               ``StreamingPDFReader.convert``
     :raises errors.ParseError: if the intermediate file is larger than
                                20*1024*1024 bytes
     """
     pdf_path = self.store.downloaded_path(basefile, attachment="index.pdf")
     html_path = self.store.downloaded_path(basefile, attachment="index.html")

     if not os.path.exists(pdf_path):
         if os.path.exists(html_path):
             # attempt to parse HTML instead; caller closes the file
             return open(html_path)
         # just grab the HTML from the XML file itself...
         root = etree.parse(self.store.downloaded_path(basefile)).getroot()
         embedded_html = root.find("dokument").find("html")
         if embedded_html is None:
             return StringIO("<html><h1>Dokumenttext saknas</h1></html>")
         return StringIO(embedded_html.text)

     intermediate_path = self.store.intermediate_path(basefile)
     workdir = os.path.dirname(intermediate_path)
     needs_pdf_conversion = not pdf_path.endswith(".pdf")
     if self.config.compress == "bz2":
         keep_xml = "bz2"
     else:
         keep_xml = True
     reader = StreamingPDFReader()

     def run_conversion(**extra):
         # one place for the arguments shared by both attempts
         return reader.convert(filename=pdf_path,
                               workdir=workdir,
                               images=self.config.pdfimages,
                               convert_to_pdf=needs_pdf_conversion,
                               keep_xml=keep_xml,
                               **extra)

     try:
         res = run_conversion()
     except (errors.PDFFileIsEmpty, errors.ExternalCommandError) as exc:
         if isinstance(exc, errors.ExternalCommandError):
             # if PDF file conversion fails, it'll probably fail
             # again when we try OCR, but maybe there will
             # exist a cached intermediate file that allows us
             # to get data without even looking at the PDF file
             # again.
             first_line = str(exc).split("\n")[0]
             self.log.debug("%s: PDF file conversion failed: %s" % (basefile, first_line))
         else:
             # only PDFFileIsEmpty can reach this branch
             self.log.debug("%s: PDF had no textcontent, trying OCR" % basefile)
         res = run_conversion(ocr_lang="swe")
         # now the intermediate path ends with .hocr.html.bz2, not .xml.bz2
         intermediate_path = self.store.intermediate_path(basefile)

     size_limit = 20*1024*1024
     if os.path.getsize(intermediate_path) > size_limit:
         size_in_mb = os.path.getsize(intermediate_path) / (1024*1024)
         raise errors.ParseError("%s: %s (after conversion) is just too damn big (%s Mbytes)" %
                                 (basefile, intermediate_path, size_in_mb))
     return res
Example #4
0
 def convert_pdf(self, downloaded_path, intermediate_path):
     """Run ``StreamingPDFReader.convert`` on ``downloaded_path``.

     :param downloaded_path: file name handed to the reader
     :param intermediate_path: only its directory is used, as the
                               reader's working directory
     :returns: whatever ``StreamingPDFReader.convert`` returns
     """
     workdir = os.path.dirname(intermediate_path)
     if self.config.compress == "bz2":
         keep_xml = "bz2"
     else:
         keep_xml = True
     reader = StreamingPDFReader()
     options = dict(filename=downloaded_path,
                    workdir=workdir,
                    images=self.config.pdfimages,
                    keep_xml=keep_xml)
     # OCR language is only passed when OCR is enabled in the config
     if self.config.ocr:
         options.update(ocr_lang='swe')
     return reader.convert(**options)
Example #5
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Convert the downloaded file for ``basefile`` with StreamingPDFReader.

     :param basefile: identifier handed to the store's path lookups
     :param attachment: optional attachment name forwarded to
                        ``self.store.downloaded_path``
     :returns: whatever ``StreamingPDFReader.convert`` returns
     """
     # the reader works in the directory holding the intermediate file
     workdir = os.path.dirname(self.store.intermediate_path(basefile))
     if self.config.compress == "bz2":
         keep_xml = "bz2"
     else:
         keep_xml = True
     reader = StreamingPDFReader()
     source_path = self.store.downloaded_path(basefile, attachment=attachment)
     images = self.config.pdfimages
     if self.config.ocr:
         # OCR enabled in the config: additionally pass the OCR language
         return reader.convert(filename=source_path,
                               workdir=workdir,
                               images=images,
                               keep_xml=keep_xml,
                               ocr_lang='swe')
     return reader.convert(filename=source_path,
                           workdir=workdir,
                           images=images,
                           keep_xml=keep_xml)
Example #6
0
    def downloaded_to_intermediate(self, basefile, attachment=None):
        # force just the conversion part of the PDF handling
        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)
        intermediate_path = self.store.intermediate_path(basefile)
        intermediate_dir = os.path.dirname(intermediate_path)
        ocr_lang = None
        convert_to_pdf = not downloaded_path.endswith(".pdf")
        keep_xml = "bz2" if self.config.compress == "bz2" else True
        reader = StreamingPDFReader()
        try:
            return reader.convert(filename=downloaded_path,
                                  workdir=intermediate_dir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=convert_to_pdf,
                                  keep_xml=keep_xml,
                                  ocr_lang=ocr_lang,
                                  legacy_tesseract=self.config.legacytesseract)
        except PDFFileIsEmpty as e:
            if self.config.ocr:
                self.log.warning("%s: %s was empty, attempting OCR" %
                                 (basefile, downloaded_path))
                ocr_lang = "swe"  # reasonable guess
                return reader.convert(filename=downloaded_path,
                                      workdir=intermediate_dir,
                                      images=self.config.pdfimages,
                                      convert_to_pdf=convert_to_pdf,
                                      keep_xml=keep_xml,
                                      ocr_lang=ocr_lang)
            else:
                self.log.warning("%s: %s was empty, returning placeholder" %
                                 (basefile, downloaded_path))
                fp = BytesIO(b"""<pdf2xml>
                <page number="1" position="absolute" top="0" left="0" height="1029" width="701">
	        <fontspec id="0" size="12" family="TimesNewRomanPSMT" color="#000000"/>
                <text top="67" left="77" width="287" height="26" font="0">[Avg&#246;randetext saknas]</text>
                </page>
                </pdf2xml>""")
                fp.name = "dummy.xml"
                return fp
Example #7
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Convert the downloaded file for ``basefile`` with StreamingPDFReader.

     :param basefile: identifier handed to the store's path lookups
     :param attachment: optional attachment name forwarded to
                        ``self.store.downloaded_path``
     :returns: whatever ``StreamingPDFReader.convert`` returns
     """
     intermediate_dir = os.path.dirname(
         self.store.intermediate_path(basefile))
     if self.config.compress == "bz2":
         keep_xml = "bz2"
     else:
         keep_xml = True
     reader = StreamingPDFReader()
     source_path = self.store.downloaded_path(basefile,
                                              attachment=attachment)
     images = self.config.pdfimages
     # the OCR language is only supplied when OCR is enabled in the config
     ocr_options = {'ocr_lang': 'swe'} if self.config.ocr else {}
     return reader.convert(filename=source_path,
                           workdir=intermediate_dir,
                           images=images,
                           keep_xml=keep_xml,
                           **ocr_options)
Example #8
0
    def downloaded_to_intermediate(self, basefile, attachment=None):
        # force just the conversion part of the PDF handling
        downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
        intermediate_path = self.store.intermediate_path(basefile)
        intermediate_dir = os.path.dirname(intermediate_path)
        ocr_lang = None
        convert_to_pdf = not downloaded_path.endswith(".pdf")
        keep_xml = "bz2" if self.config.compress == "bz2" else True
        reader = StreamingPDFReader()
        try:
            return reader.convert(filename=downloaded_path,
                                  workdir=intermediate_dir,
                                  images=self.config.pdfimages,
                                  convert_to_pdf=convert_to_pdf,
                                  keep_xml=keep_xml,
                                  ocr_lang=ocr_lang)
        except PDFFileIsEmpty as e:
            if self.config.ocr:
                self.log.warning("%s: %s was empty, attempting OCR" % (basefile, downloaded_path))
                ocr_lang = "swe" # reasonable guess
                return reader.convert(filename=downloaded_path,
                                      workdir=intermediate_dir,
                                      images=self.config.pdfimages,
                                      convert_to_pdf=convert_to_pdf,
                                      keep_xml=keep_xml,
                                      ocr_lang=ocr_lang)
            else:
                self.log.warning("%s: %s was empty, returning placeholder" % (basefile, downloaded_path))
                fp = BytesIO(b"""<pdf2xml>
                <page number="1" position="absolute" top="0" left="0" height="1029" width="701">
	        <fontspec id="0" size="12" family="TimesNewRomanPSMT" color="#000000"/>
                <text top="67" left="77" width="287" height="26" font="0">[Avg&#246;randetext saknas]</text>
                </page>
                </pdf2xml>""")
                fp.name = "dummy.xml"
                return fp