Esempio n. 1
0
 def get_text(self):
     """Run OCR on this image and return the recognised text.

     The image is exported into the ``temp`` directory through
     ``ImageWriter`` and the exported file is passed to
     ``image_to_string`` (language ``fin``).  If that raises ``IOError``,
     an "L"-mode image is built directly from the raw stream data and OCR
     is run on that instead.

     The exported file is always deleted before returning, also when OCR
     fails with an unexpected exception.

     Returns:
         The text produced by ``image_to_string``, or an empty string when
         the image cannot be exported (``PDFNotImplementedError``).
     """
     image_writer = ImageWriter("temp")
     try:
         exported_name = image_writer.export_image(self._image_obj)
     except PDFNotImplementedError:
         # No filter method available for this stream
         # https://github.com/euske/pdfminer/issues/99
         return u""
     exported_path = "temp/" + exported_name
     try:
         try:
             text = image_to_string(Image.open(exported_path),
                                    lang="fin")
         except IOError:
             # PdfMiner did not return an image
             # Let's try to create one ourselves
             # TODO: Create proper color_mode values from ColorSpace
             # Most of the times "L" will create something good enough
             # for OCR, though
             # Keep the fallback image in its own variable: the exported
             # file name is still needed below to delete the file.
             fallback_image = Image.frombuffer("L",
                                               self._image_obj.srcsize,
                                               self._stream.get_data(),
                                               "raw", "L", 0, 1)
             text = image_to_string(fallback_image, lang="fin")
     finally:
         unlink(exported_path)
     return text
Esempio n. 2
0
def parse_img(img_obj, labels):
    """Run Tesseract hOCR on a PDF image object and collect its text labels.

    The image is exported into the current directory with ``ImageWriter``,
    Tesseract is invoked on the exported file (language ``nld``, config
    ``hocr``, output base ``out``) and, if ``./out.html`` exists afterwards,
    every entry yielded by ``get_bbox_texts`` is converted with
    ``convert_label_bbox`` and appended to ``labels``.

    Args:
        img_obj: Image object; its ``name`` and ``bbox`` attributes are read
            and it is passed to ``ImageWriter.export_image``.
        labels: List that the converted labels are appended to in place.

    Returns:
        None.  A ``PDFNotImplementedError`` and a missing ``./out.html`` are
        logged as errors instead of being raised; other exceptions
        propagate.
    """
    from pdfminer.image import ImageWriter
    from subprocess import call
    from pdfminer.pdftypes import PDFNotImplementedError
    from os import remove, rename, path

    hocr_path = "./out.html"

    try:
        logging.info("Writing %s", img_obj.name)
        image_writer = ImageWriter(".")
        # TODO this is not thread safe... (so run with "-p 1" for now)
        output_filename = image_writer.export_image(img_obj)
        # TODO rename(output_filename, ...)
        logging.info("Written %s", output_filename)
        logging.info("Calling Tesseract hOCR")
        call(["tesseract", output_filename, "out", "-l", "nld", "hocr"])
        # TODO remove(output_filename)

        if path.isfile(hocr_path):
            # TODO fix race condition
            from extracthocr import get_bbox_page, get_bbox_texts

            try:
                label_texts = list(get_bbox_texts(hocr_path))
                if label_texts:
                    # The page bbox is the same for every label, so it is
                    # read from the hOCR file only once.
                    page_bbox = get_bbox_page(hocr_path)
                    for label in label_texts:
                        labels.append(
                            convert_label_bbox(label, img_obj.bbox, page_bbox)
                        )
            finally:
                # Always drop the hOCR output, even when parsing fails, so
                # a later call cannot pick up this image's stale results.
                remove(hocr_path)
        else:
            logging.error(
                "Image object kon niet verwerkt worden door Tesseract: %s.",
                output_filename,
            )
    except PDFNotImplementedError as e:
        logging.error(
            "Image object kon niet verwerkt worden: %s"
            ". Mogelijk helpt het opslaan vam de PDF met ondersteuning voor Acrobat 4.x en later.",
            e,
        )
Esempio n. 3
0
def find_object_recursively(layout_obj):
    """Collect text boxes, images, lines and rectangles below a layout object.

    ``LTTextBox``, ``LTImage``, ``LTLine`` and ``LTRect`` instances are
    returned as a one-element list.  An ``LTImage`` is additionally exported
    through an ``ImageWriter`` created for ``output_image_dir``.  Any other
    ``LTContainer`` is iterated and its children are searched recursively.
    Everything else yields an empty list.

    Args:
        layout_obj: The layout object to inspect.

    Returns:
        A flat list of the matching layout objects, in traversal order.
    """
    # Objects inheriting from LTTextBox are returned as they are.
    if isinstance(layout_obj, LTTextBox):
        return [layout_obj]

    if isinstance(layout_obj, LTImage):
        ImageWriter(output_image_dir).export_image(layout_obj)
        # TODO: extract images other than JPEG as well (does not seem to be
        # possible with PDFMiner).  An earlier draft sketched writing
        # stream.get_data() straight to a file for FlateDecode streams,
        # naming the file from the image's name, bits and srcsize.
        return [layout_obj]

    # LTLine and LTRect both inherit from LTCurve, so testing for LTCurve
    # would also return both of them.
    if isinstance(layout_obj, (LTLine, LTRect)):
        return [layout_obj]

    # Objects inheriting from LTContainer (LTPage, LTFigure, ...) hold
    # child elements, so search them recursively.
    if isinstance(layout_obj, LTContainer):
        return [
            found
            for child in layout_obj
            for found in find_object_recursively(child)
        ]

    # Anything else contributes nothing.
    return []
Esempio n. 4
0
 def get_text(self):
     """Run OCR on this image and return the recognised text.

     The image is exported into the ``temp`` directory through
     ``ImageWriter`` and the exported file is passed to
     ``image_to_string`` (language ``swe``).  If that raises ``IOError``,
     an "L"-mode image is built directly from the raw stream data and OCR
     is run on that instead.

     The exported file is always deleted before returning, also when OCR
     fails with an unexpected exception.

     Returns:
         The text produced by ``image_to_string``, or an empty string when
         the image cannot be exported (``PDFNotImplementedError``).
     """
     image_writer = ImageWriter("temp")
     try:
         exported_name = image_writer.export_image(self._image_obj)
     except PDFNotImplementedError:
         # No filter method available for this stream
         # https://github.com/euske/pdfminer/issues/99
         return u""
     exported_path = "temp/" + exported_name
     try:
         try:
             text = image_to_string(Image.open(exported_path),
                                    lang="swe")
         except IOError:
             # PdfMiner did not return an image
             # Let's try to create one ourselves
             # TODO: Create proper color_mode values from ColorSpace
             # Most of the times "L" will create something good enough
             # for OCR, though
             # Keep the fallback image in its own variable: the exported
             # file name is still needed below to delete the file.
             fallback_image = Image.frombuffer("L", self._image_obj.srcsize,
                                               self._stream.get_data(),
                                               "raw", "L", 0, 1)
             text = image_to_string(fallback_image, lang="swe")
     finally:
         unlink(exported_path)
     return text
Esempio n. 5
0
    def get_html_tree(self) -> str:
        """Render the extracted document structure as an hOCR-style XML string.

        Builds a DOM with a ``<head>`` carrying three ``ocr-*`` meta tags and
        a ``<body>`` holding one ``ocr_page`` div per key of ``self.elems``.
        For each page the boxes listed in ``self.tree`` are sorted with
        ``column_order`` and emitted as tables (``get_html_table``), figures
        (with their JPEG/BMP images embedded as base64 data URIs) or other
        elements (``get_html_others``).

        The created document is also stored on ``self.doc``.  Images are
        exported into a temporary directory that is removed again before
        this method returns.

        Returns:
            The document serialised with ``toprettyxml()``.
        """
        doc = Document()
        self.doc = doc
        html = doc.createElement("html")
        doc.appendChild(html)
        head = doc.createElement("head")
        html.appendChild(head)
        # meta
        head_metadata = (
            ("ocr-system",
             f"Converted from PDF by pdftotree {__version__}"),
            ("ocr-capabilities",
             "ocr_page ocr_table ocrx_block ocrx_line ocrx_word"),
            ("ocr-number-of-pages", f"{len(self.elems)}"),
        )
        for meta_name, meta_content in head_metadata:
            meta = doc.createElement("meta")
            head.appendChild(meta)
            meta.setAttribute("name", meta_name)
            meta.setAttribute("content", meta_content)
        # body
        body = doc.createElement("body")
        html.appendChild(body)
        # Images are saved to a temp folder only so that they can be read
        # back and embedded; the context manager deletes the folder (and the
        # exported files) afterwards, also when an exception is raised.
        with tempfile.TemporaryDirectory() as dirname:
            imagewriter = ImageWriter(dirname)
            for page_num in self.elems:  # 1-based
                boxes: List[Tuple[str, float, float, float, float]] = []
                for clust in self.tree[page_num]:
                    for (_pnum, _pwidth, _pheight, top, left, bottom,
                         right) in self.tree[page_num][clust]:
                        boxes.append((clust.lower().replace(" ", "_"), top,
                                      left, bottom, right))
                page = doc.createElement("div")
                page.setAttribute("class", "ocr_page")
                page.setAttribute("id", f"page_{page_num}")
                width = int(self.elems[page_num].layout.width)
                height = int(self.elems[page_num].layout.height)
                page.setAttribute(
                    "title",
                    f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
                )
                body.appendChild(page)
                # TODO: We need to detect columns and sort accordingly.
                boxes.sort(key=cmp_to_key(column_order))

                for box in boxes:
                    if box[0] == "table":
                        table = box[1:]  # bbox
                        table_element = self.get_html_table(table, page_num)
                        page.appendChild(table_element)
                    elif box[0] == "figure":
                        elems: List[LTTextLine] = get_mentions_within_bbox(
                            box, self.elems[page_num].figures)
                        fig_element = doc.createElement("figure")
                        page.appendChild(fig_element)
                        top, left, bottom, right = [int(i) for i in box[1:]]
                        fig_element.setAttribute(
                            "title", f"bbox {left} {top} {right} {bottom}")
                        images = [
                            img for elem in elems for img in elem
                            if isinstance(img, LTImage)
                        ]
                        for img in images:
                            filename = imagewriter.export_image(img)
                            # Decide on the media type first so that files
                            # which are skipped anyway are not read.
                            if filename.endswith("jpg"):
                                mediatype = "jpeg"
                            elif filename.endswith("bmp"):
                                mediatype = "bmp"
                            else:
                                logger.info(
                                    "Skipping an unknown type image: "
                                    f"{filename}.")
                                continue
                            image_path = os.path.join(dirname, filename)
                            with open(image_path, "rb") as f:
                                encoded = b64encode(f.read()).decode("ascii")
                            logger.info(
                                f"Embedding a known type image: {filename}.")
                            img_element = doc.createElement("img")
                            fig_element.appendChild(img_element)
                            img_element.setAttribute("title",
                                                     bbox2str(img.bbox))
                            img_element.setAttribute(
                                "src",
                                f"data:image/{mediatype};base64,{encoded}")
                    else:
                        element = self.get_html_others(box[0], box[1:],
                                                       page_num)
                        page.appendChild(element)
        return doc.toprettyxml()