def get_text(self):
    """Run OCR on this image and return the recognised text.

    The image is first exported into the ``temp`` directory through
    pdfminer's ``ImageWriter`` and handed to ``image_to_string`` with the
    ``fin`` language. If the exported file cannot be opened as an image
    (``IOError``), a grayscale ("L" mode) image is built directly from the
    raw stream data and OCR is retried on that.

    Returns:
        The text produced by ``image_to_string``, or ``u""`` when pdfminer
        raises ``PDFNotImplementedError`` while exporting the image.
    """
    image_writer = ImageWriter("temp")
    try:
        temp_image = image_writer.export_image(self._image_obj)
    except PDFNotImplementedError:
        # No filter method available for this stream
        # https://github.com/euske/pdfminer/issues/99
        return u""

    # Keep the on-disk path in its own variable: the fallback below creates
    # an in-memory image object, and the path is still needed for cleanup.
    temp_path = "temp/" + temp_image
    try:
        try:
            text = image_to_string(Image.open(temp_path), lang="fin")
        except IOError:
            # PdfMiner did not return an image
            # Let's try to create one ourselves
            # TODO: Create proper color_mode values from ColorSpace
            # Most of the times "L" will create something good enough
            # for OCR, though
            fallback_image = Image.frombuffer(
                "L",
                self._image_obj.srcsize,
                self._stream.get_data(),
                "raw",
                "L",
                0,
                1,
            )
            text = image_to_string(fallback_image, lang="fin")
    finally:
        # Always remove the exported file, including when OCR itself fails.
        unlink(temp_path)
    return text
def parse_img(img_obj, labels):
    """Export a PDF image object, OCR it with Tesseract and collect labels.

    The image is written to the current directory with pdfminer's
    ``ImageWriter``, then the ``tesseract`` executable is invoked to write
    hOCR output under the base name ``out``. When ``./out.html`` exists
    afterwards, every entry yielded by ``get_bbox_texts`` is converted with
    ``convert_label_bbox`` and appended to ``labels``.

    Args:
        img_obj: pdfminer image layout object; its ``name`` and ``bbox``
            attributes are read here.
        labels: list that is extended in place with the converted labels.

    Returns:
        None. Failures are reported through ``logging.error`` only.
    """
    from subprocess import call
    from os import remove, path

    from pdfminer.image import ImageWriter
    from pdfminer.pdftypes import PDFNotImplementedError

    hocr_path = "./out.html"
    try:
        logging.info("Writing " + img_obj.name)
        # TODO this is not thread safe... (so run with "-p 1" for now)
        image_writer = ImageWriter(".")
        output_filename = image_writer.export_image(img_obj)
        # TODO rename(output_filename, ...)
        logging.info("Written " + output_filename)

        logging.info("Calling Tesseract hOCR")
        call(["tesseract", output_filename, "out", "-l", "nld", "hocr"])
        # TODO remove(output_filename)

        if path.isfile(hocr_path):  # TODO fix race condition
            from extracthocr import get_bbox_page, get_bbox_texts

            try:
                texts = list(get_bbox_texts(hocr_path))
                if texts:
                    # The page box is the same for every label of this
                    # file, so look it up once instead of once per label.
                    page_bbox = get_bbox_page(hocr_path)
                    for label in texts:
                        labels.append(
                            convert_label_bbox(label, img_obj.bbox, page_bbox)
                        )
            finally:
                # Remove the hOCR file even when parsing fails; a stale
                # file would otherwise be mistaken for the output of a
                # later image whose Tesseract run produced nothing.
                remove(hocr_path)
        else:
            logging.error(
                "Image object kon niet verwerkt worden door Tesseract: "
                + output_filename
                + "."
            )
    except PDFNotImplementedError as e:
        logging.error(
            "Image object kon niet verwerkt worden: "
            + str(e)
            + ". Mogelijk helpt het opslaan vam de PDF met ondersteuning voor Acrobat 4.x en later."
        )
def find_object_recursively(layout_obj):
    """Collect text boxes, images, lines and rectangles below a layout object.

    Returns a flat list. An object that is itself an ``LTTextBox``,
    ``LTImage``, ``LTLine`` or ``LTRect`` is returned as a one-element list;
    an ``LTContainer`` is walked recursively; anything else yields ``[]``.

    Side effect: every ``LTImage`` encountered is exported into
    ``output_image_dir`` through ``ImageWriter.export_image``.
    """
    # Objects deriving from LTTextBox are returned as a one-element list.
    if isinstance(layout_obj, LTTextBox):
        return [layout_obj]

    if isinstance(layout_obj, LTImage):
        ImageWriter(output_image_dir).export_image(layout_obj)
        # TODO: also extract images other than JPEG (does not seem possible
        # with PDFMiner). A sketched approach was to inspect the stream's
        # filters and, for Flate-encoded data, write stream.get_data() to a
        # file named after the image's name, bit depth and source size.
        return [layout_obj]

    # LTLine and LTRect both derive from LTCurve, so a single LTCurve check
    # would also catch both of them.
    if isinstance(layout_obj, (LTLine, LTRect)):
        return [layout_obj]

    # Objects deriving from LTContainer (LTPage, LTFigure, ...) hold child
    # elements, so search them recursively.
    if isinstance(layout_obj, LTContainer):
        return [
            found
            for child in layout_obj
            for found in find_object_recursively(child)
        ]

    # Anything else contributes nothing.
    return []
def get_text(self):
    """Run OCR on this image and return the recognised text.

    The image is first exported into the ``temp`` directory through
    pdfminer's ``ImageWriter`` and handed to ``image_to_string`` with the
    ``swe`` language. If the exported file cannot be opened as an image
    (``IOError``), a grayscale ("L" mode) image is built directly from the
    raw stream data and OCR is retried on that.

    Returns:
        The text produced by ``image_to_string``, or ``u""`` when pdfminer
        raises ``PDFNotImplementedError`` while exporting the image.
    """
    image_writer = ImageWriter("temp")
    try:
        temp_image = image_writer.export_image(self._image_obj)
    except PDFNotImplementedError:
        # No filter method available for this stream
        # https://github.com/euske/pdfminer/issues/99
        return u""

    # Keep the on-disk path in its own variable: the fallback below creates
    # an in-memory image object, and the path is still needed for cleanup.
    temp_path = "temp/" + temp_image
    try:
        try:
            text = image_to_string(Image.open(temp_path), lang="swe")
        except IOError:
            # PdfMiner did not return an image
            # Let's try to create one ourselves
            # TODO: Create proper color_mode values from ColorSpace
            # Most of the times "L" will create something good enough
            # for OCR, though
            fallback_image = Image.frombuffer(
                "L",
                self._image_obj.srcsize,
                self._stream.get_data(),
                "raw",
                "L",
                0,
                1,
            )
            text = image_to_string(fallback_image, lang="swe")
    finally:
        # Always remove the exported file, including when OCR itself fails.
        unlink(temp_path)
    return text
def get_html_tree(self) -> str:
    """Render the extracted page layout as an hOCR-style HTML document.

    Builds an ``<html>`` document with three ``ocr-*`` meta tags in the
    head and one ``div.ocr_page`` per key of ``self.elems`` in the body.
    For each page, the boxes found in ``self.tree[page_num]`` are sorted
    with ``column_order`` and rendered as a table, a figure (with embedded
    base64 images), or a generic element via ``get_html_others``.

    Side effect: the created document is also stored on ``self.doc``.

    Returns:
        The document serialised with ``toprettyxml()``.
    """
    doc = Document()
    self.doc = doc
    html = doc.createElement("html")
    doc.appendChild(html)
    head = doc.createElement("head")
    html.appendChild(head)

    # meta
    meta_tags = (
        ("ocr-system", f"Converted from PDF by pdftotree {__version__}"),
        ("ocr-capabilities", "ocr_page ocr_table ocrx_block ocrx_line ocrx_word"),
        ("ocr-number-of-pages", f"{len(self.elems.keys())}"),
    )
    for meta_name, meta_content in meta_tags:
        meta = doc.createElement("meta")
        head.appendChild(meta)
        meta.setAttribute("name", meta_name)
        meta.setAttribute("content", meta_content)

    # body
    body = doc.createElement("body")
    html.appendChild(body)

    # Images are exported to disk only so they can be read back and embedded
    # as data URIs; the context manager removes the folder afterwards so it
    # is not leaked on every call.
    with tempfile.TemporaryDirectory() as dirname:
        imagewriter = ImageWriter(dirname)
        for page_num in self.elems.keys():  # 1-based
            boxes: List[Tuple[str, float, float, float, float]] = []
            for clust in self.tree[page_num]:
                box_kind = clust.lower().replace(" ", "_")
                # Only the four coordinates of each entry are needed here.
                for (_, _, _, top, left, bottom, right) in self.tree[page_num][clust]:
                    boxes.append((box_kind, top, left, bottom, right))

            page = doc.createElement("div")
            page.setAttribute("class", "ocr_page")
            page.setAttribute("id", f"page_{page_num}")
            width = int(self.elems[page_num].layout.width)
            height = int(self.elems[page_num].layout.height)
            page.setAttribute(
                "title",
                f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
            )
            body.appendChild(page)

            # TODO: We need to detect columns and sort accordingly.
            boxes.sort(key=cmp_to_key(column_order))

            for box in boxes:
                if box[0] == "table":
                    table = box[1:]  # bbox
                    table_element = self.get_html_table(table, page_num)
                    page.appendChild(table_element)
                elif box[0] == "figure":
                    elems: List[LTTextLine] = get_mentions_within_bbox(
                        box, self.elems[page_num].figures
                    )
                    fig_element = doc.createElement("figure")
                    page.appendChild(fig_element)
                    top, left, bottom, right = [int(i) for i in box[1:]]
                    fig_element.setAttribute(
                        "title", f"bbox {left} {top} {right} {bottom}"
                    )
                    for elem in elems:
                        for img in elem:
                            if not isinstance(img, LTImage):
                                continue
                            filename = imagewriter.export_image(img)
                            # Decide on the media type first so unsupported
                            # files are not read and encoded for nothing.
                            if filename.endswith("jpg"):
                                mediatype = "jpeg"
                            elif filename.endswith("bmp"):
                                mediatype = "bmp"
                            else:
                                logger.info(
                                    f"Skipping an unknown type image: {filename}."
                                )
                                continue
                            with open(os.path.join(dirname, filename), "rb") as f:
                                encoded = b64encode(f.read()).decode("ascii")
                            logger.info(
                                f"Embedding a known type image: {filename}."
                            )
                            img_element = doc.createElement("img")
                            fig_element.appendChild(img_element)
                            img_element.setAttribute("title", bbox2str(img.bbox))
                            img_element.setAttribute(
                                "src", f"data:image/{mediatype};base64,{encoded}"
                            )
                else:
                    element = self.get_html_others(box[0], box[1:], page_num)
                    page.appendChild(element)

    return doc.toprettyxml()