def _convert_page(layout, path):
    """Return the combined text of a single page layout.

    Gathers ``get_text()`` from every object that ``_find_objects`` yields
    for the ``LTTextBox`` / ``LTTextLine`` types within ``layout._objs`` and
    hands the resulting list to ``text_fragments``.

    Args:
        layout: Page layout object exposing an ``_objs`` collection.
        path: Accepted for the caller's benefit; not used by this body.

    Returns:
        Whatever ``text_fragments`` produces for the collected strings.
    """
    # NOTE: this spot previously carried a commented-out OCR fallback for
    # pages with fewer than 2 characters of text that contain an LTImage.
    # It is disabled, so no OCR is attempted here.
    fragments = [
        obj.get_text()
        for obj in _find_objects(layout._objs, (LTTextBox, LTTextLine))
    ]
    return text_fragments(fragments)
def _convert_page(layout, languages):
    """Return the text of a page layout, consulting its images if text is scarce.

    First collects ``get_text()`` from every ``LTTextBox`` / ``LTTextLine``
    object found in ``layout._objs`` and joins them with ``text_fragments``.
    If that result is longer than 3 characters it is returned directly.

    Otherwise each ``LTImage`` found in the layout is examined: images
    narrower than ``OCR_MIN_WIDTH`` or shorter than ``OCR_MIN_HEIGHT`` are
    skipped, and the raw stream data of the rest is passed to
    ``extract_image_data``. The returned value is appended after the
    already-collected text fragments, and the whole list is joined again.

    Args:
        layout: Page layout object exposing an ``_objs`` collection.
        languages: Forwarded as the ``languages`` keyword argument of
            ``extract_image_data``.

    Returns:
        Whatever ``text_fragments`` produces for the collected strings.
    """
    fragments = [
        obj.get_text()
        for obj in _find_objects(layout._objs, (LTTextBox, LTTextLine))
    ]
    page_text = text_fragments(fragments)

    # TODO: invent a smarter way to decide whether to do OCR.
    if len(page_text) > 3:
        return page_text

    for image in _find_objects(layout._objs, LTImage):
        # Best effort per image: any failure is logged at debug level and the
        # loop moves on to the next image.
        try:
            too_small = (
                image.width < OCR_MIN_WIDTH or image.height < OCR_MIN_HEIGHT
            )
            if not too_small:
                raw = image.stream.get_rawdata()
                fragments.append(
                    extract_image_data(raw, languages=languages)
                )
        except Exception as ex:
            log.debug(ex)

    return text_fragments(fragments)