def _perform_ocr(self, in_paths, out_dir, language):
    """ Run tesseract over each input image, emitting progress signals as
    the worker processes finish.

    :param in_paths:    Input images
    :type in_paths:     list of :py:class:`pathlib.Path`
    :param out_dir:     Output directory for hOCR files
    :type out_dir:      :py:class:`pathlib.Path`
    :param language:    Language to use for OCRing, must be among the
                        tesseract languages installed on the system.
    :type language:     unicode
    """
    processes = []

    def _clean_processes():
        """ Remove completed processes and emit an :py:attr:`on_progressed`
        signal for each of them.
        """
        for p in processes[:]:
            if p.poll() is not None:
                processes.remove(p)
                _clean_processes.num_cleaned += 1
                self.on_progressed.send(
                    self,
                    progress=float(_clean_processes.num_cleaned)/len(in_paths))
    _clean_processes.num_cleaned = 0

    # Run as many simultaneous Tesseract instances as there are CPU cores
    max_procs = multiprocessing.cpu_count()
    devnull = open(os.devnull, 'w')
    try:
        for fpath in in_paths:
            # Wait until another process has finished
            while len(processes) >= max_procs:
                _clean_processes()
                time.sleep(0.01)
            cmd = [BIN, unicode(fpath), unicode(out_dir / fpath.stem),
                   "-l", language, "hocr"]
            logger.debug(cmd)
            proc = util.get_subprocess(cmd, stderr=devnull, stdout=devnull)
            processes.append(proc)
        # Wait for remaining processes to finish; sleep between polls so we
        # don't busy-spin at 100% CPU (the original loop had no sleep).
        while processes:
            _clean_processes()
            time.sleep(0.01)
    finally:
        # BUG FIX: the devnull handle was previously never closed.
        devnull.close()
def _generate_output(self, projectfile, out_dir, num_pages):
    """ Run last step for the project file and keep track of the progress
        by emitting :py:attr:`on_progressed` signals.

    :param projectfile:     Path ScanTailor configuration file
    :type projectfile:      :py:class:`pathlib.Path`
    :param out_dir:         Output directory for processed files
    :type out_dir:          :py:class:`pathlib.Path`
    :param num_pages:       Total number of pages to process
    :type num_pages:        int
    """
    logger.debug("Generating output...")
    temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
    try:
        split_config = self._split_configuration(projectfile, temp_dir)
        logger.debug("Launching those subprocesses!")
        processes = [util.get_subprocess([CLI_BIN, '--start-filter=6',
                                          unicode(cfgfile),
                                          unicode(out_dir)])
                     for cfgfile in split_config]
        last_count = 0
        while processes:
            # Progress is inferred from the number of output files written
            # so far; the second half of the progress range [0.5, 1.0] is
            # reserved for this output step.
            recent_count = sum(1 for x in out_dir.glob('*.tif'))
            if recent_count > last_count:
                progress = 0.5 + (float(recent_count)/num_pages)/2
                self.on_progressed.send(self, progress=progress)
                last_count = recent_count
            for p in processes[:]:
                if p.poll() is not None:
                    processes.remove(p)
            time.sleep(.01)
    finally:
        # BUG FIX: previously the temporary directory leaked whenever one
        # of the calls above raised.
        shutil.rmtree(unicode(temp_dir))
def __init__(self, config):
    """ Detect whether the installed ScanTailor is the 'enhanced' fork by
    inspecting its CLI usage message.
    """
    super(ScanTailorPlugin, self).__init__(config)
    help_out = util.get_subprocess(
        [CLI_BIN], stdout=subprocess.PIPE).communicate()[0]
    help_lines = help_out.splitlines()
    # The enhanced fork advertises an `<images|directory|->` argument on
    # the eighth line of its usage output. Guard the index so that an
    # unexpectedly short help text does not raise an IndexError.
    self._enhanced = (
        len(help_lines) > 7 and
        bool(re.match(r".*<images\|directory\|->.*", help_lines[7])))
def _generate_output(self, projectfile, out_dir, num_pages):
    """ Run last step for the project file and keep track of the progress
        by emitting :py:attr:`on_progressed` signals.

    :param projectfile:     Path ScanTailor configuration file
    :type projectfile:      :py:class:`pathlib.Path`
    :param out_dir:         Output directory for processed files
    :type out_dir:          :py:class:`pathlib.Path`
    :param num_pages:       Total number of pages to process
    :type num_pages:        int
    """
    logger.debug("Generating output...")
    temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
    split_config = self._split_configuration(projectfile, temp_dir)
    logger.debug("Launching those subprocesses!")
    workers = []
    for cfgfile in split_config:
        workers.append(util.get_subprocess(
            [CLI_BIN, '--start-filter=6', unicode(cfgfile),
             unicode(out_dir)]))
    seen_count = 0
    while workers:
        # Progress tracking: count the TIFFs written so far and map the
        # ratio onto the second half of the progress range.
        tif_count = sum(1 for _ in out_dir.glob('*.tif'))
        if tif_count > seen_count:
            self.on_progressed.send(
                self, progress=0.5 + (float(tif_count)/num_pages)/2)
            seen_count = tif_count
        # Drop workers that have terminated
        workers = [w for w in workers if w.poll() is None]
        time.sleep(.01)
    shutil.rmtree(unicode(temp_dir))
def _perform_ocr(self, in_paths, out_dir, language):
    """ For each input image, launch tesseract and keep track of how far
        along the work is.

    :param in_paths:    Input images
    :type in_paths:     list of :py:class:`pathlib.Path`
    :param out_dir:     Output directory for hOCR files
    :type out_dir:      :py:class:`pathlib.Path`
    :param language:    Language to use for OCRing, must be among
                        tesseract languages installed on the system.
    :type language:     unicode
    """
    processes = []

    def _clean_processes():
        """ Go through processes, remove completed and emit a
            :py:attr:`on_progressed` signal for it.
        """
        for p in processes[:]:
            if p.poll() is not None:
                processes.remove(p)
                _clean_processes.num_cleaned += 1
                self.on_progressed.send(
                    self,
                    progress=float(_clean_processes.num_cleaned) /
                    len(in_paths))
    _clean_processes.num_cleaned = 0

    # Run as many simultaneous Tesseract instances as there are CPU cores
    max_procs = multiprocessing.cpu_count()
    devnull = open(os.devnull, 'w')
    try:
        for fpath in in_paths:
            # Wait until another process has finished
            while len(processes) >= max_procs:
                _clean_processes()
                time.sleep(0.01)
            cmd = [
                BIN, unicode(fpath), unicode(out_dir / fpath.stem), "-l",
                language, "hocr"
            ]
            logger.debug(cmd)
            proc = util.get_subprocess(cmd, stderr=devnull, stdout=devnull)
            processes.append(proc)
        # Wait for remaining processes to finish, sleeping between polls
        # so we don't busy-spin (the original loop had no sleep).
        while processes:
            _clean_processes()
            time.sleep(0.01)
    finally:
        # BUG FIX: close the devnull handle, which previously leaked.
        devnull.close()
def _perform_ocr(self, in_paths, out_dir, language):
    """ OCR every input image with tesseract and report how far along the
    work is via progress signals.

    :param in_paths:    Input images
    :type in_paths:     list of :py:class:`pathlib.Path`
    :param out_dir:     Output directory for hOCR files
    :type out_dir:      :py:class:`pathlib.Path`
    :param language:    Language to use for OCRing, must be among
                        tesseract languages installed on the system.
    :type language:     unicode
    """
    running = []

    def _reap_finished():
        """ Drop finished workers from `running` and emit one
        :py:attr:`on_progressed` signal per reaped worker.
        """
        for worker in running[:]:
            if worker.poll() is None:
                continue
            running.remove(worker)
            _reap_finished.done += 1
            self.on_progressed.send(
                self, progress=float(_reap_finished.done)/len(in_paths))
    _reap_finished.done = 0

    # One simultaneous Tesseract instance per CPU core
    pool_size = multiprocessing.cpu_count()
    devnull = open(os.devnull, 'w')
    for image_path in in_paths:
        # Block until a worker slot frees up
        while not len(running) < pool_size:
            _reap_finished()
            time.sleep(0.01)
        args = [BIN, unicode(image_path),
                unicode(out_dir / image_path.stem),
                "-l", language, "hocr"]
        logger.debug(args)
        running.append(util.get_subprocess(args, stderr=devnull,
                                           stdout=devnull))
    # Drain the remaining workers
    while running:
        _reap_finished()
def _generate_output(self, projectfile, out_dir, num_pages):
    """ Run last step for the project file and keep track of the progress
        by emitting :py:attr:`on_progressed` signals.

    :param projectfile:     Path ScanTailor configuration file
    :type projectfile:      :py:class:`pathlib.Path`
    :param out_dir:         Output directory for processed files
    :type out_dir:          :py:class:`pathlib.Path`
    :param num_pages:       Total number of pages to process
    :type num_pages:        int
    """
    logger.debug("Generating output...")
    temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
    try:
        split_config = self._split_configuration(projectfile, temp_dir)
        logger.debug("Launching those subprocesses!")
        processes = [util.get_subprocess([CLI_BIN, '--start-filter=6',
                                          unicode(cfgfile),
                                          unicode(out_dir)])
                     for cfgfile in split_config]
        last_count = 0
        while processes:
            # Track progress via the number of output files written so far
            recent_count = sum(1 for x in out_dir.glob('*.tif'))
            if recent_count > last_count:
                progress = 0.5 + (float(recent_count)/num_pages)/2
                self.on_progressed.send(self, progress=progress)
                last_count = recent_count
            for p in processes[:]:
                if p.poll() is not None:
                    processes.remove(p)
            time.sleep(.01)
    finally:
        # BUG FIX: remove the temporary directory even when one of the
        # calls above raises; it previously leaked on failure.
        shutil.rmtree(unicode(temp_dir))
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    :param pages:           Pages to be processed
    :type pages:            list of :py:class:`spreads.workflow.Page`
    :param target_path:     Base directory where rotated images are to be
                            stored
    :type target_path:      :py:class:`pathlib.Path`
    """
    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not util.find_in_path('scantailor'):
        raise util.MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")
    # Create temporary files/directories
    projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[unicode(fpath)] = page

    logger.info("Generating ScanTailor configuration")
    self._generate_configuration(sorted(in_paths.keys()),
                                 projectfile, out_dir)

    if not autopilot:
        logger.warn("If you are changing output settings (in the last "
                    "step), you *have* to run the last step from the GUI. "
                    "Due to a bug in ScanTailor, your settings would "
                    "otherwise be ignored.")
        time.sleep(5)
        logger.info("Opening ScanTailor GUI for manual adjustment")
        # BUG FIX: wait for the GUI to be closed before checking for
        # generated output; previously the check below raced against the
        # still-open GUI session.
        proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
        proc.wait()
    # Check if the user already generated output files from the GUI
    if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
        logger.info("Generating output images from ScanTailor "
                    "configuration.")
        self._generate_output(projectfile, out_dir, len(pages))

    # Associate generated output files with our pages
    for fname in out_dir.glob('*.tif'):
        out_stem = fname.stem
        for in_path, page in in_paths.iteritems():
            if Path(in_path).stem == out_stem:
                target_fname = target_path / fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
                break
        else:
            logger.warn(
                "Could not find page for output file {0}".format(fname))

    # Remove temporary files/directories
    shutil.rmtree(unicode(out_dir))
    # FIXME: This fails on Windows since there seems to be some non-gcable
    #        reference to the file around, but I currently cannot figure
    #        out where, so we just ignore the error...
    try:
        projectfile.unlink()
    except OSError as e:
        # BUG FIX: `WindowsError` was referenced here directly, which is a
        # NameError on non-Windows platforms; it is a subclass of OSError,
        # so catch that instead. Only the sharing-violation error (32) is
        # deliberately ignored; everything else propagates.
        if e.errno != 32:
            raise
def _generate_configuration(self, in_paths, projectfile, out_dir):
    """ Run images through ScanTailor pre-processing steps.

    :param in_paths:        Paths to images to be processed
    :type in_paths:         list of :py:class:`pathlib.Path`
    :param projectfile:     Path ScanTailor configuration file
    :type projectfile:      :py:class:`pathlib.Path`
    :param out_dir:         Output directory for processed files
    :type out_dir:          :py:class:`pathlib.Path`
    """
    # Filters are numbered from 1 to 6, with 6 being the 'create output
    # files' step.
    filterconf = [
        self.config[x].get(bool)
        for x in ('rotate', 'split_pages', 'deskew', 'content',
                  'auto_margins')
    ]
    # First and last enabled filter determine the processed range
    start_filter = filterconf.index(True) + 1
    end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
    marginconf = self.config['margins'].as_str_seq()

    # Build initial command-line
    generation_cmd = [
        CLI_BIN,
        '--start-filter={0}'.format(start_filter),
        '--end-filter={0}'.format(end_filter),
        '--layout=1.5',
        '-o={0}'.format(projectfile)
    ]

    # The 'enhanced' fork of ScanTailor has some additional features
    page_detection = self.config['detection'].get() == 'page'
    if self._enhanced and page_detection:
        generation_cmd.extend([
            '--enable-page-detection',
            '--disable-content-detection',
            '--enable-fine-tuning'
        ])
    else:
        generation_cmd.extend([
            '--margins-top={0}'.format(marginconf[0]),
            '--margins-right={0}'.format(marginconf[1]),
            '--margins-bottom={0}'.format(marginconf[2]),
            '--margins-left={0}'.format(marginconf[3]),
        ])
    if IS_WIN:
        # NOTE: Due to Window's commandline length limit of 8192 chars,
        #       we have to pipe in the list of files via stdin
        generation_cmd.append("-")
    else:
        generation_cmd.extend(in_paths)
    generation_cmd.append(unicode(out_dir))
    logger.debug(" ".join(generation_cmd))
    if IS_WIN:
        sp = util.get_subprocess(generation_cmd, stdin=subprocess.PIPE)
        sp.stdin.write(" ".join(in_paths))
        sp.stdin.close()
    else:
        sp = util.get_subprocess(generation_cmd)
    proc = psutil.Process(sp.pid)

    # Keep track of the progress by monitoring the files opened by the
    # ScanTailor process. Since it processes the files in order and we
    # know in advance how often a file will be opened (= number of steps)
    # we can reliably calculate how far along we are and emit
    # :py:attr:`on_progressed` events.
    num_images = len(in_paths)
    num_steps = (end_filter - start_filter) + 1
    last_fileidx = 0
    recent_fileidx = 0
    finished_steps = 0
    while proc.is_running():
        try:
            # Index (into `in_paths`) of the input file ScanTailor
            # currently has open, if any
            recent_fileidx = next(
                in_paths.index(x.path) for x in proc.open_files()
                if x.path in in_paths)
        except StopIteration:
            pass
        except psutil.AccessDenied:
            # This means the process is no longer running
            break
        if recent_fileidx == last_fileidx:
            time.sleep(.01)
            continue
        # A drop in the file index means ScanTailor wrapped around to the
        # first file, i.e. it started the next filter step
        if recent_fileidx < last_fileidx:
            finished_steps += 1
        last_fileidx = recent_fileidx
        # Configuration generation accounts for the first half of the
        # total progress range
        progress = 0.5 * ((finished_steps * num_images + last_fileidx) /
                          float(num_steps * num_images))
        self.on_progressed.send(self, progress=progress)
def output(self, pages, target_path, metadata, table_of_contents):
    """ Go through pages and bundle their most recent images into a PDF
        file.

    :param pages:       Pages to bundle
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory for the generated PDF
    :type target_path:  :py:class:`pathlib.Path`
    :param metadata:    Metadata to include in PDF file
    :type metadata:     :py:class:`spreads.metadata.Metadata`
    :param table_of_contents: Table of contents to include in PDF file
    :type table_of_contents: list of :py:class:`TocEntry`
    """
    logger.info("Assembling PDF.")
    tmpdir = Path(tempfile.mkdtemp())
    meta_file = tmpdir/'metadata.txt'
    with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
        for key, value in metadata.iteritems():
            if key == 'title':
                fp.write("Title: \"{0}\"\n".format(value))
            if key == 'creator':
                for author in value:
                    fp.write("Author: \"{0}\"\n".format(author))

    # Symlink (or copy, on Windows) images and their hOCR files into the
    # working directory for pdfbeads
    images = []
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        link_path = (tmpdir/fpath.name)
        if IS_WIN:
            shutil.copy(unicode(fpath), unicode(link_path))
        else:
            link_path.symlink_to(fpath.absolute())
        if 'tesseract' in page.processed_images:
            ocr_path = page.processed_images['tesseract']
            if IS_WIN:
                shutil.copy(unicode(ocr_path),
                            unicode(tmpdir/ocr_path.name))
            else:
                (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
        images.append(link_path.absolute())

    pdf_file = target_path.absolute()/"book.pdf"
    # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
    # TODO: Use page.page_label to create a LSPEC for pdfbeads

    # NOTE: pdfbeads only finds *html files for the text layer in the
    #       working directory, so we have to chdir() into it
    old_path = os.path.abspath(os.path.curdir)
    os.chdir(unicode(tmpdir))
    try:
        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            errors = ""
            is_jbig2 = False
            cur_jbig2_page = 0
            while proc.poll() is None:
                cur_line = proc.stderr.readline()
                errors += "\n" + cur_line
                prep_match = re.match(
                    r"^Prepared data for processing (.*)$", cur_line)
                proc_match = re.match(r"^Processed (.*)$", cur_line)
                jbig2_match = re.match(
                    r"^JBIG2 compression complete. pages:(\d+) "
                    r"symbols:\d+ log2:\d+$", cur_line)
                progress = None
                # BUG FIX: all three progress computations below used pure
                # integer operands, so Python 2's integer division always
                # truncated `progress` to 0. Divide as floats instead.
                if prep_match:
                    file_idx = next(idx for idx, f in enumerate(images)
                                    if unicode(f) == prep_match.group(1))
                    progress = file_idx / float(len(images)*2)
                elif jbig2_match:
                    cur_jbig2_page += int(jbig2_match.group(1))
                    progress = ((len(images) + cur_jbig2_page) /
                                float(len(images)*2))
                    is_jbig2 = True
                elif proc_match and not is_jbig2:
                    file_idx = next(idx for idx, f in enumerate(images)
                                    if unicode(f) == proc_match.group(1))
                    progress = (len(images) + file_idx) / float(len(images)*2)
                if progress is not None:
                    self.on_progressed.send(self, progress=progress)
                time.sleep(.01)
            output = proc.stdout.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # BUG FIX: restore the working directory and remove the temporary
        # directory even when pdfbeads fails.
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def _generate_configuration(self, in_paths, projectfile, out_dir):
    """ Run images through ScanTailor pre-processing steps.

    :param in_paths:        Paths to images to be processed
    :type in_paths:         list of :py:class:`pathlib.Path`
    :param projectfile:     Path ScanTailor configuration file
    :type projectfile:      :py:class:`pathlib.Path`
    :param out_dir:         Output directory for processed files
    :type out_dir:          :py:class:`pathlib.Path`
    """
    # Filters are numbered from 1 to 6, with 6 being the 'create output
    # files' step.
    filterconf = [self.config[x].get(bool)
                  for x in ('rotate', 'split_pages', 'deskew', 'content',
                            'auto_margins')]
    # First and last enabled filter determine the processed range
    start_filter = filterconf.index(True)+1
    end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
    marginconf = self.config['margins'].as_str_seq()

    # Build initial command-line
    generation_cmd = [CLI_BIN,
                      '--start-filter={0}'.format(start_filter),
                      '--end-filter={0}'.format(end_filter),
                      '--layout=1.5',
                      '-o={0}'.format(projectfile)]

    # The 'enhanced' fork of ScanTailor has some additional features
    page_detection = self.config['detection'].get() == 'page'
    if self._enhanced and page_detection:
        generation_cmd.extend([
            '--enable-page-detection',
            '--disable-content-detection',
            '--enable-fine-tuning'
        ])
    else:
        generation_cmd.extend([
            '--margins-top={0}'.format(marginconf[0]),
            '--margins-right={0}'.format(marginconf[1]),
            '--margins-bottom={0}'.format(marginconf[2]),
            '--margins-left={0}'.format(marginconf[3]),
        ])
    if IS_WIN:
        # NOTE: Due to Window's commandline length limit of 8192 chars,
        #       we have to pipe in the list of files via stdin
        generation_cmd.append("-")
    else:
        generation_cmd.extend(in_paths)
    generation_cmd.append(unicode(out_dir))
    logger.debug(" ".join(generation_cmd))
    if IS_WIN:
        sp = util.get_subprocess(generation_cmd, stdin=subprocess.PIPE)
        sp.stdin.write(" ".join(in_paths))
        sp.stdin.close()
    else:
        sp = util.get_subprocess(generation_cmd)
    proc = psutil.Process(sp.pid)

    # Keep track of the progress by monitoring the files opened by the
    # ScanTailor process. Since it processes the files in order and we
    # know in advance how often a file will be opened (= number of steps)
    # we can reliably calculate how far along we are and emit
    # :py:attr:`on_progressed` events.
    num_images = len(in_paths)
    num_steps = (end_filter - start_filter)+1
    last_fileidx = 0
    recent_fileidx = 0
    finished_steps = 0
    while proc.is_running():
        try:
            # Index (into `in_paths`) of the input file ScanTailor
            # currently has open, if any
            recent_fileidx = next(in_paths.index(x.path)
                                  for x in proc.open_files()
                                  if x.path in in_paths)
        except StopIteration:
            pass
        except psutil.AccessDenied:
            # This means the process is no longer running
            break
        if recent_fileidx == last_fileidx:
            time.sleep(.01)
            continue
        # A drop in the file index means ScanTailor wrapped around to the
        # first file, i.e. it started the next filter step
        if recent_fileidx < last_fileidx:
            finished_steps += 1
        last_fileidx = recent_fileidx
        # Configuration generation accounts for the first half of the
        # total progress range
        progress = 0.5*((finished_steps*num_images+last_fileidx) /
                        float(num_steps*num_images))
        self.on_progressed.send(self, progress=progress)
def __init__(self, config):
    """ Probe the ScanTailor CLI help text to see whether the 'enhanced'
    fork is installed.
    """
    super(ScanTailorPlugin, self).__init__(config)
    help_out = util.get_subprocess(
        [CLI_BIN], stdout=subprocess.PIPE).communicate()[0]
    lines = help_out.splitlines()
    # The enhanced fork mentions `<images|directory|->` on the eighth line
    # of its usage message. BUG FIX: indexing `[7]` unconditionally raised
    # an IndexError when the help output was shorter than eight lines.
    self._enhanced = (
        len(lines) > 7 and
        bool(re.match(r".*<images\|directory\|->.*", lines[7])))
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    :param pages:           Pages to be processed
    :type pages:            list of :py:class:`spreads.workflow.Page`
    :param target_path:     Base directory where rotated images are to be
                            stored
    :type target_path:      :py:class:`pathlib.Path`
    """
    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not util.find_in_path('scantailor'):
        raise util.MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")
    # Create temporary files/directories
    projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[unicode(fpath)] = page

    logger.info("Generating ScanTailor configuration")
    self._generate_configuration(sorted(in_paths.keys()),
                                 projectfile, out_dir)

    if not autopilot:
        logger.warn("If you are changing output settings (in the last "
                    "step), you *have* to run the last step from the GUI. "
                    "Due to a bug in ScanTailor, your settings would "
                    "otherwise be ignored.")
        time.sleep(5)
        logger.info("Opening ScanTailor GUI for manual adjustment")
        # Block until the user closes the GUI
        proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
        proc.wait()
    # Check if the user already generated output files from the GUI
    if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
        logger.info("Generating output images from ScanTailor "
                    "configuration.")
        self._generate_output(projectfile, out_dir, len(pages))

    # Associate generated output files with our pages
    for fname in out_dir.glob('*.tif'):
        out_stem = fname.stem
        for in_path, page in in_paths.iteritems():
            if Path(in_path).stem == out_stem:
                target_fname = target_path/fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
                break
        else:
            logger.warn("Could not find page for output file {0}"
                        .format(fname))

    # Remove temporary files/directories
    shutil.rmtree(unicode(out_dir))
    # FIXME: This fails on Windows since there seems to be some non-gcable
    #        reference to the file around, but I currently cannot figure
    #        out where, so we just ignore the error...
    try:
        projectfile.unlink()
    except OSError as e:
        # BUG FIX: the previous `except WindowsError` raised a NameError
        # on non-Windows platforms and silently swallowed every errno.
        # `WindowsError` subclasses OSError on Windows; only the
        # sharing-violation error (32) is deliberately ignored.
        if e.errno != 32:
            raise
def _generate_configuration(self, in_paths, projectfile, out_dir):
    """ Run images through ScanTailor pre-processing steps.

    :param in_paths:        Paths to images to be processed
    :type in_paths:         list of :py:class:`pathlib.Path`
    :param projectfile:     Path ScanTailor configuration file
    :type projectfile:      :py:class:`pathlib.Path`
    :param out_dir:         Output directory for processed files
    :type out_dir:          :py:class:`pathlib.Path`
    """
    # Filters are numbered from 1 to 6, with 6 being the 'create output
    # files' step; the first and last enabled filter determine the range.
    filterconf = [self.config[x].get(bool)
                  for x in ('rotate', 'split_pages', 'deskew', 'content',
                            'auto_margins')]
    start_filter = filterconf.index(True)+1
    end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
    marginconf = self.config['margins'].as_str_seq()

    # Build initial command-line
    generation_cmd = [CLI_BIN,
                      '--start-filter={0}'.format(start_filter),
                      '--end-filter={0}'.format(end_filter),
                      '--layout=1.5',
                      '-o={0}'.format(projectfile)]

    # The 'enhanced' fork of ScanTailor has some additional features
    page_detection = self.config['detection'].get() == 'page'
    if self._enhanced and page_detection:
        generation_cmd.extend([
            '--enable-page-detection',
            '--disable-content-detection',
            '--enable-fine-tuning'
        ])
    else:
        generation_cmd.extend([
            '--margins-top={0}'.format(marginconf[0]),
            '--margins-right={0}'.format(marginconf[1]),
            '--margins-bottom={0}'.format(marginconf[2]),
            '--margins-left={0}'.format(marginconf[3]),
        ])
    if IS_WIN:
        # NOTE: Due to Window's commandline length limit of 8192 chars,
        #       we have to pipe in the list of files via stdin
        generation_cmd.append("-")
    else:
        generation_cmd.extend(in_paths)
    generation_cmd.append(unicode(out_dir))
    logger.debug(" ".join(generation_cmd))
    if IS_WIN:
        sp = util.get_subprocess(generation_cmd, stdin=subprocess.PIPE)
        sp.stdin.write(" ".join(in_paths))
        sp.stdin.close()
    else:
        sp = util.get_subprocess(generation_cmd)
    proc = psutil.Process(sp.pid)

    # Keep track of the progress by monitoring the files opened by the
    # ScanTailor process. Since it processes the files in order and we
    # know in advance how often a file will be opened (= number of steps)
    # we can reliably calculate how far along we are and emit
    # :py:attr:`on_progressed` events.
    num_images = len(in_paths)
    num_steps = (end_filter - start_filter)+1
    last_fileidx = 0
    recent_fileidx = 0
    finished_steps = 0
    while proc.is_running():
        try:
            # Index (into `in_paths`) of the input file ScanTailor
            # currently has open, if any
            recent_fileidx = next(in_paths.index(x.path)
                                  for x in proc.open_files()
                                  if x.path in in_paths)
        except StopIteration:
            pass
        except psutil.AccessDenied:
            # This means the process is no longer running
            break
        if recent_fileidx == last_fileidx:
            time.sleep(.01)
            continue
        # A drop in the file index means ScanTailor wrapped around to the
        # first file, i.e. it started the next filter step
        if recent_fileidx < last_fileidx:
            finished_steps += 1
        last_fileidx = recent_fileidx
        # Configuration generation accounts for the first half of the
        # total progress range
        progress = 0.5*((finished_steps*num_images+last_fileidx) /
                        float(num_steps*num_images))
        self.on_progressed.send(self,
                                progress=progress)
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin
from spreads.vendor.pathlib import Path

# Path to the tesseract binary; bail out early when it is not installed.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. Please install the appropriate"
        " package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages, for older versions we have to read out the directory
# containing the training data for languages.
try:
    # `--list-langs` prints a header line followed by one language per
    # line; strip the header and the trailing empty element.
    AVAILABLE_LANGS = (util.get_subprocess(
        [BIN, "--list-langs"], stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE).communicate()[0].split("\n")[1:-1])
    # There should be at least a single language
    if not AVAILABLE_LANGS:
        raise ValueError()
except (subprocess.CalledProcessError, ValueError):
    # Fall back to listing the training-data files directly
    AVAILABLE_LANGS = [
        x.stem for x in Path('/usr/share/tesseract-ocr/tessdata').glob(
            '*.traineddata')
    ]

logger = logging.getLogger('spreadsplug.tesseract')


class TesseractPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'tesseract'
from spreads.plugin import HookPlugin, ProcessHooksMixin
from pathlib import Path

# Path to the tesseract binary; bail out early when it is not installed.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. Please install the appropriate"
        " package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages, for older versions we have to read out the directory
# containing the training data for languages.
try:
    # `--list-langs` prints a header line followed by one language per
    # line; strip the header and the trailing empty element.
    AVAILABLE_LANGS = (util.get_subprocess([BIN, "--list-langs"],
                                           stderr=subprocess.STDOUT,
                                           stdout=subprocess.PIPE)
                       .communicate()[0]
                       .split("\n")[1:-1])
    # There should be at least a single language
    if not AVAILABLE_LANGS:
        raise ValueError()
except (subprocess.CalledProcessError, ValueError):
    # Fall back to listing the training-data files directly
    AVAILABLE_LANGS = [x.stem
                       for x in Path('/usr/share/tesseract-ocr/tessdata')
                       .glob('*.traineddata')]

logger = logging.getLogger('spreadsplug.tesseract')


class TesseractPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'tesseract'
def output(self, pages, target_path, metadata, table_of_contents):
    """ Go through pages and bundle their most recent images into a PDF
        file via pdfbeads.

    :param pages:       Pages to bundle
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory for the generated PDF
    :type target_path:  :py:class:`pathlib.Path`
    :param metadata:    Metadata to include in PDF file
    :param table_of_contents: Table of contents to include in PDF file
    """
    logger.info("Assembling PDF.")
    tmpdir = Path(tempfile.mkdtemp())
    meta_file = tmpdir/'metadata.txt'
    with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
        for key, value in metadata.iteritems():
            if key == 'title':
                fp.write("Title: \"{0}\"\n".format(value))
            if key == 'creator':
                for author in value:
                    fp.write("Author: \"{0}\"\n".format(author))

    # Symlink (or copy, on Windows) images and their hOCR files into the
    # working directory for pdfbeads
    images = []
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        link_path = (tmpdir/fpath.name)
        if IS_WIN:
            shutil.copy(unicode(fpath), unicode(link_path))
        else:
            link_path.symlink_to(fpath.absolute())
        if 'tesseract' in page.processed_images:
            ocr_path = page.processed_images['tesseract']
            if IS_WIN:
                shutil.copy(unicode(ocr_path),
                            unicode(tmpdir/ocr_path.name))
            else:
                (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
        images.append(link_path.absolute())

    pdf_file = target_path.absolute()/"book.pdf"
    # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
    # TODO: Use page.page_label to create a LSPEC for pdfbeads

    # NOTE: pdfbeads only finds *html files for the text layer in the
    #       working directory, so we have to chdir() into it
    old_path = os.path.abspath(os.path.curdir)
    os.chdir(unicode(tmpdir))
    try:
        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            last_count = 0
            while proc.poll() is None:
                # Progress is inferred from the number of jbig2 files
                # pdfbeads has written so far
                current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                if current_count > last_count:
                    last_count = current_count
                    self.on_progressed.send(
                        self, progress=float(current_count)/len(images))
                time.sleep(.01)
            # NOTE(review): stdout/stderr are only drained after exit; a
            # very chatty pdfbeads could fill the pipe buffer and stall —
            # confirm against real-world output sizes.
            output = proc.stdout.read()
            errors = proc.stderr.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # BUG FIX: previously the working directory was not restored and
        # the temporary directory leaked when pdfbeads raised.
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Go through pages and bundle their most recent images into a PDF
        file.

    :param pages:       Pages to bundle
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory for the generated PDF
    :type target_path:  :py:class:`pathlib.Path`
    :param metadata:    Metadata to include in PDF file
    :type metadata:     :py:class:`spreads.metadata.Metadata`
    :param table_of_contents: Table of contents to include in PDF file
    :type table_of_contents: list of :py:class:`TocEntry`
    """
    logger.info("Assembling PDF.")
    tmpdir = Path(tempfile.mkdtemp())
    meta_file = tmpdir / 'metadata.txt'
    with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
        for key, value in metadata.iteritems():
            if key == 'title':
                fp.write("Title: \"{0}\"\n".format(value))
            if key == 'creator':
                for author in value:
                    fp.write("Author: \"{0}\"\n".format(author))

    # Symlink (or copy, on Windows) images and their hOCR files into the
    # working directory for pdfbeads
    images = []
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        link_path = (tmpdir / fpath.name)
        if IS_WIN:
            shutil.copy(unicode(fpath), unicode(link_path))
        else:
            link_path.symlink_to(fpath.absolute())
        if 'tesseract' in page.processed_images:
            ocr_path = page.processed_images['tesseract']
            if IS_WIN:
                shutil.copy(unicode(ocr_path),
                            unicode(tmpdir / ocr_path.name))
            else:
                (tmpdir / ocr_path.name).symlink_to(ocr_path.absolute())
        images.append(link_path.absolute())

    pdf_file = target_path.absolute() / "book.pdf"
    # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
    # TODO: Use page.page_label to create a LSPEC for pdfbeads

    # NOTE: pdfbeads only finds *html files for the text layer in the
    #       working directory, so we have to chdir() into it
    old_path = os.path.abspath(os.path.curdir)
    os.chdir(unicode(tmpdir))
    try:
        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            errors = ""
            is_jbig2 = False
            cur_jbig2_page = 0
            while proc.poll() is None:
                cur_line = proc.stderr.readline()
                errors += "\n" + cur_line
                prep_match = re.match(
                    r"^Prepared data for processing (.*)$", cur_line)
                proc_match = re.match(r"^Processed (.*)$", cur_line)
                jbig2_match = re.match(
                    r"^JBIG2 compression complete. pages:(\d+) "
                    r"symbols:\d+ log2:\d+$", cur_line)
                progress = None
                # BUG FIX: all three progress computations below used pure
                # integer operands, so Python 2's integer division always
                # truncated `progress` to 0. Divide as floats instead.
                if prep_match:
                    file_idx = next(idx for idx, f in enumerate(images)
                                    if unicode(f) == prep_match.group(1))
                    progress = file_idx / float(len(images) * 2)
                elif jbig2_match:
                    cur_jbig2_page += int(jbig2_match.group(1))
                    progress = ((len(images) + cur_jbig2_page) /
                                float(len(images) * 2))
                    is_jbig2 = True
                elif proc_match and not is_jbig2:
                    file_idx = next(idx for idx, f in enumerate(images)
                                    if unicode(f) == proc_match.group(1))
                    progress = ((len(images) + file_idx) /
                                float(len(images) * 2))
                if progress is not None:
                    self.on_progressed.send(self, progress=progress)
                time.sleep(.01)
            output = proc.stdout.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # Restore the working directory and clean up even on failure
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))