import xml.etree.cElementTree as ET import psutil from spreads.vendor.pathlib import Path import spreads.util as util from spreads.config import OptionTemplate from spreads.plugin import HookPlugin, ProcessHooksMixin IS_WIN = util.is_os('windows') CLI_BIN = util.find_in_path('scantailor-cli') GUI_BIN = util.find_in_path('scantailor') if not CLI_BIN: raise util.MissingDependencyException( "Could not find executable `scantailor-cli`. Please" " install the" " appropriate package(s)!") logger = logging.getLogger('spreadsplug.scantailor') class ScanTailorPlugin(HookPlugin, ProcessHooksMixin): __name__ = 'scantailor' @classmethod def configuration_template(cls): conf = { 'autopilot': OptionTemplate(value=True, docstring="Skip manual correction"), 'rotate': OptionTemplate(value=False, docstring="Rotate pages"),
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    A temporary ScanTailor project is generated for the input images. Unless
    the ``autopilot`` option is set, the ScanTailor GUI is opened on that
    project so the user can adjust it manually. Afterwards the output images
    are generated (if the GUI run did not already produce them), copied to
    ``target_path`` and registered on their pages under this plugin's name.

    :param pages:       Pages to be processed
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where the processed images are to be
                        stored
    :type target_path:  :py:class:`pathlib.Path`
    :raises:            :py:class:`spreads.util.MissingDependencyException`
                        if manual correction is requested, but the
                        `scantailor` executable cannot be found
    """
    # Imported locally, since only part of the module-level imports is
    # known to be available here.
    import os

    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not util.find_in_path('scantailor'):
        raise util.MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")

    # Create temporary files/directories.
    # `mkstemp` hands back an *open* file descriptor along with the path.
    # We only need the path, so the descriptor is closed right away;
    # leaving it open leaks it and prevents the file from being deleted on
    # Windows.
    project_fd, project_name = tempfile.mkstemp(suffix='.ScanTailor')
    os.close(project_fd)
    projectfile = Path(project_name)
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

    try:
        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            # Give the user a chance to actually read the warning
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            gui_proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
            # Block until the user has closed the GUI, otherwise the check
            # below would run before any output could have been generated.
            gui_proc.wait()

        # Check if the user already generated output files from the GUI
        num_generated = sum(1 for _ in out_dir.glob('*.tif'))
        if num_generated != len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages. Output files
        # share their stem with the input file they were created from, so
        # a lookup table by stem avoids rescanning all inputs per output.
        pages_by_stem = {}
        for in_path, page in in_paths.iteritems():
            pages_by_stem[Path(in_path).stem] = page

        for fname in out_dir.glob('*.tif'):
            page = pages_by_stem.get(fname.stem)
            if page is None:
                logger.warn(
                    "Could not find page for output file {0}".format(fname))
                continue
            target_fname = target_path / fname.name
            shutil.copyfile(unicode(fname), unicode(target_fname))
            page.processed_images[self.__name__] = target_fname
    finally:
        # Remove temporary files/directories, even if processing failed
        shutil.rmtree(unicode(out_dir))
        try:
            projectfile.unlink()
        except OSError as e:
            # `WindowsError` only exists on Windows, so its base class is
            # caught here and the platform is checked explicitly.
            # On Windows the deletion has been observed to fail because of
            # a lingering reference to the file (presumably the descriptor
            # that is now closed above); removal stays best-effort there.
            if not IS_WIN:
                raise
            logger.warn("Could not remove temporary project file "
                        "{0}: {1}".format(projectfile, e))
import shutil import subprocess import tempfile import time import xml.etree.cElementTree as ET from itertools import chain import spreads.util as util from spreads.config import OptionTemplate from spreads.plugin import HookPlugin, ProcessHooksMixin from spreads.vendor.pathlib import Path BIN = util.find_in_path('tesseract') if not BIN: raise util.MissingDependencyException( "Could not find executable `tesseract`. Please install the appropriate" " package(s)!") # Newer versions of Tesseract provide a flag to obtain a list of installed # OCR languages, for older versions we have to read out the directory # containing the training data for languages. try: AVAILABLE_LANGS = (util.get_subprocess( [BIN, "--list-langs"], stderr=subprocess.STDOUT, stdout=subprocess.PIPE).communicate()[0].split("\n")[1:-1]) # There should be at least a single language if not AVAILABLE_LANGS: raise ValueError() except (subprocess.CalledProcessError, ValueError): AVAILABLE_LANGS = [