Example #1
0
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        A ScanTailor project is generated for the input images, optionally
        edited by the user in the ScanTailor GUI (unless the ``autopilot``
        option is set) and rendered to ``*.tif`` files. The results are
        copied into ``target_path`` and registered on their pages.

        :param pages:       Pages to be processed
        :param target_path: Directory the generated images are copied to
                            (combined with file names via the ``/`` operator)
        :raises MissingDependencyException: if the GUI is required but the
                            `scantailor` executable cannot be found
        """
        autopilot = self.config['autopilot'].get(bool)
        gui_bin = None
        if not autopilot:
            # Look the executable up once and reuse it for the launch below
            gui_bin = find_in_path('scantailor')
            if not gui_bin:
                raise MissingDependencyException(
                    "Could not find executable `scantailor` in"
                    " $PATH. Please install the appropriate"
                    " package(s)!")

        # Create temporary files/directories
        # NOTE: tempfile.mkstemp() returns an *open* file descriptor that
        #       nobody would close. Go through NamedTemporaryFile instead and
        #       close the handle right away, so only the (empty) file itself
        #       is kept around.
        project_handle = tempfile.NamedTemporaryFile(suffix='.ScanTailor',
                                                     delete=False)
        project_handle.close()
        projectfile = Path(project_handle.name)
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        try:
            # Map input paths to their pages so we can more easily associate
            # the generated output files with their pages later on
            in_paths = {}
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    fpath = page.raw_image
                in_paths[unicode(fpath)] = page

            logger.info("Generating ScanTailor configuration")
            self._generate_configuration(sorted(in_paths.keys()),
                                         projectfile, out_dir)

            if not autopilot:
                logger.warn("If you are changing output settings (in the last "
                            "step, you *have* to run the last step from the "
                            "GUI. Due to a bug in ScanTailor, your settings "
                            "would otherwise be ignored.")
                time.sleep(5)
                logger.info("Opening ScanTailor GUI for manual adjustment")
                subprocess.call([gui_bin, unicode(projectfile)])
            # Check if the user already generated output files from the GUI
            if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
                logger.info("Generating output images from ScanTailor "
                            "configuration.")
                self._generate_output(projectfile, out_dir, len(pages))

            # Associate generated output files with our pages; output and
            # input files are matched by their file name without extension
            pages_by_stem = dict((Path(in_path).stem, page)
                                 for in_path, page in in_paths.iteritems())
            for fname in out_dir.glob('*.tif'):
                page = pages_by_stem.get(fname.stem)
                if page is None:
                    logger.warn("Could not find page for output file {0}"
                                .format(fname))
                    continue
                target_fname = target_path/fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
        finally:
            # Remove temporary files/directories, also when processing failed
            shutil.rmtree(unicode(out_dir))
            projectfile.unlink()
Example #2
0
    def process(self, path):
        """ Post-process the images of a project directory with ScanTailor.

        ``<path>/<name>.ScanTailor`` is used as project file, ``<path>/raw``
        as input directory and ``<path>/done`` as output directory. A project
        file that already exists is reused. The ScanTailor GUI is opened
        unless the ``autopilot`` option is set.

        :param path: Project directory (must support ``/`` and ``.name``)
        :raises MissingDependencyException: if the GUI is required but
                     `scantailor` cannot be found
        """
        skip_gui = self.config['autopilot'].get(bool)
        if not skip_gui:
            if not find_in_path('scantailor'):
                raise MissingDependencyException(
                    "Could not find executable `scantailor` in $PATH. "
                    "Please install the appropriate package(s)!")

        project_name = "{0}.ScanTailor".format(path.name)
        st_project = path / project_name
        raw_dir = path / 'raw'
        done_dir = path / 'done'

        # Only generate a fresh configuration when none is present yet
        if not st_project.exists():
            self._generate_configuration(st_project, raw_dir, done_dir)

        if not skip_gui:
            logger.info("Opening ScanTailor GUI for manual adjustment")
            subprocess.call(['scantailor', unicode(st_project)])

        logger.info("Generating output images from ScanTailor configuration.")
        self._generate_output(st_project, done_dir)
Example #3
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the most recent image of every page into ``book.pdf``.

        The images (and, where present, the ``tesseract`` results) are
        symlinked into a temporary directory, in which `pdfbeads` is then
        executed. Progress is reported through ``self.on_progressed`` based
        on the number of ``*.jbig2`` files found in that directory.

        :param pages:             Pages to bundle
        :param target_path:       Directory ``book.pdf`` is written to
        :param metadata:          Currently unused (see TODO below)
        :param table_of_contents: Currently unused (see TODO below)
        """
        logger.info("Assembling PDF.")

        # NOTE(review): this directory is never removed again.
        tmpdir = Path(tempfile.mkdtemp())
        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))
        try:
            images = []
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    fpath = page.raw_image
                link_path = (tmpdir / fpath.name)
                link_path.symlink_to(fpath)
                if 'tesseract' in page.processed_images:
                    ocr_path = page.processed_images['tesseract']
                    (tmpdir / ocr_path.name).symlink_to(ocr_path)
                images.append(link_path)

            # TODO: Use metadata to create a METAFILE for pdfbeads
            # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
            # TODO: Use page.page_label to create a LSPEC for pdfbeads

            pdf_file = target_path / "book.pdf"
            cmd = [find_in_path("pdfbeads"), "-d"]
            cmd.extend([f.name for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            # Collect the output in a temporary file instead of a pipe: a pipe
            # that is not read while we poll fills up eventually, which would
            # block pdfbeads (and therefore this loop) forever.
            with tempfile.TemporaryFile() as output_log:
                proc = subprocess.Popen(cmd,
                                        stdout=output_log,
                                        stderr=subprocess.STDOUT)
                last_count = 0
                while proc.poll() is None:
                    current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                    if current_count > last_count:
                        last_count = current_count
                        self.on_progressed.send(self,
                                                progress=float(current_count) /
                                                len(images))
                    time.sleep(.01)
                output_log.seek(0)
                logger.debug("Output:\n{0}".format(output_log.read()))
        finally:
            # Always return to the previous working directory, even on errors
            os.chdir(old_path)
Example #4
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the most recent image of every page into ``book.pdf``.

        The images (and, where present, the ``tesseract`` results) are
        symlinked into a temporary directory, in which `pdfbeads` is then
        executed. Progress is reported through ``self.on_progressed`` based
        on the number of ``*.jbig2`` files found in that directory.

        :param pages:             Pages to bundle
        :param target_path:       Directory ``book.pdf`` is written to
        :param metadata:          Currently unused (see TODO below)
        :param table_of_contents: Currently unused (see TODO below)
        """
        logger.info("Assembling PDF.")

        # NOTE(review): this directory is never removed again.
        tmpdir = Path(tempfile.mkdtemp())
        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))
        try:
            images = []
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    fpath = page.raw_image
                link_path = (tmpdir/fpath.name)
                link_path.symlink_to(fpath)
                if 'tesseract' in page.processed_images:
                    ocr_path = page.processed_images['tesseract']
                    (tmpdir/ocr_path.name).symlink_to(ocr_path)
                images.append(link_path)

            # TODO: Use metadata to create a METAFILE for pdfbeads
            # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
            # TODO: Use page.page_label to create a LSPEC for pdfbeads

            pdf_file = target_path/"book.pdf"
            cmd = [find_in_path("pdfbeads"), "-d"]
            cmd.extend([f.name for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            # Collect the output in a temporary file instead of a pipe: a pipe
            # that is not read while we poll fills up eventually, which would
            # block pdfbeads (and therefore this loop) forever.
            with tempfile.TemporaryFile() as output_log:
                proc = subprocess.Popen(cmd, stdout=output_log,
                                        stderr=subprocess.STDOUT)
                last_count = 0
                while proc.poll() is None:
                    current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                    if current_count > last_count:
                        last_count = current_count
                        self.on_progressed.send(
                            self, progress=float(current_count)/len(images))
                    time.sleep(.01)
                output_log.seek(0)
                logger.debug("Output:\n{0}".format(output_log.read()))
        finally:
            # Always return to the previous working directory, even on errors
            os.chdir(old_path)
Example #5
0
    def process(self, path):
        """ Post-process the images of a project directory with ScanTailor.

        ``<path>/<basename>.ScanTailor`` is used as project file,
        ``<path>/raw`` as input directory and ``<path>/done`` as output
        directory. A project file that already exists is reused. The GUI is
        opened unless either the plugin-specific or the global ``autopilot``
        option is set.

        :param path: Project directory, as a string path
        :raises MissingDependencyException: if the GUI is required but
                     `scantailor` cannot be found
        """
        plugin_autopilot = self.config['scantailor']['autopilot'].get(bool)
        skip_gui = plugin_autopilot or self.config['autopilot'].get(bool)
        if not skip_gui:
            if not find_in_path('scantailor'):
                raise MissingDependencyException(
                    "Could not find executable `scantailor` in $PATH. "
                    "Please install the appropriate package(s)!")

        project_name = "{0}.ScanTailor".format(os.path.basename(path))
        st_project = os.path.join(path, project_name)
        raw_dir = os.path.join(path, 'raw')
        done_dir = os.path.join(path, 'done')

        # Only generate a fresh configuration when none is present yet
        if not os.path.exists(st_project):
            self._generate_configuration(st_project, raw_dir, done_dir)

        if not skip_gui:
            logger.info("Opening ScanTailor GUI for manual adjustment")
            subprocess.call(['scantailor', st_project])

        logger.info("Generating output images from ScanTailor configuration.")
        self._generate_output(st_project, done_dir)
Example #6
0
    def _generate_output(self, projectfile, out_dir, num_pages):
        """ Render the output images with parallel `scantailor-cli` runs.

        The project is split into several configurations by
        ``self._split_configuration``; one `scantailor-cli` process is started
        per configuration (``--start-filter=6``). While they are running, the
        number of ``*.tif`` files in ``out_dir`` is reported through
        ``self.on_progressed``, mapped onto the range 0.5 to 1.0.

        :param projectfile: ScanTailor project file to render
        :param out_dir:     Directory the ``*.tif`` files are written to
                            (must support ``.glob``)
        :param num_pages:   Number of pages expected, used for the progress
        """
        logger.debug("Generating output...")
        temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
        try:
            split_config = self._split_configuration(projectfile, temp_dir)
            logger.debug("Launching those subprocesses!")
            # The executable is the same for all subprocesses, so it only has
            # to be looked up once
            cli_bin = find_in_path('scantailor-cli')
            processes = [subprocess.Popen([cli_bin, '--start-filter=6',
                                           unicode(cfgfile),
                                           unicode(out_dir)])
                         for cfgfile in split_config]

            last_count = 0
            while processes:
                recent_count = sum(1 for x in out_dir.glob('*.tif'))
                if recent_count > last_count:
                    progress = 0.5 + (float(recent_count)/num_pages)/2
                    self.on_progressed.send(self, progress=progress)
                    last_count = recent_count
                # Only keep the subprocesses that have not terminated yet
                processes = [p for p in processes if p.poll() is None]
                time.sleep(.01)
        finally:
            # Do not leave the split configurations behind, even on errors
            shutil.rmtree(unicode(temp_dir))
Example #7
0
    def process(self, path):
        """ Post-process the images of a project directory with ScanTailor.

        ``<path>/<name>.ScanTailor`` is used as project file, ``<path>/raw``
        as input directory and ``<path>/done`` as output directory. A project
        file that already exists is reused. The ScanTailor GUI is opened
        unless the ``autopilot`` option is set. The number of JPEG files in
        the input directory is passed on as page count.

        :param path: Project directory (must support ``/`` and ``.name``)
        :raises MissingDependencyException: if the GUI is required but
                     `scantailor` cannot be found
        """
        skip_gui = self.config['autopilot'].get(bool)
        if not skip_gui:
            if not find_in_path('scantailor'):
                raise MissingDependencyException(
                    "Could not find executable `scantailor` in $PATH. "
                    "Please install the appropriate package(s)!")

        project_name = "{0}.ScanTailor".format(path.name)
        st_project = path / project_name
        raw_dir = path / 'raw'
        done_dir = path / 'done'

        # Only generate a fresh configuration when none is present yet
        if not st_project.exists():
            self._generate_configuration(st_project, raw_dir, done_dir)

        if not skip_gui:
            logger.info("Opening ScanTailor GUI for manual adjustment")
            subprocess.call(['scantailor', unicode(st_project)])

        logger.info("Generating output images from ScanTailor configuration.")

        # Every JPEG in the input directory counts as one page
        jpeg_suffixes = ('.jpeg', '.jpg')
        raw_images = [img for img in raw_dir.iterdir()
                      if img.suffix.lower() in jpeg_suffixes]
        self._generate_output(st_project, done_dir, len(raw_images))
Example #8
0
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        A ScanTailor project is generated for the input images, optionally
        edited by the user in the ScanTailor GUI (unless the ``autopilot``
        option is set) and rendered to ``*.tif`` files. The results are
        copied into ``target_path`` and registered on their pages.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where processed images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        :raises spreads.util.MissingDependencyException:
                            if the GUI is required but the `scantailor`
                            executable cannot be found
        """
        autopilot = self.config['autopilot'].get(bool)
        if not autopilot and not util.find_in_path('scantailor'):
            raise util.MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories
        # NOTE: tempfile.mkstemp() returns an *open* file descriptor that
        #       nobody would close; on Windows such a dangling handle keeps
        #       the file from being deleted later on. Go through
        #       NamedTemporaryFile instead and close the handle right away.
        project_handle = tempfile.NamedTemporaryFile(suffix='.ScanTailor',
                                                     delete=False)
        project_handle.close()
        projectfile = Path(project_handle.name)
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        try:
            # Map input paths to their pages so we can more easily associate
            # the generated output files with their pages later on
            in_paths = {}
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    fpath = page.raw_image
                in_paths[unicode(fpath)] = page

            logger.info("Generating ScanTailor configuration")
            self._generate_configuration(sorted(in_paths.keys()), projectfile,
                                         out_dir)

            if not autopilot:
                logger.warn("If you are changing output settings (in the last "
                            "step, you *have* to run the last step from the "
                            "GUI. Due to a bug in ScanTailor, your settings "
                            "would otherwise be ignored.")
                time.sleep(5)
                logger.info("Opening ScanTailor GUI for manual adjustment")
                # Block until the user has closed the GUI; otherwise the
                # output would be generated while the project is still edited
                gui_proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
                gui_proc.wait()
            # Check if the user already generated output files from the GUI
            if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
                logger.info("Generating output images from ScanTailor "
                            "configuration.")
                self._generate_output(projectfile, out_dir, len(pages))

            # Associate generated output files with our pages; output and
            # input files are matched by their file name without extension
            pages_by_stem = dict((Path(in_path).stem, page)
                                 for in_path, page in in_paths.iteritems())
            for fname in out_dir.glob('*.tif'):
                page = pages_by_stem.get(fname.stem)
                if page is None:
                    logger.warn(
                        "Could not find page for output file {0}"
                        .format(fname))
                    continue
                target_fname = target_path / fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
        finally:
            # Remove temporary files/directories, also when processing failed
            shutil.rmtree(unicode(out_dir))
            try:
                projectfile.unlink()
            except OSError as e:
                # Deleting can still fail on Windows with a sharing violation
                # (winerror 32) when another process keeps the file open.
                # That is harmless for a temporary file, anything else is
                # re-raised. OSError is used since WindowsError only exists
                # on Windows.
                if getattr(e, 'winerror', None) != 32:
                    raise
Example #9
0
import codecs
import logging
import os
import re
import shutil
import subprocess
import tempfile
import time

from spreads.vendor.pathlib import Path

import spreads.util as util
from spreads.plugin import HookPlugin, OutputHooksMixin

# Result of looking up the `pdfbeads` executable, and whether we are running
# on Windows; both are determined once at import time.
BIN = util.find_in_path('pdfbeads')
IS_WIN = util.is_os('windows')

# A falsy lookup result aborts the import of this plugin module.
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `pdfbeads`. "
        "Please install the appropriate package(s)!")


logger = logging.getLogger('spreadsplug.pdfbeads')


class PDFBeadsPlugin(HookPlugin, OutputHooksMixin):
    __name__ = 'pdfbeads'

    def output(self, pages, target_path, metadata, table_of_contents):
Example #10
0
import re
import shutil
import subprocess
import tempfile
import time
import xml.etree.cElementTree as ET

import psutil
from spreads.vendor.pathlib import Path

import spreads.util as util
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin

# Platform flag and lookup results for both ScanTailor executables, all of
# them determined once at import time.
IS_WIN = util.is_os('windows')
CLI_BIN = util.find_in_path('scantailor-cli')
GUI_BIN = util.find_in_path('scantailor')

# Only the command-line binary is checked here; a falsy lookup result aborts
# the import of this plugin module.
if not CLI_BIN:
    raise util.MissingDependencyException(
        "Could not find executable `scantailor-cli`. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.scantailor')


class ScanTailorPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'scantailor'

    @classmethod
Example #11
0
import re
import shutil
import subprocess
import tempfile
import time
import xml.etree.cElementTree as ET

import psutil
from pathlib import Path

import spreads.util as util
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin

# Platform flag and lookup results for both ScanTailor executables, all of
# them determined once at import time.
IS_WIN = util.is_os('windows')
CLI_BIN = util.find_in_path('scantailor-cli')
GUI_BIN = util.find_in_path('scantailor')

# Only the command-line binary is checked here; a falsy lookup result aborts
# the import of this plugin module.
if not CLI_BIN:
    raise util.MissingDependencyException(
        "Could not find executable `scantailor-cli`. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.scantailor')


class ScanTailorPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'scantailor'

    @classmethod
    def configuration_template(cls):
Example #12
0
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        A ScanTailor project is generated for the input images, optionally
        edited by the user in the ScanTailor GUI (unless the ``autopilot``
        option is set) and rendered to ``*.tif`` files. The results are
        copied into ``target_path`` and registered on their pages.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where processed images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        :raises spreads.util.MissingDependencyException:
                            if the GUI is required but the `scantailor`
                            executable cannot be found
        """
        autopilot = self.config['autopilot'].get(bool)
        if not autopilot and not util.find_in_path('scantailor'):
            raise util.MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories
        # NOTE: tempfile.mkstemp() returns an *open* file descriptor that
        #       nobody would close; on Windows such a dangling handle keeps
        #       the file from being deleted later on. Go through
        #       NamedTemporaryFile instead and close the handle right away.
        project_handle = tempfile.NamedTemporaryFile(suffix='.ScanTailor',
                                                     delete=False)
        project_handle.close()
        projectfile = Path(project_handle.name)
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        try:
            # Map input paths to their pages so we can more easily associate
            # the generated output files with their pages later on
            in_paths = {}
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    fpath = page.raw_image
                in_paths[unicode(fpath)] = page

            logger.info("Generating ScanTailor configuration")
            self._generate_configuration(sorted(in_paths.keys()),
                                         projectfile, out_dir)

            if not autopilot:
                logger.warn("If you are changing output settings (in the last "
                            "step, you *have* to run the last step from the "
                            "GUI. Due to a bug in ScanTailor, your settings "
                            "would otherwise be ignored.")
                time.sleep(5)
                logger.info("Opening ScanTailor GUI for manual adjustment")
                proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
                # Block until the user has closed the GUI
                proc.wait()
            # Check if the user already generated output files from the GUI
            if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
                logger.info("Generating output images from ScanTailor "
                            "configuration.")
                self._generate_output(projectfile, out_dir, len(pages))

            # Associate generated output files with our pages; output and
            # input files are matched by their file name without extension
            pages_by_stem = dict((Path(in_path).stem, page)
                                 for in_path, page in in_paths.iteritems())
            for fname in out_dir.glob('*.tif'):
                page = pages_by_stem.get(fname.stem)
                if page is None:
                    logger.warn("Could not find page for output file {0}"
                                .format(fname))
                    continue
                target_fname = target_path/fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
        finally:
            # Remove temporary files/directories, also when processing failed
            shutil.rmtree(unicode(out_dir))
            try:
                projectfile.unlink()
            except OSError as e:
                # Deleting can still fail on Windows with a sharing violation
                # (winerror 32) when another process keeps the file open.
                # That is harmless for a temporary file, anything else is
                # re-raised. OSError is used since WindowsError only exists
                # on Windows.
                if getattr(e, 'winerror', None) != 32:
                    raise
Example #13
0
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import division, unicode_literals

import logging
import os
import subprocess

from spreads.plugin import HookPlugin, OutputHookMixin
from spreads.util import MissingDependencyException, find_in_path

# A falsy lookup result for `djvubind` aborts the import of this plugin
# module.
if not find_in_path('djvubind'):
    raise MissingDependencyException(
        "Could not find executable `djvubind` in $PATH. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.djvubind')


class DjvuBindPlugin(HookPlugin, OutputHookMixin):
    __name__ = 'djvubind'

    def output(self, path):
        logger.info("Assembling DJVU.")
        img_dir = path / 'done'
        djvu_file = path / 'out' / "{0}.djvu".format(path.name)
        cmd = ["djvubind", unicode(img_dir)]
Example #14
0
 def __init__(self, config):
     """ Initialize the plugin and probe the installed `scantailor-cli`.

     ``self._enhanced`` is set to whether the eighth line printed by
     `scantailor-cli` (run without arguments) contains
     ``<images|directory|->``.

     :param config: Configuration, handed on to the parent constructor
     """
     super(ScanTailorPlugin, self).__init__(config)
     cli_output = subprocess.check_output(find_in_path('scantailor-cli'))
     # NOTE(review): presumably this is the usage line of the tool -- confirm
     probed_line = cli_output.splitlines()[7]
     marker_match = re.match(r".*<images\|directory\|->.*", probed_line)
     self._enhanced = bool(marker_match)
Example #15
0
import logging
import os
import re
import subprocess
import xml.etree.cElementTree as ET

from concurrent import futures

from spreads.plugin import HookPlugin
from spreads.util import find_in_path, MissingDependencyException

# A falsy lookup result for `tesseract` aborts the import of this plugin
# module.
if not find_in_path('tesseract'):
    raise MissingDependencyException(
        "Could not find executable `tesseract` in $PATH. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.tesseract')


class TesseractPlugin(HookPlugin):
    @classmethod
    def add_arguments(cls, command, parser):
        if command == 'postprocess':
            parser.add_argument("--language",
                                "-l",
                                dest="language",
                                default="eng",
                                help="OCR language (3-letter language code)"
                                " [default: eng]")

    def process(self, path):
Example #16
0
from __future__ import division, unicode_literals

import logging
import os
import shutil
import subprocess
import tempfile
import time

from spreads.vendor.pathlib import Path

from spreads.plugin import HookPlugin, OutputHookMixin
from spreads.util import MissingDependencyException, find_in_path

# A falsy lookup result for `pdfbeads` aborts the import of this plugin
# module.
if not find_in_path('pdfbeads'):
    # NOTE: The message fragments used to be joined without a space between
    #       the two sentences ("`pdfbeads`.Please install ...").
    raise MissingDependencyException("Could not find executable `pdfbeads`. "
                                     "Please install the appropriate "
                                     "package(s)!")

logger = logging.getLogger('spreadsplug.pdfbeads')


class PDFBeadsPlugin(HookPlugin, OutputHookMixin):
    __name__ = 'pdfbeads'

    def output(self, pages, target_path, metadata, table_of_contents):
        logger.info("Assembling PDF.")

        tmpdir = Path(tempfile.mkdtemp())
        # NOTE: pdfbeads only finds *html files for the text layer in the
Example #17
0
from __future__ import division, unicode_literals

import logging
import math
import multiprocessing
import os
import re
import shutil
import subprocess
import tempfile
from xml.etree.cElementTree import ElementTree as ET

from spreads.plugin import HookPlugin
from spreads.util import find_in_path, MissingDependencyException

# A falsy lookup result for `scantailor-cli` aborts the import of this plugin
# module.
if not find_in_path('scantailor-cli'):
    raise MissingDependencyException(
        "Could not find executable `scantailor-cli` in $PATH. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.scantailor')


class ScanTailorPlugin(HookPlugin):
    _enhanced = bool(re.match(r".*<images\|directory\|->.*",
                              subprocess.check_output('scantailor-cli')
                              .splitlines()[7]))

    @classmethod
    def add_arguments(cls, command, parser):
        if command == "postprocess":
Example #18
0
from __future__ import division, unicode_literals

import logging
import os
import shutil
import subprocess
import tempfile
import time

from spreads.vendor.pathlib import Path

from spreads.plugin import HookPlugin, OutputHookMixin
from spreads.util import MissingDependencyException, find_in_path

# A falsy lookup result for `pdfbeads` aborts the import of this plugin
# module.
if not find_in_path('pdfbeads'):
    # NOTE: The message fragments used to be joined without a space between
    #       the two sentences ("`pdfbeads`.Please install ...").
    raise MissingDependencyException("Could not find executable `pdfbeads`. "
                                     "Please install the appropriate "
                                     "package(s)!")

logger = logging.getLogger('spreadsplug.pdfbeads')


class PDFBeadsPlugin(HookPlugin, OutputHookMixin):
    __name__ = 'pdfbeads'

    def output(self, pages, target_path, metadata, table_of_contents):
        logger.info("Assembling PDF.")

        tmpdir = Path(tempfile.mkdtemp())
        # NOTE: pdfbeads only finds *html files for the text layer in the
Example #19
0
import multiprocessing
import os
import re
import shutil
import subprocess
import tempfile
import time
import xml.etree.cElementTree as ET
from itertools import chain

import spreads.util as util
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin
from spreads.vendor.pathlib import Path

# Result of looking up the `tesseract` executable, determined once at import
# time; a falsy result aborts the import of this plugin module.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. "
        "Please install the appropriate package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages, for older versions we have to read out the directory
# containing the training data for languages.
try:
    AVAILABLE_LANGS = (util.get_subprocess(
        [BIN, "--list-langs"],
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE).communicate()[0].split("\n")[1:-1])
    # There should be at least a single language
    if not AVAILABLE_LANGS:
Example #20
0
import multiprocessing
import os
import re
import shutil
import subprocess
import tempfile
import time
import xml.etree.cElementTree as ET
from itertools import chain

import spreads.util as util
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin
from pathlib import Path

# Result of looking up the `tesseract` executable, determined once at import
# time; a falsy result aborts the import of this plugin module.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. "
        "Please install the appropriate package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages, for older versions we have to read out the directory
# containing the training data for languages.
try:
    AVAILABLE_LANGS = (util.get_subprocess([BIN, "--list-langs"],
                                           stderr=subprocess.STDOUT,
                                           stdout=subprocess.PIPE)
                       .communicate()[0]
                       .split("\n")[1:-1])
    # There should be at least a single language
Example #21
0
    def _generate_configuration(self, in_paths, projectfile, out_dir):
        """ Let `scantailor-cli` generate a project file for the images.

        The range of filters to run is derived from the boolean plugin
        options, the command line is assembled and executed, and progress is
        reported through ``self.on_progressed`` while the process is running.

        :param in_paths:    Paths of the input images; must support
                            ``index()``, ``len()`` and membership tests
        :param projectfile: Path the project file is written to (``-o``)
        :param out_dir:     Output directory passed to `scantailor-cli`
        :raises ValueError: from ``list.index`` if none of the filter options
                            is enabled
        :raises SpreadsException: on Windows, if no wildcard for the input
                            paths could be determined
        """
        filterconf = [self.config[x].get(bool)
                      for x in ('rotate', 'split_pages', 'deskew', 'content',
                                'auto_margins')]
        # 1-based positions of the first and the last enabled option
        start_filter = filterconf.index(True)+1
        end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
        marginconf = self.config['margins'].as_str_seq()
        generation_cmd = [find_in_path('scantailor-cli'),
                          '--start-filter={0}'.format(start_filter),
                          '--end-filter={0}'.format(end_filter),
                          '--layout=1.5',
                          '-o={0}'.format(projectfile)]
        page_detection = self.config['detection'].get() == 'page'
        # The page detection flags are only passed if self._enhanced is set;
        # otherwise the configured margins are used
        if self._enhanced and page_detection:
            generation_cmd.extend([
                '--enable-page-detection',
                '--disable-content-detection',
                '--enable-fine-tuning'
            ])
        else:
            # marginconf is read in the order top, right, bottom, left
            generation_cmd.extend([
                '--margins-top={0}'.format(marginconf[0]),
                '--margins-right={0}'.format(marginconf[1]),
                '--margins-bottom={0}'.format(marginconf[2]),
                '--margins-left={0}'.format(marginconf[3]),
            ])
        # NOTE: We cannot pass individual filenames on windows, since we have
        # a limit of 32,768 characters for commands. Thus, we first try to
        # find a wildcard for our paths that matches only them, and if that
        # fails, throw an Exception and tell the user to use a proper OS...
        wildcard = wildcardify(in_paths)
        if not wildcard and IS_WIN:
            raise SpreadsException("Please use a proper operating system.")
        elif not wildcard:
            generation_cmd.extend(in_paths)
        else:
            generation_cmd.append(wildcard)

        generation_cmd.append(unicode(out_dir))
        logger.debug(" ".join(generation_cmd))
        # Wrap the child in a psutil.Process so its open files can be
        # inspected below
        proc = psutil.Process(subprocess.Popen(generation_cmd).pid)

        num_images = len(in_paths)
        num_steps = (end_filter - start_filter)+1
        last_fileidx = 0
        recent_fileidx = 0
        finished_steps = 0
        while proc.is_running():
            try:
                # Index of the first input image the process currently has
                # open
                recent_fileidx = next(in_paths.index(x.path)
                                      for x in proc.open_files()
                                      if x.path in in_paths)
            except StopIteration:
                # None of the input images is open right now; the previous
                # index is kept
                pass
            except psutil.AccessDenied:
                # This means the process is no longer running
                break
            if recent_fileidx == last_fileidx:
                time.sleep(.01)
                continue
            # A lower index than before is counted as one finished step
            # NOTE(review): presumably every filter walks the images in the
            #               order of in_paths -- confirm
            if recent_fileidx < last_fileidx:
                finished_steps += 1
            last_fileidx = recent_fileidx
            # The fraction of processed (step, image) pairs is multiplied by
            # 0.5, so this method reports at most half of the progress
            progress = 0.5*((finished_steps*num_images+last_fileidx) /
                            float(num_steps*num_images))
            self.on_progressed.send(self, progress=progress)
Example #22
0
import logging
import re
import subprocess
import xml.etree.cElementTree as ET

from concurrent import futures

from spreads.plugin import HookPlugin, PluginOption
from spreads.util import find_in_path, MissingDependencyException

# A falsy lookup result for `tesseract` aborts the import of this plugin
# module.
if not find_in_path('tesseract'):
    raise MissingDependencyException(
        "Could not find executable `tesseract` in $PATH. "
        "Please install the appropriate package(s)!")

# Output of `tesseract --list-langs` (stderr included), split into lines,
# with the first and the last element dropped.
AVAILABLE_LANGS = subprocess.check_output(
    ["tesseract", "--list-langs"],
    stderr=subprocess.STDOUT).split("\n")[1:-1]

logger = logging.getLogger('spreadsplug.tesseract')


class TesseractPlugin(HookPlugin):
    __name__ = 'tesseract'

    @classmethod
    def add_arguments(cls, command, parser):
        if command == 'postprocess':
            parser.add_argument("--language", "-l",
                                dest="language", default="eng",
                                help="OCR language (3-letter language code)"
Example #23
0
import codecs
import logging
import os
import re
import shutil
import subprocess
import tempfile
import time

from pathlib import Path

import spreads.util as util
from spreads.plugin import HookPlugin, OutputHooksMixin

# Result of looking up the `pdfbeads` executable, and whether we are running
# on Windows; both are determined once at import time.
BIN = util.find_in_path('pdfbeads')
IS_WIN = util.is_os('windows')

# A falsy lookup result aborts the import of this plugin module.
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `pdfbeads`. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.pdfbeads')


class PDFBeadsPlugin(HookPlugin, OutputHooksMixin):
    __name__ = 'pdfbeads'

    def output(self, pages, target_path, metadata, table_of_contents):
        """ Go through pages and bundle their most recent images into a PDF
Example #24
0
import logging
import math
import multiprocessing
import re
import shutil
import subprocess
import tempfile
from xml.etree.cElementTree import ElementTree as ET

from spreads.vendor.pathlib import Path

from spreads.plugin import HookPlugin, ProcessHookMixin, PluginOption
from spreads.util import find_in_path, MissingDependencyException

# A falsy lookup result for `scantailor-cli` aborts the import of this plugin
# module.
if not find_in_path('scantailor-cli'):
    raise MissingDependencyException(
        "Could not find executable `scantailor-cli` in $PATH. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.scantailor')


class ScanTailorPlugin(HookPlugin, ProcessHookMixin):
    __name__ = 'scantailor'

    @classmethod
    def configuration_template(cls):
        conf = {'autopilot': PluginOption(value=False,
                                          docstring="Skip manual correction"),
                'rotate': PluginOption(value=False, docstring="Rotate pages"),
Example #25
0
import logging
import os
import re
import subprocess
import tempfile
import time
from fractions import Fraction
from itertools import chain

from spreads.vendor.pathlib import Path

from spreads.plugin import DevicePlugin, PluginOption, DeviceFeatures
from spreads.util import (DeviceException, find_in_path,
                          MissingDependencyException)

# A falsy lookup result for `exiftool` aborts the import of this plugin
# module.
if not find_in_path('exiftool'):
    raise MissingDependencyException(
        "Could not find executable `exiftool` in $PATH. "
        "Please install the appropriate package(s)!")


class CHDKPTPException(Exception):
    """ Exception type of the CHDK camera plugin module.

    Adds no state or behaviour to :py:class:`Exception`.
    """


class CHDKCameraDevice(DevicePlugin):
    """ Plugin for digital cameras running the CHDK firmware.

    """

    features = (DeviceFeatures.PREVIEW, DeviceFeatures.IS_CAMERA)
Example #26
0
# -*- coding: utf-8 -*-

from __future__ import division, unicode_literals

import logging
import os
import subprocess

from spreads.plugin import HookPlugin
from spreads.util import MissingDependencyException, find_in_path

# A falsy lookup result for `djvubind` aborts the import of this plugin
# module.
if not find_in_path('djvubind'):
    raise MissingDependencyException(
        "Could not find executable `djvubind` in $PATH. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.djvubind')


class DjvuBindPlugin(HookPlugin):
    """ Plugin that runs `djvubind` on the images of a project. """
    __name__ = 'djvubind'

    def output(self, path):
        """ Run `djvubind` on the ``done`` directory of a project.

        ``--no-ocr`` is passed if that directory does not contain any
        ``*.html`` files.

        :param path: Project directory (must support ``/`` and ``.name``)
        :raises subprocess.CalledProcessError: if `djvubind` exits with a
                     non-zero status
        """
        logger.info("Assembling DJVU.")
        img_dir = path / 'done'
        # NOTE(review): djvu_file is computed, but not used by the code
        #               visible here.
        djvu_file = path / 'out' / "{0}.djvu".format(path.name)
        cmd = ["djvubind", unicode(img_dir)]
        # glob() hands back a lazy iterator, which is truthy even when it
        # yields nothing; hence we have to check for an actual match.
        if not any(True for _ in img_dir.glob("*.html")):
            cmd.append("--no-ocr")
        logger.debug("Running " + " ".join(cmd))
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
Example #27
0
# -*- coding: utf-8 -*-

from __future__ import division, unicode_literals

import logging
import os
import subprocess

from spreads.plugin import HookPlugin
from spreads.util import MissingDependencyException, find_in_path

# A falsy lookup result for `djvubind` aborts the import of this plugin
# module.
if not find_in_path("djvubind"):
    raise MissingDependencyException(
        "Could not find executable `djvubind` in $PATH. "
        "Please install the appropriate package(s)!"
    )

logger = logging.getLogger("spreadsplug.djvubind")


class DjvuBindPlugin(HookPlugin):
    """Plugin that runs `djvubind` on the images of a project."""

    def output(self, path):
        """Run `djvubind` on ``<path>/done`` and move the resulting file.

        ``--no-ocr`` is passed if the ``ocr`` option of the ``djvubind``
        configuration section is ``"none"``. Afterwards ``book.djvu`` is
        renamed (relative to the current working directory) to
        ``<path>/out/<basename>.djvu``.

        :param path: Project directory, as a string path
        :raises subprocess.CalledProcessError: if `djvubind` exits with a
                     non-zero status
        """
        logger.info("Assembling DJVU.")
        source_dir = os.path.join(path, "done")
        target_name = "{0}.djvu".format(os.path.basename(path))
        target_file = os.path.join(path, "out", target_name)

        command = ["djvubind", source_dir]
        ocr_setting = self.config["djvubind"]["ocr"].get(unicode)
        if ocr_setting == "none":
            command.append("--no-ocr")

        logger.debug("Running " + " ".join(command))
        # The captured output is not used any further
        subprocess.check_output(command, stderr=subprocess.STDOUT)
        os.rename("book.djvu", target_file)