Example #1
0
    def _perform_ocr(self, in_paths, out_dir, language):
        """ For each input image, launch tesseract and keep track of how far
            along the work is.

        :param in_paths:    Input images
        :type in_paths:     list of :py:class:`pathlib.Path`
        :param out_dir:     Output directory for hOCR files
        :type out_dir:      :py:class:`pathlib.Path`
        :param language:    Language to use for OCRing
        :type language:     unicode
        """
        processes = []

        def _clean_processes():
            """ Remove completed processes and emit an
                :py:attr:`on_progressed` signal for each of them.
            """
            for p in processes[:]:
                if p.poll() is not None:
                    processes.remove(p)
                    _clean_processes.num_cleaned += 1
                    self.on_progressed.send(
                        self, progress=float(_clean_processes
                                             .num_cleaned)/len(in_paths))
        _clean_processes.num_cleaned = 0

        # Run as many simultaneous tesseract instances as there are CPU cores
        max_procs = multiprocessing.cpu_count()
        FNULL = open(os.devnull, 'w')
        try:
            for fpath in in_paths:
                # Wait until another process has finished
                while len(processes) >= max_procs:
                    _clean_processes()
                    time.sleep(0.01)
                cmd = [BIN, unicode(fpath), unicode(out_dir / fpath.stem),
                       "-l", language, "hocr"]
                logger.debug(cmd)
                proc = util.get_subprocess(cmd, stderr=FNULL, stdout=FNULL)
                processes.append(proc)
            # Wait for remaining processes to finish
            while processes:
                _clean_processes()
                # BUGFIX: sleep here as well to avoid busy-waiting at 100%
                # CPU while the last processes run to completion
                time.sleep(0.01)
        finally:
            # BUGFIX: the devnull handle was previously never closed
            FNULL.close()
Example #2
0
    def _generate_output(self, projectfile, out_dir, num_pages):
        """ Run last step for the project file and keep track of the progress
            by emitting :py:attr:`on_progressed` signals.

        :param projectfile:     Path ScanTailor configuration file
        :type projectfile:      :py:class:`pathlib.Path`
        :param out_dir:         Output directory for processed files
        :type out_dir:          :py:class:`pathlib.Path`
        :param num_pages:       Total number of pages to process
        :type num_pages:        int
        """
        logger.debug("Generating output...")
        temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
        try:
            split_config = self._split_configuration(projectfile, temp_dir)
            logger.debug("Launching those subprocesses!")
            processes = [util.get_subprocess([CLI_BIN, '--start-filter=6',
                                              unicode(cfgfile),
                                              unicode(out_dir)])
                         for cfgfile in split_config]

            last_count = 0
            while processes:
                # Progress is inferred from the number of files created so far
                recent_count = sum(1 for x in out_dir.glob('*.tif'))
                if recent_count > last_count:
                    # Output generation is the second half of the whole run
                    progress = 0.5 + (float(recent_count)/num_pages)/2
                    self.on_progressed.send(self, progress=progress)
                    last_count = recent_count
                for p in processes[:]:
                    if p.poll() is not None:
                        processes.remove(p)
                time.sleep(.01)
        finally:
            # BUGFIX: previously the temporary directory was leaked when an
            # exception occurred while the subprocesses were being monitored
            shutil.rmtree(unicode(temp_dir))
Example #3
0
 def __init__(self, config):
     """ Initialize the plugin and detect the installed ScanTailor flavor.

     :param config: Plugin configuration
     """
     super(ScanTailorPlugin, self).__init__(config)
     # Capture the CLI usage text so we can probe it for features
     help_out = util.get_subprocess([CLI_BIN],
                                    stdout=subprocess.PIPE).communicate()[0]
     # The 'enhanced' fork advertises an `<images|directory|->` argument in
     # its usage output.
     # NOTE(review): hard-coding line index 7 is fragile and raises
     # IndexError on shorter help output -- confirm it holds for both the
     # vanilla and the enhanced ScanTailor builds.
     self._enhanced = bool(
         re.match(r".*<images\|directory\|->.*",
                  help_out.splitlines()[7]))
Example #4
0
    def _generate_output(self, projectfile, out_dir, num_pages):
        """ Execute the final ScanTailor step for every partial project file
            and emit :py:attr:`on_progressed` signals while they run.

        :param projectfile:     Path ScanTailor configuration file
        :type projectfile:      :py:class:`pathlib.Path`
        :param out_dir:         Output directory for processed files
        :type out_dir:          :py:class:`pathlib.Path`
        :param num_pages:       Total number of pages to process
        :type num_pages:        int
        """
        logger.debug("Generating output...")
        temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
        split_config = self._split_configuration(projectfile, temp_dir)
        logger.debug("Launching those subprocesses!")
        processes = []
        for cfgfile in split_config:
            cmd = [CLI_BIN, '--start-filter=6', unicode(cfgfile),
                   unicode(out_dir)]
            processes.append(util.get_subprocess(cmd))

        num_generated = 0
        while processes:
            # Derive progress from the number of output files created so far
            current = sum(1 for x in out_dir.glob('*.tif'))
            if current > num_generated:
                num_generated = current
                # Output generation makes up the second half of the process
                self.on_progressed.send(
                    self,
                    progress=0.5 + (float(num_generated)/num_pages)/2)
            # Drop every subprocess that has terminated in the meantime
            processes = [p for p in processes if p.poll() is None]
            time.sleep(.01)
        shutil.rmtree(unicode(temp_dir))
Example #5
0
    def _perform_ocr(self, in_paths, out_dir, language):
        """ For each input image, launch tesseract and keep track of how far
            along the work is.

        :param in_paths:    Input images
        :type in_paths:     list of :py:class:`pathlib.Path`
        :param out_dir:     Output directory for hOCR files
        :type out_dir:      :py:class:`pathlib.Path`
        :param language:    Language to use for OCRing, must be among tesseract
                            languages installed on the system.
        :type language:     unicode
        """
        processes = []

        def _clean_processes():
            """ Go through processes, remove completed and emit a
                :py:attr:`on_progressed` signal for it.
            """
            for p in processes[:]:
                if p.poll() is not None:
                    processes.remove(p)
                    _clean_processes.num_cleaned += 1
                    self.on_progressed.send(
                        self,
                        progress=float(_clean_processes.num_cleaned) /
                        len(in_paths))

        _clean_processes.num_cleaned = 0

        # Run as many simultaneous Tesseract instances as there are CPU cores
        max_procs = multiprocessing.cpu_count()
        devnull = open(os.devnull, 'w')
        try:
            for fpath in in_paths:
                # Wait until another process has finished
                while len(processes) >= max_procs:
                    _clean_processes()
                    time.sleep(0.01)
                cmd = [
                    BIN,
                    unicode(fpath),
                    unicode(out_dir / fpath.stem), "-l", language, "hocr"
                ]
                logger.debug(cmd)
                proc = util.get_subprocess(cmd, stderr=devnull, stdout=devnull)
                processes.append(proc)
            # Wait for remaining processes to finish
            while processes:
                _clean_processes()
                # BUGFIX: sleep here as well to avoid busy-waiting at 100%
                # CPU while the final processes run to completion
                time.sleep(0.01)
        finally:
            # BUGFIX: the devnull handle was previously never closed
            devnull.close()
Example #6
0
    def _perform_ocr(self, in_paths, out_dir, language):
        """ For each input image, launch tesseract and keep track of how far
            along the work is.

        :param in_paths:    Input images
        :type in_paths:     list of :py:class:`pathlib.Path`
        :param out_dir:     Output directory for hOCR files
        :type out_dir:      :py:class:`pathlib.Path`
        :param language:    Language to use for OCRing, must be among tesseract
                            languages installed on the system.
        :type language:     unicode
        """
        processes = []

        def _clean_processes():
            """ Go through processes, remove completed and emit a
                :py:attr:`on_progressed` signal for it.
            """
            for p in processes[:]:
                if p.poll() is not None:
                    processes.remove(p)
                    _clean_processes.num_cleaned += 1
                    self.on_progressed.send(
                        self, progress=float(_clean_processes
                                             .num_cleaned)/len(in_paths))
        _clean_processes.num_cleaned = 0

        # Run as many simultaneous Tesseract instances as there are CPU cores
        max_procs = multiprocessing.cpu_count()
        devnull = open(os.devnull, 'w')
        try:
            for fpath in in_paths:
                # Wait until another process has finished
                while len(processes) >= max_procs:
                    _clean_processes()
                    time.sleep(0.01)
                cmd = [BIN, unicode(fpath), unicode(out_dir / fpath.stem),
                       "-l", language, "hocr"]
                logger.debug(cmd)
                proc = util.get_subprocess(cmd, stderr=devnull, stdout=devnull)
                processes.append(proc)
            # Wait for remaining processes to finish
            while processes:
                _clean_processes()
                # BUGFIX: sleep to avoid busy-waiting at 100% CPU
                time.sleep(0.01)
        finally:
            # BUGFIX: the devnull handle was previously never closed
            devnull.close()
Example #7
0
    def _generate_output(self, projectfile, out_dir, num_pages):
        """ Run last step for the project file and keep track of the progress
            by emitting :py:attr:`on_progressed` signals.

        :param projectfile:     Path ScanTailor configuration file
        :type projectfile:      :py:class:`pathlib.Path`
        :param out_dir:         Output directory for processed files
        :type out_dir:          :py:class:`pathlib.Path`
        :param num_pages:       Total number of pages to process
        :type num_pages:        int
        """
        logger.debug("Generating output...")
        temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
        try:
            split_config = self._split_configuration(projectfile, temp_dir)
            logger.debug("Launching those subprocesses!")
            processes = [util.get_subprocess([CLI_BIN, '--start-filter=6',
                                              unicode(cfgfile),
                                              unicode(out_dir)])
                         for cfgfile in split_config]

            last_count = 0
            while processes:
                recent_count = sum(1 for x in out_dir.glob('*.tif'))
                if recent_count > last_count:
                    # Output generation is the second half of the process
                    progress = 0.5 + (float(recent_count)/num_pages)/2
                    self.on_progressed.send(self, progress=progress)
                    last_count = recent_count
                for p in processes[:]:
                    if p.poll() is not None:
                        processes.remove(p)
                time.sleep(.01)
        finally:
            # BUGFIX: remove the temporary directory even when the monitoring
            # loop raises, instead of leaking it
            shutil.rmtree(unicode(temp_dir))
Example #8
0
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where rotated images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        """
        autopilot = self.config['autopilot'].get(bool)
        if not autopilot and not util.find_in_path('scantailor'):
            raise util.MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories
        projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()), projectfile,
                                     out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            # BUGFIX: wait for the GUI to be closed; previously the output
            # check below ran immediately, before the user could do anything
            proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
            proc.wait()
        # Check if the user already generated output files from the GUI
        if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path / fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn(
                    "Could not find page for output file {0}".format(fname))

        # Remove temporary files/directories
        shutil.rmtree(unicode(out_dir))
        # FIXME: This fails on Windows since there seems to be some non-gcable
        #        reference to the file around, but I currently cannot figure
        #        out where, so we just ignore the error...
        try:
            projectfile.unlink()
        except OSError as e:
            # BUGFIX: `WindowsError` only exists on Windows (referencing it
            # elsewhere raises NameError) and errors other than 'file in use'
            # (errno 32) were silently swallowed.  `WindowsError` subclasses
            # `OSError`, so this stays backwards-compatible on Windows.
            if e.errno != 32:
                raise
Example #9
0
    def _generate_configuration(self, in_paths, projectfile, out_dir):
        """ Run images through ScanTailor pre-processing steps.

        :param in_paths:        Paths to images to be processed
        :type in_paths:         list of :py:class:`pathlib.Path`
        :param projectfile:     Path ScanTailor configuration file
        :type projectfile:      :py:class:`pathlib.Path`
        :param out_dir:         Output directory for processed files
        :type out_dir:          :py:class:`pathlib.Path`
        """
        # Filters are numbered from 1 to 6, with 6 being the 'create output
        # files' step.
        filterconf = [
            self.config[x].get(bool)
            for x in ('rotate', 'split_pages', 'deskew', 'content',
                      'auto_margins')
        ]
        # First and last enabled filter delimit the range of steps to run
        start_filter = filterconf.index(True) + 1
        end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
        marginconf = self.config['margins'].as_str_seq()

        # Build initial command-line
        generation_cmd = [
            CLI_BIN, '--start-filter={0}'.format(start_filter),
            '--end-filter={0}'.format(end_filter), '--layout=1.5',
            '-o={0}'.format(projectfile)
        ]

        # The 'enhanced' fork of ScanTailor has some additional features
        page_detection = self.config['detection'].get() == 'page'
        if self._enhanced and page_detection:
            generation_cmd.extend([
                '--enable-page-detection', '--disable-content-detection',
                '--enable-fine-tuning'
            ])
        else:
            # Margin order: top, right, bottom, left
            generation_cmd.extend([
                '--margins-top={0}'.format(marginconf[0]),
                '--margins-right={0}'.format(marginconf[1]),
                '--margins-bottom={0}'.format(marginconf[2]),
                '--margins-left={0}'.format(marginconf[3]),
            ])
        if IS_WIN:
            # NOTE: Due to Window's commandline length limit of 8192 chars,
            #       we have to pipe in the list of files via stdin
            generation_cmd.append("-")
        else:
            generation_cmd.extend(in_paths)

        generation_cmd.append(unicode(out_dir))
        logger.debug(" ".join(generation_cmd))
        if IS_WIN:
            sp = util.get_subprocess(generation_cmd, stdin=subprocess.PIPE)
            sp.stdin.write(" ".join(in_paths))
            sp.stdin.close()
        else:
            sp = util.get_subprocess(generation_cmd)
        proc = psutil.Process(sp.pid)

        # Keep track of the progress by monitoring the files opened by the
        # ScanTailor process. Since it processes the files in order and we
        # know in advance how often a file will be opened (= number of steps)
        # we can reliably calculate how far along we are and emit
        # :py:attr:`on_progressed` events.
        num_images = len(in_paths)
        num_steps = (end_filter - start_filter) + 1
        last_fileidx = 0
        recent_fileidx = 0
        finished_steps = 0
        while proc.is_running():
            try:
                # Index of the input file the process currently has open
                recent_fileidx = next(
                    in_paths.index(x.path) for x in proc.open_files()
                    if x.path in in_paths)
            except StopIteration:
                pass
            except psutil.AccessDenied:
                # This means the process is no longer running
                break
            if recent_fileidx == last_fileidx:
                time.sleep(.01)
                continue
            # A lower index than before means a new filter step has started
            if recent_fileidx < last_fileidx:
                finished_steps += 1
            last_fileidx = recent_fileidx
            # Configuration makes up the first half of the overall progress
            progress = 0.5 * ((finished_steps * num_images + last_fileidx) /
                              float(num_steps * num_images))
            self.on_progressed.send(self, progress=progress)
Example #10
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Go through pages and bundle their most recent images into a PDF
            file.

        :param pages:               Pages to bundle
        :param target_path:         list of :py:class:`spreads.workflow.Page`
        :param metadata:            Metadata to include in PDF file
        :type metadata:             :py:class:`spreads.metadata.Metadata`
        :param table_of_contents:   Table of contents to include in PDF file
        :type table_of_contents:    list of :py:class:`TocEntry`
        """
        logger.info("Assembling PDF.")

        tmpdir = Path(tempfile.mkdtemp())

        # Write metadata in the format pdfbeads expects
        meta_file = tmpdir/'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir/fpath.name)
            # Symlinks are not reliably available on Windows, so copy there
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir/ocr_path.name))
                else:
                    (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        pdf_file = target_path.absolute()/"book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))

        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the error
            #       output gets huge, creating a deadlock. Hence, we go the
            #       safe way and use `communicate()`, though this means no
            #       progress notification for the user.
            output, errors = proc.communicate()
        else:
            errors = ""
            is_jbig2 = False
            cur_jbig2_page = 0
            # Parse pdfbeads' stderr output to derive progress information
            while proc.poll() is None:
                cur_line = proc.stderr.readline()
                errors += "\n" + cur_line
                prep_match = re.match(r"^Prepared data for processing (.*)$",
                                      cur_line)
                proc_match = re.match(r"^Processed (.*)$", cur_line)
                jbig2_match = re.match(
                    r"^JBIG2 compression complete. pages:(\d+) symbols:\d+ "
                    r"log2:\d+$", cur_line)
                progress = None
                if prep_match:
                    file_idx = next(idx for idx, f in enumerate(images)
                                    if unicode(f) == prep_match.group(1))
                    # BUGFIX: this (and the two calculations below) used
                    # integer division under Python 2, truncating the
                    # reported progress to 0
                    progress = float(file_idx)/(len(images)*2)
                elif jbig2_match:
                    cur_jbig2_page += int(jbig2_match.group(1))
                    progress = (float(len(images) + cur_jbig2_page) /
                                (len(images)*2))
                    is_jbig2 = True
                elif proc_match and not is_jbig2:
                    file_idx = next(idx for idx, f in enumerate(images)
                                    if unicode(f) == proc_match.group(1))
                    progress = float(len(images) + file_idx)/(len(images)*2)
                if progress is not None:
                    self.on_progressed.send(self, progress=progress)
                time.sleep(.01)
            output = proc.stdout.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
Example #11
0
    def _generate_configuration(self, in_paths, projectfile, out_dir):
        """ Run images through ScanTailor pre-processing steps.

        :param in_paths:        Paths to images to be processed
        :type in_paths:         list of :py:class:`pathlib.Path`
        :param projectfile:     Path ScanTailor configuration file
        :type projectfile:      :py:class:`pathlib.Path`
        :param out_dir:         Output directory for processed files
        :type out_dir:          :py:class:`pathlib.Path`
        """
        # Filters are numbered from 1 to 6, with 6 being the 'create output
        # files' step.
        filterconf = [self.config[x].get(bool)
                      for x in ('rotate', 'split_pages', 'deskew', 'content',
                                'auto_margins')]
        # First and last enabled filter delimit the range of steps to run
        start_filter = filterconf.index(True)+1
        end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
        marginconf = self.config['margins'].as_str_seq()

        # Build initial command-line
        generation_cmd = [CLI_BIN,
                          '--start-filter={0}'.format(start_filter),
                          '--end-filter={0}'.format(end_filter),
                          '--layout=1.5',
                          '-o={0}'.format(projectfile)]

        # The 'enhanced' fork of ScanTailor has some additional features
        page_detection = self.config['detection'].get() == 'page'
        if self._enhanced and page_detection:
            generation_cmd.extend([
                '--enable-page-detection',
                '--disable-content-detection',
                '--enable-fine-tuning'
            ])
        else:
            # Margin order: top, right, bottom, left
            generation_cmd.extend([
                '--margins-top={0}'.format(marginconf[0]),
                '--margins-right={0}'.format(marginconf[1]),
                '--margins-bottom={0}'.format(marginconf[2]),
                '--margins-left={0}'.format(marginconf[3]),
            ])
        if IS_WIN:
            # NOTE: Due to Window's commandline length limit of 8192 chars,
            #       we have to pipe in the list of files via stdin
            generation_cmd.append("-")
        else:
            generation_cmd.extend(in_paths)

        generation_cmd.append(unicode(out_dir))
        logger.debug(" ".join(generation_cmd))
        if IS_WIN:
            sp = util.get_subprocess(generation_cmd, stdin=subprocess.PIPE)
            sp.stdin.write(" ".join(in_paths))
            sp.stdin.close()
        else:
            sp = util.get_subprocess(generation_cmd)
        proc = psutil.Process(sp.pid)

        # Keep track of the progress by monitoring the files opened by the
        # ScanTailor process. Since it processes the files in order and we
        # know in advance how often a file will be opened (= number of steps)
        # we can reliably calculate how far along we are and emit
        # :py:attr:`on_progressed` events.
        num_images = len(in_paths)
        num_steps = (end_filter - start_filter)+1
        last_fileidx = 0
        recent_fileidx = 0
        finished_steps = 0
        while proc.is_running():
            try:
                # Index of the input file the process currently has open
                recent_fileidx = next(in_paths.index(x.path)
                                      for x in proc.open_files()
                                      if x.path in in_paths)
            except StopIteration:
                pass
            except psutil.AccessDenied:
                # This means the process is no longer running
                break
            if recent_fileidx == last_fileidx:
                time.sleep(.01)
                continue
            # A lower index than before means a new filter step has started
            if recent_fileidx < last_fileidx:
                finished_steps += 1
            last_fileidx = recent_fileidx
            # Configuration makes up the first half of the overall progress
            progress = 0.5*((finished_steps*num_images+last_fileidx) /
                            float(num_steps*num_images))
            self.on_progressed.send(self, progress=progress)
Example #12
0
 def __init__(self, config):
     """ Initialize the plugin and detect the installed ScanTailor flavor.

     :param config: Plugin configuration
     """
     super(ScanTailorPlugin, self).__init__(config)
     # Probe the CLI usage text for the enhanced fork's extended argument
     help_out = util.get_subprocess([CLI_BIN],
                                    stdout=subprocess.PIPE).communicate()[0]
     # NOTE(review): hard-coded line index 7 is fragile and raises
     # IndexError on shorter help output -- confirm it holds for both the
     # vanilla and the 'enhanced' ScanTailor builds.
     self._enhanced = bool(re.match(r".*<images\|directory\|->.*",
                           help_out.splitlines()[7]))
Example #13
0
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where rotated images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        """
        autopilot = self.config['autopilot'].get(bool)
        if not autopilot and not util.find_in_path('scantailor'):
            raise util.MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories
        projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            # Block until the user has closed the GUI
            proc = util.get_subprocess([GUI_BIN, unicode(projectfile)])
            proc.wait()
        # Check if the user already generated output files from the GUI
        if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn("Could not find page for output file {0}"
                            .format(fname))

        # Remove temporary files/directories
        shutil.rmtree(unicode(out_dir))
        # FIXME: This fails on Windows since there seems to be some non-gcable
        #        reference to the file around, but I currently cannot figure
        #        out where, so we just ignore the error...
        try:
            projectfile.unlink()
        except OSError as e:
            # BUGFIX: `WindowsError` only exists on Windows (referencing it
            # elsewhere raises NameError) and errors other than 'file in use'
            # (errno 32) were silently swallowed.  `WindowsError` subclasses
            # `OSError`, so this stays backwards-compatible on Windows.
            if e.errno != 32:
                raise
Example #14
0
    def _generate_configuration(self, in_paths, projectfile, out_dir):
        filterconf = [self.config[x].get(bool)
                      for x in ('rotate', 'split_pages', 'deskew', 'content',
                                'auto_margins')]
        start_filter = filterconf.index(True)+1
        end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
        marginconf = self.config['margins'].as_str_seq()
        generation_cmd = [CLI_BIN,
                          '--start-filter={0}'.format(start_filter),
                          '--end-filter={0}'.format(end_filter),
                          '--layout=1.5',
                          '-o={0}'.format(projectfile)]
        page_detection = self.config['detection'].get() == 'page'
        if self._enhanced and page_detection:
            generation_cmd.extend([
                '--enable-page-detection',
                '--disable-content-detection',
                '--enable-fine-tuning'
            ])
        else:
            generation_cmd.extend([
                '--margins-top={0}'.format(marginconf[0]),
                '--margins-right={0}'.format(marginconf[1]),
                '--margins-bottom={0}'.format(marginconf[2]),
                '--margins-left={0}'.format(marginconf[3]),
            ])
        if IS_WIN:
            # NOTE: Due to Window's commandline length limit of 8192 chars,
            #       we have to pipe in the list of files via stdin
            generation_cmd.append("-")
        else:
            generation_cmd.extend(in_paths)

        generation_cmd.append(unicode(out_dir))
        logger.debug(" ".join(generation_cmd))
        if IS_WIN:
            sp = util.get_subprocess(generation_cmd, stdin=subprocess.PIPE)
            sp.stdin.write(" ".join(in_paths))
            sp.stdin.close()
        else:
            sp = util.get_subprocess(generation_cmd)
        proc = psutil.Process(sp.pid)

        num_images = len(in_paths)
        num_steps = (end_filter - start_filter)+1
        last_fileidx = 0
        recent_fileidx = 0
        finished_steps = 0
        while proc.is_running():
            try:
                recent_fileidx = next(in_paths.index(x.path)
                                      for x in proc.open_files()
                                      if x.path in in_paths)
            except StopIteration:
                pass
            except psutil.AccessDenied:
                # This means the process is no longer running
                break
            if recent_fileidx == last_fileidx:
                time.sleep(.01)
                continue
            if recent_fileidx < last_fileidx:
                finished_steps += 1
            last_fileidx = recent_fileidx
            progress = 0.5*((finished_steps*num_images+last_fileidx) /
                            float(num_steps*num_images))
            self.on_progressed.send(self, progress=progress)
Example #15
0
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin
from spreads.vendor.pathlib import Path

# Locate the tesseract executable on the search path; the plugin is useless
# without it, so we fail early at import time.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. Please install the appropriate"
        " package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages, for older versions we have to read out the directory
# containing the training data for languages.
try:
    # Run `tesseract --list-langs`, merging stderr into stdout (presumably
    # because some versions print the list to stderr -- TODO confirm).  The
    # [1:-1] slice drops the first output line (presumably a header) and the
    # empty string left by the trailing newline after str.split("\n").
    AVAILABLE_LANGS = (util.get_subprocess(
        [BIN, "--list-langs"],
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE).communicate()[0].split("\n")[1:-1])
    # There should be at least a single language
    if not AVAILABLE_LANGS:
        raise ValueError()
except (subprocess.CalledProcessError, ValueError):
    # Fallback for old Tesseract releases without `--list-langs`: each
    # installed language ships a `<lang>.traineddata` file in the tessdata
    # directory.
    # NOTE(review): this path is Debian/Ubuntu-specific -- TODO confirm
    # behaviour on other platforms.
    # NOTE(review): whether `util.get_subprocess` can actually raise
    # CalledProcessError depends on that helper -- verify.
    AVAILABLE_LANGS = [
        x.stem for x in Path('/usr/share/tesseract-ocr/tessdata').glob(
            '*.traineddata')
    ]

logger = logging.getLogger('spreadsplug.tesseract')


class TesseractPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'tesseract'
Example #16
0
from spreads.plugin import HookPlugin, ProcessHooksMixin
from pathlib import Path

# Locate the tesseract executable on the search path; the plugin is useless
# without it, so we fail early at import time.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. Please install the appropriate"
        " package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages, for older versions we have to read out the directory
# containing the training data for languages.
try:
    # Run `tesseract --list-langs`, merging stderr into stdout (presumably
    # because some versions print the list to stderr -- TODO confirm).  The
    # [1:-1] slice drops the first output line (presumably a header) and the
    # empty string left by the trailing newline after str.split("\n").
    AVAILABLE_LANGS = (util.get_subprocess([BIN, "--list-langs"],
                                           stderr=subprocess.STDOUT,
                                           stdout=subprocess.PIPE)
                       .communicate()[0]
                       .split("\n")[1:-1])
    # There should be at least a single language
    if not AVAILABLE_LANGS:
        raise ValueError()
except (subprocess.CalledProcessError, ValueError):
    # Fallback for old Tesseract releases without `--list-langs`: each
    # installed language ships a `<lang>.traineddata` file in the tessdata
    # directory.
    # NOTE(review): this path is Debian/Ubuntu-specific -- TODO confirm
    # behaviour on other platforms.
    # NOTE(review): whether `util.get_subprocess` can actually raise
    # CalledProcessError depends on that helper -- verify.
    AVAILABLE_LANGS = [x.stem for x in
                       Path('/usr/share/tesseract-ocr/tessdata')
                       .glob('*.traineddata')]

logger = logging.getLogger('spreadsplug.tesseract')


class TesseractPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'tesseract'
Example #17
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the most recent image of every page into a PDF file via
            `pdfbeads`, emitting :py:attr:`on_progressed` signals along the
            way.

        :param pages:               Pages to bundle
        :param target_path:         Directory the PDF file is written to
        :param metadata:            Metadata to include in PDF file
        :param table_of_contents:   Table of contents (currently unused, see
                                    TODOs below)
        """
        logger.info("Assembling PDF.")

        # Stage everything in a throw-away directory so pdfbeads only sees
        # the files we want it to process.
        tmpdir = Path(tempfile.mkdtemp())

        # Write title/author metadata in the key-value format understood by
        # pdfbeads' `-M` option.
        meta_file = tmpdir/'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        # Collect the best available image for every page (plus the hOCR
        # text layer if OCR was run).  Copy on Windows, symlink elsewhere
        # to avoid the extra I/O.
        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir/fpath.name)
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir/ocr_path.name))
                else:
                    (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        pdf_file = target_path.absolute()/"book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))
        try:
            cmd = [BIN, "-d", "-M", unicode(meta_file)]
            if IS_WIN:
                cmd.append(util.wildcardify(tuple(f.name for f in images)))
            else:
                cmd.extend([unicode(f) for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE, shell=IS_WIN)
            if IS_WIN:
                # NOTE: Due to a bug in the jbig2enc version for Windows,
                #       the error output gets huge, creating a deadlock.
                #       Hence, we go the safe way and use `communicate()`,
                #       though this means no progress notification for the
                #       user.
                output, errors = proc.communicate()
            else:
                # Estimate progress by counting the *.jbig2 intermediates
                # pdfbeads drops into the working directory.
                last_count = 0
                while proc.poll() is None:
                    current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                    if current_count > last_count:
                        last_count = current_count
                        self.on_progressed.send(
                            self, progress=float(current_count)/len(images))
                    time.sleep(.01)
                output = proc.stdout.read()
                errors = proc.stderr.read()
            logger.debug("pdfbeads stdout:\n{0}".format(output))
            logger.debug("pdfbeads stderr:\n{0}".format(errors))
        finally:
            # FIX: always restore the working directory and remove the
            # temporary directory -- previously an exception anywhere in the
            # pdfbeads run left the process chdir()'d into tmpdir and leaked
            # the directory.
            os.chdir(old_path)
            shutil.rmtree(unicode(tmpdir))
Example #18
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Go through pages and bundle their most recent images into a PDF
            file, emitting :py:attr:`on_progressed` signals along the way.

        :param pages:               Pages to bundle
        :param target_path:         Directory the PDF file is written to
        :param metadata:            Metadata to include in PDF file
        :type metadata:             :py:class:`spreads.metadata.Metadata`
        :param table_of_contents:   Table of contents to include in PDF file
        :type table_of_contents:    list of :py:class:`TocEntry`
        """
        logger.info("Assembling PDF.")

        # Stage everything in a throw-away directory so pdfbeads only sees
        # the files we want it to process.
        tmpdir = Path(tempfile.mkdtemp())

        # Write title/author metadata in the key-value format understood by
        # pdfbeads' `-M` option.
        meta_file = tmpdir / 'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        # Collect the best available image for every page (plus the hOCR
        # text layer if OCR was run).  Copy on Windows, symlink elsewhere
        # to avoid the extra I/O.
        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir / fpath.name)
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir / ocr_path.name))
                else:
                    (tmpdir / ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        pdf_file = target_path.absolute() / "book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))
        try:
            cmd = [BIN, "-d", "-M", unicode(meta_file)]
            if IS_WIN:
                cmd.append(util.wildcardify(tuple(f.name for f in images)))
            else:
                cmd.extend([unicode(f) for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            proc = util.get_subprocess(cmd,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       shell=IS_WIN)
            if IS_WIN:
                # NOTE: Due to a bug in the jbig2enc version for Windows,
                #       the error output gets huge, creating a deadlock.
                #       Hence, we go the safe way and use `communicate()`,
                #       though this means no progress notification for the
                #       user.
                output, errors = proc.communicate()
            else:
                # Parse pdfbeads' stderr line by line to estimate progress:
                # the first half of the run prepares/processes the input
                # images, the second half is the JBIG2 compression.
                errors = ""
                is_jbig2 = False
                cur_jbig2_page = 0
                while proc.poll() is None:
                    cur_line = proc.stderr.readline()
                    errors += "\n" + cur_line
                    prep_match = re.match(
                        r"^Prepared data for processing (.*)$", cur_line)
                    proc_match = re.match(r"^Processed (.*)$", cur_line)
                    jbig2_match = re.match(
                        r"^JBIG2 compression complete. pages:(\d+) "
                        r"symbols:\d+ log2:\d+$", cur_line)
                    progress = None
                    # FIX: all three progress computations below used plain
                    # `/` on integers, which truncates to 0 under Python 2
                    # (the rest of this file uses float() for progress, so
                    # no `from __future__ import division` is in effect).
                    # FIX: `next()` now takes a default so an unexpected
                    # filename in pdfbeads' output no longer raises
                    # StopIteration.
                    if prep_match:
                        file_idx = next(
                            (idx for idx, f in enumerate(images)
                             if unicode(f) == prep_match.group(1)), None)
                        if file_idx is not None:
                            progress = file_idx / float(len(images) * 2)
                    elif jbig2_match:
                        cur_jbig2_page += int(jbig2_match.group(1))
                        progress = ((len(images) + cur_jbig2_page) /
                                    float(len(images) * 2))
                        is_jbig2 = True
                    elif proc_match and not is_jbig2:
                        file_idx = next(
                            (idx for idx, f in enumerate(images)
                             if unicode(f) == proc_match.group(1)), None)
                        if file_idx is not None:
                            progress = ((len(images) + file_idx) /
                                        float(len(images) * 2))
                    if progress is not None:
                        self.on_progressed.send(self, progress=progress)
                    time.sleep(.01)
                output = proc.stdout.read()
            logger.debug("pdfbeads stdout:\n{0}".format(output))
            logger.debug("pdfbeads stderr:\n{0}".format(errors))
        finally:
            # FIX: always restore the working directory and remove the
            # temporary directory -- previously an exception anywhere in the
            # pdfbeads run left the process chdir()'d into tmpdir and leaked
            # the directory.
            os.chdir(old_path)
            shutil.rmtree(unicode(tmpdir))