Ejemplo n.º 1
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Go through pages and bundle their most recent images into a PDF
            file.

        The images (and, where present, the 'tesseract' output of a page) are
        staged in a temporary directory, pdfbeads is run from inside that
        directory and the result is written to ``book.pdf`` below
        ``target_path``. The temporary directory is removed and the previous
        working directory restored even if staging or pdfbeads fails.

        :param pages:               Pages to bundle
        :type pages:                list of :py:class:`spreads.workflow.Page`
        :param target_path:         Directory the ``book.pdf`` is written to
        :param metadata:            Metadata to include in PDF file
        :type metadata:             :py:class:`spreads.metadata.Metadata`
        :param table_of_contents:   Table of contents to include in PDF file
                                    (not used yet, see the TODO below)
        :type table_of_contents:    list of :py:class:`TocEntry`
        """
        logger.info("Assembling PDF.")

        # Remember the working directory up front, so that the `finally`
        # clause can always restore it.
        old_path = os.path.abspath(os.path.curdir)
        tmpdir = Path(tempfile.mkdtemp())
        try:
            meta_file = tmpdir/'metadata.txt'
            with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
                for key, value in metadata.iteritems():
                    if key == 'title':
                        fp.write("Title: \"{0}\"\n".format(value))
                    if key == 'creator':
                        for author in value:
                            fp.write("Author: \"{0}\"\n".format(author))

            images = []
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    # No processed image available, fall back to the raw one
                    fpath = page.raw_image
                link_path = (tmpdir/fpath.name)
                if IS_WIN:
                    shutil.copy(unicode(fpath), unicode(link_path))
                else:
                    link_path.symlink_to(fpath.absolute())
                if 'tesseract' in page.processed_images:
                    ocr_path = page.processed_images['tesseract']
                    if IS_WIN:
                        shutil.copy(unicode(ocr_path),
                                    unicode(tmpdir/ocr_path.name))
                    else:
                        (tmpdir/ocr_path.name).symlink_to(
                            ocr_path.absolute())
                images.append(link_path.absolute())

            pdf_file = target_path.absolute()/"book.pdf"

            # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
            # TODO: Use page.page_label to create a LSPEC for pdfbeads

            # NOTE: pdfbeads only finds *html files for the text layer in the
            #       working directory, so we have to chdir() into it
            os.chdir(unicode(tmpdir))

            cmd = [BIN, "-d", "-M", unicode(meta_file)]
            if IS_WIN:
                cmd.append(util.wildcardify(tuple(f.name for f in images)))
            else:
                cmd.extend([unicode(f) for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE, shell=IS_WIN)
            if IS_WIN:
                # NOTE: Due to a bug in the jbig2enc version for Windows, the
                #       error output gets huge, creating a deadlock. Hence, we
                #       go the safe way and use `communicate()`, though this
                #       means no progress notification for the user.
                output, errors = proc.communicate()
            else:
                # Position of every staged image, keyed by the path that was
                # passed to pdfbeads. The first occurrence wins.
                image_index = {}
                for idx, f in enumerate(images):
                    image_index.setdefault(unicode(f), idx)
                # Float, so that progress is a fraction even without
                # `from __future__ import division`.
                total_steps = float(len(images) * 2)

                err_lines = []
                is_jbig2 = False
                cur_jbig2_page = 0
                while proc.poll() is None:
                    cur_line = proc.stderr.readline()
                    err_lines.append(cur_line)
                    prep_match = re.match(
                        r"^Prepared data for processing (.*)$", cur_line)
                    proc_match = re.match(r"^Processed (.*)$", cur_line)
                    jbig2_match = re.match(
                        r"^JBIG2 compression complete. pages:(\d+) "
                        r"symbols:\d+ log2:\d+$", cur_line)
                    # Number of finished steps out of `total_steps`, or None
                    # if the line carries no usable progress information
                    # (including file names that were never passed in).
                    done = None
                    if prep_match:
                        done = image_index.get(prep_match.group(1))
                    elif jbig2_match:
                        cur_jbig2_page += int(jbig2_match.group(1))
                        done = len(images) + cur_jbig2_page
                        is_jbig2 = True
                    elif proc_match and not is_jbig2:
                        file_idx = image_index.get(proc_match.group(1))
                        if file_idx is not None:
                            done = len(images) + file_idx
                    if done is not None and total_steps:
                        self.on_progressed.send(self,
                                                progress=done/total_steps)
                    time.sleep(.01)
                # The process has exited, but there may still be unread
                # output in the pipe; keep it for the debug log.
                remainder = proc.stderr.read()
                if remainder:
                    err_lines.append(remainder)
                errors = "".join("\n" + line for line in err_lines)
                output = proc.stdout.read()
            logger.debug("pdfbeads stdout:\n{0}".format(output))
            logger.debug("pdfbeads stderr:\n{0}".format(errors))
        finally:
            # Leave the temporary directory before deleting it
            os.chdir(old_path)
            shutil.rmtree(unicode(tmpdir))
Ejemplo n.º 2
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the most recent image of every page into a PDF file.

        The images (and, where present, the 'tesseract' output of a page) are
        staged in a temporary directory, pdfbeads is run from inside that
        directory and the result is written to ``book.pdf`` below
        ``target_path``. Outside of Windows, progress is reported from the
        number of ``*.jbig2`` files found in the temporary directory. The
        temporary directory is removed and the previous working directory
        restored even if staging or pdfbeads fails.

        :param pages:               Pages to bundle
        :param target_path:         Directory the ``book.pdf`` is written to
        :param metadata:            Metadata to include in PDF file; only the
                                    'title' and 'creator' keys are used
        :param table_of_contents:   Table of contents to include in PDF file
                                    (not used yet, see the TODO below)
        """
        # Function-level import, the thread is only needed for draining the
        # pipes of the pdfbeads process below.
        import threading

        logger.info("Assembling PDF.")

        # Remember the working directory up front, so that the `finally`
        # clause can always restore it.
        old_path = os.path.abspath(os.path.curdir)
        tmpdir = Path(tempfile.mkdtemp())
        try:
            meta_file = tmpdir/'metadata.txt'
            with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
                for key, value in metadata.iteritems():
                    if key == 'title':
                        fp.write("Title: \"{0}\"\n".format(value))
                    if key == 'creator':
                        for author in value:
                            fp.write("Author: \"{0}\"\n".format(author))

            images = []
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    # No processed image available, fall back to the raw one
                    fpath = page.raw_image
                link_path = (tmpdir/fpath.name)
                if IS_WIN:
                    shutil.copy(unicode(fpath), unicode(link_path))
                else:
                    link_path.symlink_to(fpath.absolute())
                if 'tesseract' in page.processed_images:
                    ocr_path = page.processed_images['tesseract']
                    if IS_WIN:
                        shutil.copy(unicode(ocr_path),
                                    unicode(tmpdir/ocr_path.name))
                    else:
                        (tmpdir/ocr_path.name).symlink_to(
                            ocr_path.absolute())
                images.append(link_path.absolute())

            pdf_file = target_path.absolute()/"book.pdf"

            # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
            # TODO: Use page.page_label to create a LSPEC for pdfbeads

            # NOTE: pdfbeads only finds *html files for the text layer in the
            #       working directory, so we have to chdir() into it
            os.chdir(unicode(tmpdir))

            cmd = [BIN, "-d", "-M", unicode(meta_file)]
            if IS_WIN:
                cmd.append(util.wildcardify(tuple(f.name for f in images)))
            else:
                cmd.extend([unicode(f) for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE, shell=IS_WIN)
            if IS_WIN:
                # NOTE: Due to a bug in the jbig2enc version for Windows, the
                #       error output gets huge, creating a deadlock. Hence, we
                #       go the safe way and use `communicate()`, though this
                #       means no progress notification for the user.
                output, errors = proc.communicate()
            else:
                # Both pipes have to be read while the process is running:
                # once a pipe buffer is full the child blocks on writing and
                # would never exit. `communicate()` does the reading on a
                # helper thread, while this thread keeps reporting progress.
                captured = {}

                def _drain():
                    captured['out'], captured['err'] = proc.communicate()

                drainer = threading.Thread(target=_drain)
                drainer.daemon = True
                drainer.start()

                last_count = 0
                while drainer.is_alive():
                    current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                    # Only report when the count grew; `images` is checked
                    # to avoid a division by zero.
                    if current_count > last_count and images:
                        last_count = current_count
                        self.on_progressed.send(
                            self,
                            progress=float(current_count)/len(images))
                    time.sleep(.01)
                drainer.join()
                output = captured.get('out', "")
                errors = captured.get('err', "")
            logger.debug("pdfbeads stdout:\n{0}".format(output))
            logger.debug("pdfbeads stderr:\n{0}".format(errors))
        finally:
            # Leave the temporary directory before deleting it
            os.chdir(old_path)
            shutil.rmtree(unicode(tmpdir))
Ejemplo n.º 3
0
    def _generate_configuration(self, in_paths, projectfile, out_dir):
        """ Run scantailor-cli to generate a project file and report
            progress while it is running.

        The filter range passed to scantailor-cli spans from the first to the
        last enabled option among 'rotate', 'split_pages', 'deskew',
        'content' and 'auto_margins' (1-based). Progress is derived from the
        input files the process currently has open and is scaled to the
        range from 0 to 0.5.

        :param in_paths:        Paths of the input images; presumably a list
                                of strings, since they are compared to the
                                paths reported by psutil (verify against
                                caller)
        :param projectfile:     Path handed to scantailor-cli via ``-o``
        :param out_dir:         Output directory, passed as last argument
        :raises ValueError:     If none of the filter options is enabled
        :raises SpreadsException:   On Windows, if no wildcard could be found
                                    for ``in_paths``
        """
        filterconf = [self.config[x].get(bool)
                      for x in ('rotate', 'split_pages', 'deskew', 'content',
                                'auto_margins')]
        # 1-based positions of the first and the last enabled filter
        start_filter = filterconf.index(True)+1
        end_filter = len(filterconf) - list(reversed(filterconf)).index(True)
        marginconf = self.config['margins'].as_str_seq()
        generation_cmd = [find_in_path('scantailor-cli'),
                          '--start-filter={0}'.format(start_filter),
                          '--end-filter={0}'.format(end_filter),
                          '--layout=1.5',
                          '-o={0}'.format(projectfile)]
        page_detection = self.config['detection'].get() == 'page'
        if self._enhanced and page_detection:
            generation_cmd.extend([
                '--enable-page-detection',
                '--disable-content-detection',
                '--enable-fine-tuning'
            ])
        else:
            generation_cmd.extend([
                '--margins-top={0}'.format(marginconf[0]),
                '--margins-right={0}'.format(marginconf[1]),
                '--margins-bottom={0}'.format(marginconf[2]),
                '--margins-left={0}'.format(marginconf[3]),
            ])
        # NOTE: We cannot pass individual filenames on windows, since we have
        # a limit of 32,768 characters for commands. Thus, we first try to
        # find a wildcard for our paths that matches only them, and if that
        # fails, throw an Exception and tell the user to use a proper OS...
        wildcard = wildcardify(in_paths)
        if not wildcard and IS_WIN:
            raise SpreadsException("Please use a proper operating system.")
        elif not wildcard:
            generation_cmd.extend(in_paths)
        else:
            generation_cmd.append(wildcard)

        generation_cmd.append(unicode(out_dir))
        logger.debug(" ".join(generation_cmd))
        # Keep the Popen handle around: polling it reaps the child once it
        # has exited, so no zombie process is left behind.
        child = subprocess.Popen(generation_cmd)
        proc = psutil.Process(child.pid)

        num_images = len(in_paths)
        num_steps = (end_filter - start_filter)+1
        last_fileidx = 0
        recent_fileidx = 0
        finished_steps = 0
        while child.poll() is None:
            try:
                recent_fileidx = next(in_paths.index(x.path)
                                      for x in proc.open_files()
                                      if x.path in in_paths)
            except StopIteration:
                # None of the input files is open right now, keep last index
                pass
            except (psutil.AccessDenied, psutil.NoSuchProcess):
                # This means the process is no longer running
                break
            if recent_fileidx == last_fileidx:
                time.sleep(.01)
                continue
            if recent_fileidx < last_fileidx:
                # The index wrapped around, so a new filter step has started
                finished_steps += 1
            last_fileidx = recent_fileidx
            progress = 0.5*((finished_steps*num_images+last_fileidx) /
                            float(num_steps*num_images))
            self.on_progressed.send(self, progress=progress)
        # Make sure the child is reaped if the loop was left via `break`
        child.wait()
Ejemplo n.º 4
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Go through pages and bundle their most recent images into a PDF
            file.

        The images (and, where present, the 'tesseract' output of a page) are
        staged in a temporary directory, pdfbeads is run from inside that
        directory and the result is written to ``book.pdf`` below
        ``target_path``. The temporary directory is removed and the previous
        working directory restored even if staging or pdfbeads fails.

        :param pages:               Pages to bundle
        :type pages:                list of :py:class:`spreads.workflow.Page`
        :param target_path:         Directory the ``book.pdf`` is written to
        :param metadata:            Metadata to include in PDF file
        :type metadata:             :py:class:`spreads.metadata.Metadata`
        :param table_of_contents:   Table of contents to include in PDF file
                                    (not used yet, see the TODO below)
        :type table_of_contents:    list of :py:class:`TocEntry`
        """
        logger.info("Assembling PDF.")

        # Remember the working directory up front, so that the `finally`
        # clause can always restore it.
        old_path = os.path.abspath(os.path.curdir)
        tmpdir = Path(tempfile.mkdtemp())
        try:
            meta_file = tmpdir / 'metadata.txt'
            with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
                for key, value in metadata.iteritems():
                    if key == 'title':
                        fp.write("Title: \"{0}\"\n".format(value))
                    if key == 'creator':
                        for author in value:
                            fp.write("Author: \"{0}\"\n".format(author))

            images = []
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    # No processed image available, fall back to the raw one
                    fpath = page.raw_image
                link_path = (tmpdir / fpath.name)
                if IS_WIN:
                    shutil.copy(unicode(fpath), unicode(link_path))
                else:
                    link_path.symlink_to(fpath.absolute())
                if 'tesseract' in page.processed_images:
                    ocr_path = page.processed_images['tesseract']
                    if IS_WIN:
                        shutil.copy(unicode(ocr_path),
                                    unicode(tmpdir / ocr_path.name))
                    else:
                        (tmpdir / ocr_path.name).symlink_to(
                            ocr_path.absolute())
                images.append(link_path.absolute())

            pdf_file = target_path.absolute() / "book.pdf"

            # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
            # TODO: Use page.page_label to create a LSPEC for pdfbeads

            # NOTE: pdfbeads only finds *html files for the text layer in the
            #       working directory, so we have to chdir() into it
            os.chdir(unicode(tmpdir))

            cmd = [BIN, "-d", "-M", unicode(meta_file)]
            if IS_WIN:
                cmd.append(util.wildcardify(tuple(f.name for f in images)))
            else:
                cmd.extend([unicode(f) for f in images])
            cmd.extend(["-o", unicode(pdf_file)])
            logger.debug("Running " + " ".join(cmd))
            proc = util.get_subprocess(cmd,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       shell=IS_WIN)
            if IS_WIN:
                # NOTE: Due to a bug in the jbig2enc version for Windows, the
                #       error output gets huge, creating a deadlock. Hence, we
                #       go the safe way and use `communicate()`, though this
                #       means no progress notification for the user.
                output, errors = proc.communicate()
            else:
                # Position of every staged image, keyed by the path that was
                # passed to pdfbeads. The first occurrence wins.
                image_index = {}
                for idx, f in enumerate(images):
                    image_index.setdefault(unicode(f), idx)
                # Float, so that progress is a fraction even without
                # `from __future__ import division`.
                total_steps = float(len(images) * 2)

                err_lines = []
                is_jbig2 = False
                cur_jbig2_page = 0
                while proc.poll() is None:
                    cur_line = proc.stderr.readline()
                    err_lines.append(cur_line)
                    prep_match = re.match(
                        r"^Prepared data for processing (.*)$", cur_line)
                    proc_match = re.match(r"^Processed (.*)$", cur_line)
                    jbig2_match = re.match(
                        r"^JBIG2 compression complete. pages:(\d+) "
                        r"symbols:\d+ log2:\d+$", cur_line)
                    # Number of finished steps out of `total_steps`, or None
                    # if the line carries no usable progress information
                    # (including file names that were never passed in).
                    done = None
                    if prep_match:
                        done = image_index.get(prep_match.group(1))
                    elif jbig2_match:
                        cur_jbig2_page += int(jbig2_match.group(1))
                        done = len(images) + cur_jbig2_page
                        is_jbig2 = True
                    elif proc_match and not is_jbig2:
                        file_idx = image_index.get(proc_match.group(1))
                        if file_idx is not None:
                            done = len(images) + file_idx
                    if done is not None and total_steps:
                        self.on_progressed.send(self,
                                                progress=done / total_steps)
                    time.sleep(.01)
                # The process has exited, but there may still be unread
                # output in the pipe; keep it for the debug log.
                remainder = proc.stderr.read()
                if remainder:
                    err_lines.append(remainder)
                errors = "".join("\n" + line for line in err_lines)
                output = proc.stdout.read()
            logger.debug("pdfbeads stdout:\n{0}".format(output))
            logger.debug("pdfbeads stderr:\n{0}".format(errors))
        finally:
            # Leave the temporary directory before deleting it
            os.chdir(old_path)
            shutil.rmtree(unicode(tmpdir))