def process(self, pages, target_path):
    """ Run OCR on the most recent image of every page.

    The hOCR files written by ``self._perform_ocr`` are passed through
    ``self._fix_hocr``, copied to ``target_path`` and registered in the
    matching page's ``processed_images`` under ``self.__name__``.

    :param pages:       Pages to be processed
    :param target_path: Directory the hOCR files are copied to; it is
                        joined with ``/``, so presumably a
                        :py:class:`pathlib.Path`
    """
    # TODO: This plugin should be 'output' only, since we ideally work
    #       with fully binarized output images
    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[fpath] = page

    # Read the configuration first so a failure here cannot leave a
    # scratch directory behind
    language = self.config["language"].get()
    out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
    try:
        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._fix_hocr(fname)
            # Output and input files are matched by their stem
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn("Could not find page for output file {0}"
                            .format(fname))
    finally:
        # Everything of interest has been copied to target_path, so the
        # scratch directory is removed whether or not OCR succeeded
        shutil.rmtree(unicode(out_dir))
def process(self, pages, target_path):
    """ Run OCR on the most recent image of every page.

    The hOCR files written by ``self._perform_ocr`` are passed through
    ``self._fix_hocr``, copied to ``target_path`` and registered in the
    matching page's ``processed_images`` under ``self.__name__``.

    :param pages:       Pages to be processed
    :param target_path: Directory the hOCR files are copied to; it is
                        joined with ``/``, so presumably a
                        :py:class:`pathlib.Path`
    """
    # TODO: This plugin should be 'output' only, since we ideally work
    #       with fully binarized output images
    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[fpath] = page

    # Read the configuration first so a failure here cannot leave a
    # scratch directory behind
    language = self.config["language"].get()
    out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
    try:
        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._fix_hocr(fname)
            # Output and input files are matched by their stem
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path / fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn(
                    "Could not find page for output file {0}".format(fname))
    finally:
        # Everything of interest has been copied to target_path, so the
        # scratch directory is removed whether or not OCR succeeded
        shutil.rmtree(unicode(out_dir))
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    Unless the ``autopilot`` option is set, the ScanTailor GUI is opened on
    the generated project file so it can be adjusted manually. The
    resulting ``*.tif`` files are copied to ``target_path`` and registered
    in the matching page's ``processed_images`` under ``self.__name__``.

    :param pages:       Pages to be processed
    :param target_path: Directory the generated images are copied to; it
                        is joined with ``/``, so presumably a
                        :py:class:`pathlib.Path`
    :raises MissingDependencyException: if the GUI is required but no
                        ``scantailor`` executable can be found
    """
    # Local import: only needed to close the descriptor from mkstemp()
    import os

    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not find_in_path('scantailor'):
        raise MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")

    # Create temporary files/directories. mkstemp() returns an *open*
    # descriptor next to the file name; only the name is used from here
    # on, so the descriptor is closed immediately instead of leaking.
    fd, project_name = tempfile.mkstemp(suffix='.ScanTailor')
    os.close(fd)
    projectfile = Path(project_name)
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))
    try:
        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            # Give the user a moment to read the warning
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            subprocess.call([find_in_path('scantailor'),
                             unicode(projectfile)])

        # Check if the user already generated output files from the GUI
        if sum(1 for x in out_dir.glob('*.tif')) != len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn("Could not find page for output file {0}"
                            .format(fname))
    finally:
        # Remove temporary files/directories, also when a step above
        # failed; the nested finally makes sure both are attempted
        try:
            shutil.rmtree(unicode(out_dir))
        finally:
            projectfile.unlink()
def process(self, pages, target_path):
    """ Run OCR on the most recent image of every page.

    The hOCR files written by ``self._perform_ocr`` are passed through
    ``self._perform_replacements``, copied to ``target_path`` and
    registered in the matching page's ``processed_images`` under
    ``self.__name__``.

    :param pages:       Pages to be processed
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where the hOCR files are to be
                        stored
    :type target_path:  :py:class:`pathlib.Path`
    """
    # TODO: This plugin should be 'output' only, since we ideally work
    #       with fully binarized output images
    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[fpath] = page

    # Read the configuration first so a failure here cannot leave a
    # scratch directory behind
    language = self.config["language"].get()
    out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
    try:
        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._perform_replacements(fname)
            # For each hOCR file, try to find a corresponding input image
            # and associate it to the image's page
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path / fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn(
                    "Could not find page for output file {0}".format(fname))
    finally:
        # Everything of interest has been copied to target_path, so the
        # scratch directory is removed whether or not OCR succeeded
        shutil.rmtree(unicode(out_dir))
def process(self, pages, target_path):
    """ Run OCR on the most recent image of every page.

    The hOCR files written by ``self._perform_ocr`` are passed through
    ``self._perform_replacements``, copied to ``target_path`` and
    registered in the matching page's ``processed_images`` under
    ``self.__name__``.

    :param pages:       Pages to be processed
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where the hOCR files are to be
                        stored
    :type target_path:  :py:class:`pathlib.Path`
    """
    # TODO: This plugin should be 'output' only, since we ideally work
    #       with fully binarized output images
    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[fpath] = page

    # Read the configuration first so a failure here cannot leave a
    # scratch directory behind
    language = self.config["language"].get()
    out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
    try:
        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._perform_replacements(fname)
            # For each hOCR file, try to find a corresponding input image
            # and associate it to the image's page
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn("Could not find page for output file {0}"
                            .format(fname))
    finally:
        # Everything of interest has been copied to target_path, so the
        # scratch directory is removed whether or not OCR succeeded
        shutil.rmtree(unicode(out_dir))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Assemble the most recent image of every page into ``book.pdf``.

    The images (and, for pages that have one, the ``tesseract`` hOCR file)
    are symlinked into a scratch directory in which ``pdfbeads`` is run.
    Progress is reported through ``self.on_progressed`` as the ratio of
    ``*.jbig2`` files found in that directory to the number of images.

    :param pages:             Pages to be included
    :param target_path:       Directory ``book.pdf`` is written to
    :param metadata:          Currently unused (see TODO below)
    :param table_of_contents: Currently unused (see TODO below)
    """
    # Local import: only needed to remove the scratch directory
    import shutil

    logger.info("Assembling PDF.")

    # Resolve everything against the current working directory *before*
    # chdir()-ing away from it, otherwise relative paths would end up
    # pointing into the scratch directory.
    old_path = os.path.abspath(os.path.curdir)
    pdf_file = target_path.absolute() / "book.pdf"
    tmpdir = Path(tempfile.mkdtemp())
    try:
        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir / fpath.name)
            # A relative symlink target is resolved relative to the link's
            # own directory, so always link to the absolute path
            link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                (tmpdir / ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path)

        # TODO: Use metadata to create a METAFILE for pdfbeads
        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        os.chdir(unicode(tmpdir))

        cmd = [find_in_path("pdfbeads"), "-d"]
        cmd.extend([f.name for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        # NOTE(review): the pipe is only drained after the process has
        #               exited; very verbose output could fill it up
        last_count = 0
        while proc.poll() is None:
            current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
            if current_count > last_count:
                last_count = current_count
                self.on_progressed.send(
                    self, progress=float(current_count) / len(images))
            time.sleep(.01)
        logger.debug("Output:\n{0}".format(proc.stdout.read()))
    finally:
        # Leave the scratch directory before deleting it, and do both even
        # if pdfbeads could not be run
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Assemble the most recent image of every page into ``book.pdf``.

    The images (and, for pages that have one, the ``tesseract`` hOCR file)
    are symlinked into a scratch directory in which ``pdfbeads`` is run.
    Progress is reported through ``self.on_progressed`` as the ratio of
    ``*.jbig2`` files found in that directory to the number of images.

    :param pages:             Pages to be included
    :param target_path:       Directory ``book.pdf`` is written to
    :param metadata:          Currently unused (see TODO below)
    :param table_of_contents: Currently unused (see TODO below)
    """
    # Local import: only needed to remove the scratch directory
    import shutil

    logger.info("Assembling PDF.")

    # Resolve everything against the current working directory *before*
    # chdir()-ing away from it, otherwise relative paths would end up
    # pointing into the scratch directory.
    old_path = os.path.abspath(os.path.curdir)
    pdf_file = target_path.absolute()/"book.pdf"
    tmpdir = Path(tempfile.mkdtemp())
    try:
        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir/fpath.name)
            # A relative symlink target is resolved relative to the link's
            # own directory, so always link to the absolute path
            link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path)

        # TODO: Use metadata to create a METAFILE for pdfbeads
        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        os.chdir(unicode(tmpdir))

        cmd = [find_in_path("pdfbeads"), "-d"]
        cmd.extend([f.name for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        # NOTE(review): the pipe is only drained after the process has
        #               exited; very verbose output could fill it up
        last_count = 0
        while proc.poll() is None:
            current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
            if current_count > last_count:
                last_count = current_count
                self.on_progressed.send(
                    self, progress=float(current_count)/len(images))
            time.sleep(.01)
        logger.debug("Output:\n{0}".format(proc.stdout.read()))
    finally:
        # Leave the scratch directory before deleting it, and do both even
        # if pdfbeads could not be run
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    Unless the ``autopilot`` option is set, the ScanTailor GUI is opened on
    the generated project file so it can be adjusted manually. The
    resulting ``*.tif`` files are copied to ``target_path`` and registered
    in the matching page's ``processed_images`` under ``self.__name__``.

    :param pages:       Pages to be processed
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where processed images are to be
                        stored
    :type target_path:  :py:class:`pathlib.Path`
    :raises util.MissingDependencyException: if the GUI is required but no
                        ``scantailor`` executable can be found
    """
    # Local import: only needed to close the descriptor from mkstemp()
    import os

    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not util.find_in_path('scantailor'):
        raise util.MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")

    # Create temporary files/directories. mkstemp() returns an *open*
    # descriptor next to the file name; only the name is used from here
    # on, so the descriptor is closed immediately. An open handle is also
    # a plausible reason for the unlink() below failing on Windows.
    fd, project_name = tempfile.mkstemp(suffix='.ScanTailor')
    os.close(fd)
    projectfile = Path(project_name)
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))
    try:
        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            # Give the user a moment to read the warning
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            # Block until the GUI has been closed; everything below relies
            # on the project file and output directory being final
            util.get_subprocess([GUI_BIN, unicode(projectfile)]).wait()

        # Check if the user already generated output files from the GUI
        if sum(1 for x in out_dir.glob('*.tif')) != len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path / fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn(
                    "Could not find page for output file {0}".format(fname))
    finally:
        # Remove temporary files/directories, also when a step above
        # failed; the nested finally makes sure both are attempted
        try:
            shutil.rmtree(unicode(out_dir))
        finally:
            # Failing to delete the project file is not worth aborting
            # for, but it should leave a trace. WindowsError only exists
            # on Windows; OSError is its base class and is available on
            # every platform.
            try:
                projectfile.unlink()
            except OSError as e:
                logger.warn("Could not remove temporary project file "
                            "{0}: {1}".format(projectfile, e))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Assemble the most recent image of every page into ``book.pdf``.

    A ``metadata.txt`` holding the ``title`` and ``creator`` entries of
    ``metadata`` is written to a scratch directory, the page images (and,
    for pages that have one, the ``tesseract`` hOCR file) are linked or,
    when ``IS_WIN`` is set, copied next to it, and the pdfbeads binary is
    run from inside that directory. The scratch directory is removed and
    the previous working directory restored afterwards, whether or not the
    run succeeded.

    :param pages:             Pages to be included
    :param target_path:       Directory ``book.pdf`` is written to
    :param metadata:          Mapping; only the ``title`` and ``creator``
                              keys are read, ``creator`` being iterated
                              as a collection of authors
    :param table_of_contents: Currently unused (see TODO below)
    """
    logger.info("Assembling PDF.")

    tmpdir = Path(tempfile.mkdtemp())
    # Remember where we came from so the finally block can always go back
    old_path = os.path.abspath(os.path.curdir)
    try:
        meta_file = tmpdir/'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir/fpath.name)
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir/ocr_path.name))
                else:
                    (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        # Must be absolute: pdfbeads runs with tmpdir as working directory
        pdf_file = target_path.absolute()/"book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        os.chdir(unicode(tmpdir))

        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            # NOTE(review): neither pipe is drained while polling, so the
            #               deadlock described above is possible here too
            #               if pdfbeads becomes very verbose
            last_count = 0
            while proc.poll() is None:
                current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                if current_count > last_count:
                    last_count = current_count
                    self.on_progressed.send(
                        self, progress=float(current_count)/len(images))
                time.sleep(.01)
            output = proc.stdout.read()
            errors = proc.stderr.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # Leave the scratch directory *before* deleting it, and do both
        # even when something above raised
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    Unless the ``autopilot`` option is set, the ScanTailor GUI is opened on
    the generated project file so it can be adjusted manually. The
    resulting ``*.tif`` files are copied to ``target_path`` and registered
    in the matching page's ``processed_images`` under ``self.__name__``.

    :param pages:       Pages to be processed
    :type pages:        list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where processed images are to be
                        stored
    :type target_path:  :py:class:`pathlib.Path`
    :raises util.MissingDependencyException: if the GUI is required but no
                        ``scantailor`` executable can be found
    """
    # Local import: only needed to close the descriptor from mkstemp()
    import os

    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not util.find_in_path('scantailor'):
        raise util.MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")

    # Create temporary files/directories. mkstemp() returns an *open*
    # descriptor next to the file name; only the name is used from here
    # on, so the descriptor is closed immediately. An open handle is also
    # a plausible reason for the unlink() below failing on Windows.
    fd, project_name = tempfile.mkstemp(suffix='.ScanTailor')
    os.close(fd)
    projectfile = Path(project_name)
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))
    try:
        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            # Give the user a moment to read the warning
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            # Block until the GUI has been closed; everything below relies
            # on the project file and output directory being final
            util.get_subprocess([GUI_BIN, unicode(projectfile)]).wait()

        # Check if the user already generated output files from the GUI
        if sum(1 for x in out_dir.glob('*.tif')) != len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn("Could not find page for output file {0}"
                            .format(fname))
    finally:
        # Remove temporary files/directories, also when a step above
        # failed; the nested finally makes sure both are attempted
        try:
            shutil.rmtree(unicode(out_dir))
        finally:
            # Failing to delete the project file is not worth aborting
            # for, but it should leave a trace. WindowsError only exists
            # on Windows; OSError is its base class and is available on
            # every platform.
            try:
                projectfile.unlink()
            except OSError as e:
                logger.warn("Could not remove temporary project file "
                            "{0}: {1}".format(projectfile, e))