Exemple #1
0
    def create(cls, location, metadata=None, config=None):
        """ Create a new Workflow.

        :param location:    Base directory that the workflow should be created
                            in
        :type location:     unicode or :py:class:`pathlib.Path`
        :param metadata:    Initial metadata for workflow. Must at least
                            contain a `title` item.
        :type metadata:     dict
        :param config:      Initial configuration for workflow
        :type config:       dict or :py:class:`spreads.config.Configuration`
        :return:            The new instance
        :rtype:             :py:class:`Workflow`
        """
        base_dir = location if isinstance(location, Path) else Path(location)
        # A title is mandatory; everything else in `metadata` is optional
        if metadata is None or 'title' not in metadata:
            raise ValidationError(
                metadata={'title': 'Please specify at least a title'})
        target = Path(base_dir/util.slugify(metadata['title']))
        # Refuse to clobber an already existing workflow directory
        if target.exists():
            raise ValidationError(
                name="A workflow with that title already exists")
        return cls(path=target, config=config, metadata=metadata)
Exemple #2
0
    def process(self, pages, target_path):
        """ Run OCR on the pages and copy the resulting hOCR files to
            `target_path`.

        :param pages:       Pages to recognize
        :param target_path: Directory that the recognized hOCR files are
                            copied into
        """
        # TODO: This plugin should be 'output' only, since we ideally work
        #       with fully binarized output images
        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                # Nothing processed yet, fall back to the raw capture
                fpath = page.raw_image
            in_paths[fpath] = page

        # OCR output goes into a throw-away temporary directory first
        out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
        language = self.config["language"].get()

        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        # Both '.hocr' and '.html' extensions are searched -- presumably the
        # extension varies with the Tesseract version; confirm if relevant
        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._fix_hocr(fname)
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path / fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                # for/else: no input page matched this output file
                logger.warn(
                    "Could not find page for output file {0}".format(fname))
Exemple #3
0
    def process(self, pages, target_path):
        """ Run OCR on the pages and copy the resulting hOCR files to
            `target_path`.

        :param pages:       Pages to recognize
        :param target_path: Directory that the recognized hOCR files are
                            copied into
        """
        # TODO: This plugin should be 'output' only, since we ideally work
        #       with fully binarized output images
        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                # Nothing processed yet, fall back to the raw capture
                fpath = page.raw_image
            in_paths[fpath] = page

        # OCR output goes into a throw-away temporary directory first
        out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
        language = self.config["language"].get()

        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        # Both '.hocr' and '.html' extensions are searched -- presumably the
        # extension varies with the Tesseract version; confirm if relevant
        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._fix_hocr(fname)
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                # for/else: no input page matched this output file
                logger.warn("Could not find page for output file {0}"
                            .format(fname))
Exemple #4
0
    def find_all(cls, location, key='slug', reload=False):
        """ List all workflows in the given location.

        :param location:    Location where the workflows are located
        :type location:     unicode or :py:class:`pathlib.Path`
        :param key:         Attribute to use as key for returned dict
        :type key:          str/unicode
        :param reload:      Do not load workflows from cache
        :type reload:       bool
        :return:            All found workflows
        :rtype:             dict
        """
        if not isinstance(location, Path):
            location = Path(location)
        if key not in ('slug', 'id'):
            raise ValueError("'key' must be one of ('id', 'slug')")
        if location in cls._cache and not reload:
            found = cls._cache[location]
        else:
            found = []
        for candidate in location.iterdir():
            # BUGFIX: test the *candidate*, not the parent `location`, and
            # actually call `exists()` -- the bare bound method was always
            # truthy, so every directory entry used to count as a workflow.
            is_workflow = (candidate.is_dir() and
                           ((candidate/'bagit.txt').exists() or
                            (candidate/'raw').exists()))
            if not is_workflow:
                continue
            if not next((wf for wf in found if wf.path == candidate), None):
                logging.debug(
                    "Cache missed, instantiating workflow from {0}."
                    .format(candidate))
                workflow = cls(candidate)
                found.append(workflow)
        cls._cache[location] = found
        return {getattr(wf, key): wf for wf in cls._cache[location]}
Exemple #5
0
    def find_all(cls, location, key='slug', reload=False):
        """ List all workflows in the given location.

        :param location:    Location where the workflows are located
        :type location:     unicode/pathlib.Path
        :param key:         Attribute to use as key for returned dict
        :type key:          str
        :param reload:      Do not load workflows from cache
        :type reload:       bool
        :return:            All found workflows
        :rtype:             dict
        """
        if not isinstance(location, Path):
            location = Path(location)
        if key not in ('slug', 'id'):
            raise ValueError("'key' must be one of ('id', 'slug')")
        if location in cls._cache and not reload:
            found = cls._cache[location]
        else:
            found = []
        for candidate in location.iterdir():
            # BUGFIX: test the *candidate*, not the parent `location`, and
            # actually call `exists()` -- the bare bound method was always
            # truthy, so every directory entry used to count as a workflow.
            is_workflow = (candidate.is_dir() and
                           ((candidate/'bagit.txt').exists()
                            or (candidate/'raw').exists()))
            if not is_workflow:
                continue
            if not next((wf for wf in found if wf.path == candidate), None):
                logging.debug(
                    "Cache missed, instantiating workflow from {0}."
                    .format(candidate))
                workflow = cls(candidate)
                found.append(workflow)
        cls._cache[location] = found
        return {getattr(wf, key): wf for wf in cls._cache[location]}
Exemple #6
0
def get_data_dir(create=False):
    """ Return the platform-specific application data directory for spreads.

    :param create:  Create the directory if it does not exist yet
    :type create:   bool
    :return:        Path to the data directory
    :rtype:         unicode
    """
    UNIX_DIR_VAR = 'XDG_DATA_DIRS'
    UNIX_DIR_FALLBACK = '~/.config'
    WINDOWS_DIR_VAR = 'APPDATA'
    WINDOWS_DIR_FALLBACK = '~\\AppData\\Roaming'
    MAC_DIR = '~/Library/Application Support'
    base_dir = None
    if platform.system() == 'Darwin':
        # BUGFIX: `exists` was referenced without calling it, so the bound
        # method object was always truthy and MAC_DIR was never selected.
        # Also expand '~' before the check, otherwise the literal path can
        # never exist.
        if Path(UNIX_DIR_FALLBACK).expanduser().exists():
            base_dir = UNIX_DIR_FALLBACK
        else:
            base_dir = MAC_DIR
    elif platform.system() == 'Windows':
        if WINDOWS_DIR_VAR in os.environ:
            base_dir = os.environ[WINDOWS_DIR_VAR]
        else:
            base_dir = WINDOWS_DIR_FALLBACK
    else:
        if UNIX_DIR_VAR in os.environ:
            base_dir = os.environ[UNIX_DIR_VAR]
        else:
            base_dir = UNIX_DIR_FALLBACK
    app_path = Path(base_dir)/'spreads'
    if create and not app_path.exists():
        app_path.mkdir()
    # `unicode` return type kept for Python 2 API compatibility
    return unicode(app_path)
Exemple #7
0
def test_fix_hocr(plugin, tmpdir):
    # Work on a copy of the hOCR fixture inside the test directory
    hocr_path = unicode(tmpdir.join('test.html'))
    shutil.copyfile('./tests/data/000.hocr', hocr_path)
    fpath = Path(hocr_path)
    plugin._fix_hocr(fpath)
    with fpath.open('r') as fp:
        content = fp.read()
    # No consecutive whitespace-only spans may survive the fix
    matches = re.findall(
        r'(<span[^>]*>(<strong>)? +(</strong>)?</span> *){2}', content)
    assert len(matches) == 0
def test_split_configuration(plugin, tmpdir):
    # Pretend we have four CPU cores so the project is split four ways
    with mock.patch('spreadsplug.scantailor.multiprocessing.cpu_count') as cnt:
        cnt.return_value = 4
        splitfiles = plugin._split_configuration(
            Path('./tests/data/test.scanTailor'), Path(unicode(tmpdir)))
    assert len(splitfiles) == 4
    # Each relevant section of the first split file must hold 7 entries
    tree = ET.parse(unicode(splitfiles[0]))
    for section in ('files', 'images', 'pages', 'file-name-disambiguation'):
        assert len(tree.find('./{0}'.format(section))) == 7
Exemple #9
0
def test_generate_configuration(popen, proc, plugin):
    proc.return_value.is_running.return_value = False
    input_files = ['{0:03}.jpg'.format(idx) for idx in xrange(5)]
    project_file = Path('/tmp/foo.st')
    output_dir = Path('/tmp/out')
    plugin._generate_configuration(input_files, project_file, output_dir)
    # Every input file must appear on the generated command line
    cli_args = popen.call_args[0][0]
    for input_file in input_files:
        assert input_file in cli_args
Exemple #10
0
def test_generate_configuration(popen, proc, plugin):
    proc.return_value.is_running.return_value = False
    # TODO: Setup up some config variables
    # Wrap a real Path in a mock so iterdir() can be stubbed out
    raw_dir = mock.MagicMock(wraps=Path('/tmp/raw'))
    raw_dir.iterdir.return_value = [raw_dir/"foo.jpg", raw_dir/"bar.jpg"]
    plugin._generate_configuration(
        Path('/tmp/foo.st'), raw_dir, Path('/tmp/out'))
def test_perform_replacements(plugin, tmpdir):
    # Work on a copy of the hOCR fixture inside the test directory
    hocr_path = unicode(tmpdir.join('test.html'))
    shutil.copyfile('./tests/data/000.hocr', hocr_path)
    fpath = Path(hocr_path)
    plugin._perform_replacements(fpath)
    with fpath.open('r') as fp:
        content = fp.read()
    # Consecutive whitespace-only spans must all have been collapsed
    matches = re.findall(
        r'(<span[^>]*>(<strong>)? +(</strong>)?</span> *){2}', content)
    assert len(matches) == 0
Exemple #12
0
 def create(cls, location, metadata=None, config=None):
     """ Create a new Workflow.

     :param location:    Base directory the workflow is created in
     :type location:     unicode or :py:class:`pathlib.Path`
     :param metadata:    Initial metadata; must contain a `title` item
     :type metadata:     dict
     :param config:      Initial configuration for the workflow
     :return:            The new instance
     """
     if not isinstance(location, Path):
         location = Path(location)
     # Idiom fix: `'title' not in metadata` instead of `not 'title' in`
     if metadata is None or 'title' not in metadata:
         raise ValidationError(
             metadata={'title': 'Please specify at least a title'})
     path = Path(location/util.slugify(metadata['title']))
     # Refuse to clobber an already existing workflow directory
     if path.exists():
         raise ValidationError(
             name="A workflow with that title already exists")
     wf = cls(path=path, config=config, metadata=metadata)
     return wf
Exemple #13
0
def build_msi(bitness=32):
    """ Build a Windows installer for spreads via pynsist's InstallerBuilder.

    :param bitness: Target architecture, either 32 or 64 (bit)
    :type bitness:  int
    """
    # Stale egg metadata can confuse packaging, so remove it first
    egg_path = Path('spreads.egg-info')
    if egg_path.exists():
        shutil.rmtree(unicode(egg_path))
    build_path = Path('build')
    if not build_path.exists():
        build_path.mkdir()
    # pynsist picks up bundled binary dependencies from this directory
    pkg_dir = build_path / 'pynsist_pkgs'
    if pkg_dir.exists():
        shutil.rmtree(unicode(pkg_dir))
    pkg_dir.mkdir()
    for pkg in BINARY_PACKAGES.itervalues():
        arch = 'win32' if bitness == 32 else 'win-amd64'
        extract_native_pkg(pkg.format(arch=arch), pkg_dir)

    # Pure-Python dependencies only need their package metadata copied
    for pkg in (x.project_name for x in SOURCE_PACKAGES
                if x.project_name is not None):
        copy_info(pkg, pkg_dir)

    icon = os.path.abspath("spreads.ico")
    # Some extra files come in arch-specific variants ('.amd64' suffix)
    extra_files = [(unicode(
        (Path('win_deps') / 'extra' /
         x.format(arch='.amd64' if bitness == 64 else '')).absolute()), None)
                   for x in EXTRA_FILES]
    nsi_template = os.path.abspath("template.nsi")

    # NOTE: We need to remove the working directory from sys.path to force
    # pynsist to copy all of our modules, including 'spreads' and 'spreadsplug'
    # from the site-packages. Additionally, we need to change into the
    # build directory.
    if os.getcwd() in sys.path:
        sys.path.remove(os.getcwd())
    os.chdir(unicode(build_path))
    builder = InstallerBuilder(
        appname="spreads",
        version=spreads.__version__,
        packages=[x.module_name for x in SOURCE_PACKAGES],
        extra_files=extra_files,
        py_version="2.7.6",
        py_bitness=bitness,
        build_dir='msi{0}'.format(bitness),
        installer_name=None,
        nsi_template=nsi_template,
        icon=icon,
        shortcuts={
            'Configure spreads': {
                'entry_point': 'spreads.main:run_config_windows',
                'icon': icon,
                'console': False
            },
            'Spreads Web Service': {
                'entry_point': 'spreads.main:run_service_windows',
                'icon': icon,
                'console': False
            }
        })
    builder.run()
    # Restore the original working directory
    os.chdir('..')
Exemple #14
0
    def process(self, pages, target_path):
        """ Run the pages through ScanTailor and copy the generated TIFF
            files to `target_path`.

        :param pages:       Pages to process
        :param target_path: Directory that the processed files are copied
                            into
        """
        autopilot = self.config['autopilot'].get(bool)
        # The GUI binary is only required for interactive (non-autopilot)
        # runs
        if not autopilot and not find_in_path('scantailor'):
            raise MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories
        projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                # Nothing processed yet, fall back to the raw capture
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            # Blocks until the user closes the GUI
            subprocess.call([find_in_path('scantailor'), unicode(projectfile)])
        # Check if the user already generated output files from the GUI
        if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                # for/else: no input page matched this output file
                logger.warn("Could not find page for output file {0}"
                            .format(fname))

        # Remove temporary files/directories
        shutil.rmtree(unicode(out_dir))
        projectfile.unlink()
Exemple #15
0
def test_output(plugin, tmpdir):
    # Twenty dummy pages, all pointing at the same hOCR fixture
    dummy_pages = [
        Page(Path('000.jpg'), idx,
             processed_images={'tesseract': Path('./tests/data/000.hocr')})
        for idx in xrange(20)]
    plugin.output(dummy_pages, tmpdir, None, None)
    assert tmpdir.join('text.html').exists()
    tree = ET.parse(unicode(tmpdir.join('text.html')))
    # Each of the 20 pages contributes 201 words, 26 lines and 4 paragraphs
    assert len(tree.findall('.//span[@class="ocrx_word"]')) == 20 * 201
    assert len(tree.findall('.//span[@class="ocr_line"]')) == 20 * 26
    assert len(tree.findall('.//p[@class="ocr_par"]')) == 20 * 4
    assert len(tree.findall('.//div[@class="ocr_page"]')) == 20
Exemple #16
0
def extract_native_pkg(fname, pkg_dir):
    """ Unpack a binary dependency archive from win_deps/python and copy
        its contents into `pkg_dir`.
    """
    archive = zipfile.ZipFile(unicode(Path('win_deps') / 'python' / fname))
    scratch = Path(tempfile.mkdtemp())
    archive.extractall(unicode(scratch))
    # Collect everything below PLATLIB and PURELIB, whichever are present
    contents = []
    for subdir in ('PLATLIB', 'PURELIB'):
        if (scratch / subdir).exists():
            contents.extend((scratch / subdir).iterdir())
    for entry in contents:
        if entry.is_dir():
            shutil.copytree(unicode(entry), unicode(pkg_dir / entry.name))
        else:
            shutil.copy2(unicode(entry), unicode(pkg_dir / entry.name))
    shutil.rmtree(unicode(scratch))
Exemple #17
0
    def yield_devices(cls, config):
        """ Search for usable devices, yield one at a time

        :param config:  spreads configuration
        :type config:   spreads.confit.ConfigView
        :return:        Driver instances, one per matching USB device
        """
        # Certain camera models need a quirk-specific driver class; they are
        # looked up by their USB (idVendor, idProduct) pair.
        SPECIAL_CASES = {  # noqa
            # (idVendor, idProduct): SpecialClass
            (0x4a9, 0x31ef): QualityFix,  # not r47, but has the same bug
            (0x4a9, 0x3218): QualityFix,
            (0x4a9, 0x3223): A3300,
            (0x4a9, 0x3224): QualityFix,
            (0x4a9, 0x3225): QualityFix,
            (0x4a9, 0x3226): QualityFix,
            (0x4a9, 0x3227): QualityFix,
            (0x4a9, 0x3228): QualityFix,
            (0x4a9, 0x3229): QualityFix,
            (0x4a9, 0x322a): QualityFix,
            (0x4a9, 0x322b): QualityFix,
            (0x4a9, 0x322c): QualityFix,
        }

        # Check if we can find the chdkptp executable
        chdkptp_path = Path(config["chdkptp_path"].get(unicode))
        if not chdkptp_path.exists() or not (chdkptp_path /
                                             'chdkptp').exists():
            raise MissingDependencyException(
                "Could not find executable `chdkptp`. Please make sure that "
                "the `chdkptp_path` setting in your `chdkcamera` "
                "configuration points to "
                "a directory containing chdkptp "
                "and its libraries. Current setting is `{0}`".format(
                    chdkptp_path))

        # only match ptp devices in find_all
        def is_ptp(dev):
            # Matches interfaces with USB class 6 / subclass 1 (still image
            # capture); returns None (falsy) when no interface matches
            for cfg in dev:
                if usb.util.find_descriptor(cfg,
                                            bInterfaceClass=6,
                                            bInterfaceSubClass=1):
                    return True

        for dev in usb.core.find(find_all=True, custom_match=is_ptp):
            ids = (dev.idVendor, dev.idProduct)
            if ids in SPECIAL_CASES:
                # Known-quirky device: use its specialized driver
                yield SPECIAL_CASES[ids](config, dev)
            else:
                yield cls(config, dev)
Exemple #18
0
 def find_by_slug(cls, location, slug):
     """ Look up a single workflow by its slug.

     :param location:    Directory that holds the workflows
     :param slug:        Slug of the desired workflow
     :return:            The matching workflow, or None if not found
     """
     if not isinstance(location, Path):
         location = Path(location)
     workflows = cls.find_all(location, key='slug')
     # A missing slug yields None rather than raising
     return workflows.get(slug)
Exemple #19
0
 def find_by_id(cls, location, id):
     """ Look up a single workflow by its id.

     :param location:    Directory that holds the workflows
     :param id:          Id of the desired workflow
     :return:            The matching workflow, or None if not found
     """
     if not isinstance(location, Path):
         location = Path(location)
     workflows = cls.find_all(location, key='id')
     # A missing id yields None rather than raising
     return workflows.get(id)
Exemple #20
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Go through pages and bundle their most recent images into a DJVU
            file.

        :param pages:               Pages to bundle
        :type pages:                list of :py:class:`spreads.workflow.Page`
        :param target_path:         Directory the DJVU file is written to
        :type target_path:          :py:class:`pathlib.Path`
        :param metadata:            Metadata to include in DJVU file
        :type metadata:             :py:class:`spreads.metadata.Metadata`
        :param table_of_contents:   Table of contents to include in DJVU file
        :type table_of_contents:    list of :py:class:`TocEntry`
        """
        logger.info("Assembling DJVU.")

        # djvubind operates on a directory, so collect the most recent image
        # of every page there via symlinks (no copying required)
        tmpdir = Path(tempfile.mkdtemp())
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir / fpath.name)
            link_path.symlink_to(fpath)

        # TODO: Add metadata
        # TODO: Add table of contents

        djvu_file = target_path / "book.djvu"
        cmd = ["djvubind", unicode(tmpdir), '--no-ocr']
        logger.debug("Running " + " ".join(cmd))
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        # NOTE(review): assumes djvubind writes 'book.djvu' into the current
        # working directory -- confirm before relying on this
        os.rename("book.djvu", unicode(djvu_file))
        shutil.rmtree(unicode(tmpdir))
Exemple #21
0
    def cfg_path(self):
        """ Path to YAML file of the user-specific configuration.

        :returns:   Path to the configuration file
        :rtype:     :py:class:`pathlib.Path`
        """
        config_dir = Path(self._config.config_dir())
        return config_dir / confit.CONFIG_FILENAME
Exemple #22
0
    def __init__(self, path, config=None, step=None, step_done=None, id=None):
        """ Initialize a workflow rooted at `path`.

        :param path:        Directory of the workflow; created if missing
        :type path:         unicode or :py:class:`pathlib.Path`
        :param config:      Configuration for the workflow; may be a
                            confit.ConfigView, a Configuration or a raw
                            value understood by `_load_config`
        :param step:        Current processing step (stored as-is)
        :param step_done:   Whether the current step is finished (stored
                            as-is)
        :param id:          Database id of the workflow (shadows the `id`
                            builtin; kept for API compatibility)
        """
        self._logger = logging.getLogger('Workflow')
        self._logger.debug("Initializing workflow {0}".format(path))
        self.step = step
        self.step_done = step_done
        if not isinstance(path, Path):
            path = Path(path)
        self.path = path
        if not self.path.exists():
            self.path.mkdir()
        self.id = id
        # `self.images` presumably enumerates captures in the workflow
        # directory -- the shot counter is derived from it; confirm against
        # the `images` property
        if self.images:
            self.pages_shot = len(self.images)
        else:
            self.pages_shot = 0
        # See if supplied `config` is already a valid ConfigView object
        if isinstance(config, confit.ConfigView):
            self.config = config
        elif isinstance(config, Configuration):
            self.config = config.as_view()
        else:
            # Fall back to building a configuration from the raw value
            self.config = self._load_config(config)
        self._capture_lock = threading.RLock()
        self.active = False
        self._devices = None
        self._pluginmanager = None

        # Instantiate plugins
        self.plugins = [
            cls(self.config) for cls in plugin.get_plugins(
                *self.config["plugins"].get()).values()
        ]
Exemple #23
0
def get_workflow(workflow_id):
    """ Fetch a workflow by id, preferring the in-memory cache over the
        database.

    :param workflow_id: Id of the workflow to load
    :return:            The Workflow instance, or None if no workflow with
                        that id exists in the database
    """
    # See if the workflow is among our cached instances
    if workflow_id in WorkflowCache:
        return WorkflowCache[workflow_id]
    logger.debug("Loading workflow {0} from database".format(workflow_id))
    with open_connection() as con:
        db_data = con.execute("SELECT * FROM workflow WHERE workflow.id=?",
                              (workflow_id, )).fetchone()
    if db_data is None:
        logger.warn("Workflow {0} was not found.".format(workflow_id))
        return None

    # Row order must match the DbWorkflow namedtuple fields
    db_workflow = DbWorkflow(*db_data)

    # Try to load configuration from database
    if db_workflow.config is not None:
        config = json.loads(db_workflow.config)
    else:
        config = None
    workflow = Workflow(path=Path(app.config['base_path']) / db_workflow.name,
                        config=config,
                        step=db_workflow.step,
                        step_done=bool(db_workflow.step_done),
                        id=workflow_id)
    # Cache the freshly constructed instance for subsequent lookups
    WorkflowCache[workflow_id] = workflow
    return workflow
Exemple #24
0
def create_workflow():
    """ Create a new workflow.

    Payload should be a JSON object. The only required attribute is 'name' for
    the desired workflow name. Optionally, 'config' can be set to a
    configuration object in the form "plugin_name: { setting: value, ...}".

    Returns the newly created workflow as a JSON object.
    """
    data = json.loads(request.data)
    path = Path(app.config['base_path']) / unicode(data['name'])

    # Setup default configuration
    config = app.config['default_config']
    # Overlay user-supplied values, if existant
    user_config = data.get('config', None)
    if user_config is not None:
        config = config.with_overlay(user_config)
    workflow = Workflow(config=config,
                        path=path,
                        step=data.get('step', None),
                        step_done=data.get('step_done', None))
    try:
        # Persisting assigns the database id; validation problems are
        # reported back to the client as a 400 response
        workflow.id = persistence.save_workflow(workflow)
    except persistence.ValidationError as e:
        return make_response(json.dumps(dict(errors=e.errors)), 400,
                             {'Content-Type': 'application/json'})
    return make_response(json.dumps(workflow), 200,
                         {'Content-Type': 'application/json'})
Exemple #25
0
    def _generate_output(self, projectfile, out_dir, num_pages):
        """ Run last step for the project file and keep track of the progress
            by emitting :py:attr:`on_progressed` signals.

        :param projectfile:     Path ScanTailor configuration file
        :type projectfile:      :py:class:`pathlib.Path`
        :param out_dir:         Output directory for processed files
        :type out_dir:          :py:class:`pathlib.Path`
        :param num_pages:       Total number of pages to process
        :type num_pages:        int
        """
        logger.debug("Generating output...")
        temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
        # Split the project into several configurations so multiple CLI
        # workers can run in parallel
        split_config = self._split_configuration(projectfile, temp_dir)
        logger.debug("Launching those subprocesses!")
        # '--start-filter=6' presumably selects ScanTailor's final (output)
        # stage -- confirm against the scantailor-cli documentation
        processes = [
            util.get_subprocess([
                CLI_BIN, '--start-filter=6',
                unicode(cfgfile),
                unicode(out_dir)
            ]) for cfgfile in split_config
        ]

        last_count = 0
        # Poll until all workers exit; progress is inferred from the number
        # of TIFF files that have appeared in the output directory so far
        while processes:
            recent_count = sum(1 for x in out_dir.glob('*.tif'))
            if recent_count > last_count:
                # This step covers the second half of total progress
                # (0.5 .. 1.0)
                progress = 0.5 + (float(recent_count) / num_pages) / 2
                self.on_progressed.send(self, progress=progress)
                last_count = recent_count
            for p in processes[:]:  # iterate a copy so we can remove safely
                if p.poll() is not None:
                    processes.remove(p)
            time.sleep(.01)
        shutil.rmtree(unicode(temp_dir))
Exemple #26
0
def test_generate_configuration_noenhanced(popen, proc, config, pluginclass):
    proc.return_value.is_running.return_value = False
    # TODO: Setup up some config variables
    # Simulate a scantailor-cli whose usage text lacks the enhanced options
    usage_text = "".join(chain(
        repeat("\n", 7),
        ("scantailor-cli [options] <image, image, ...>"
         " <output_directory>",))
    )
    with mock.patch('subprocess.check_output') as mock_co:
        mock_co.return_value = usage_text
        plugin = pluginclass(config)
    raw_dir = mock.MagicMock(wraps=Path('/tmp/raw'))
    images = [raw_dir/"foo.jpg", raw_dir/"bar.jpg"]
    raw_dir.iterdir.return_value = images
    plugin._generate_configuration(Path('/tmp/foo.st'), raw_dir,
                                   Path('/tmp/out'))
    # The first image must appear on the generated command line
    assert (unicode(images[0]) in popen.call_args[0][0])
Exemple #27
0
def test_capture_noprepare(jpeg, camera):
    # First shot fails because the camera is not in record mode; the retry
    # (after prepare_capture) succeeds.
    camera._run.side_effect = (
        chdkcamera.CHDKPTPException('dev not in rec mode'), None)
    with mock.patch.object(camera, 'prepare_capture') as prepare:
        camera.capture(Path('/tmp/000.jpg'))
    assert prepare.call_count == 1
    assert camera._run.call_count == 2
Exemple #28
0
    def yield_devices(cls, config):
        """ Search for usable devices, yield one at a time

        :param config:  spreads configuration
        :type config:   spreads.confit.ConfigView
        :return:        Driver instances, one per matching USB device
        """
        # Certain camera models need a quirk-specific driver class; they are
        # looked up by their USB (idVendor, idProduct) pair.
        SPECIAL_CASES = {
            # (idVendor, idProduct): SpecialClass
            (0x4a9, 0x31ef): QualityFix,  # not r47, but has the same bug
            (0x4a9, 0x3218): QualityFix,
            (0x4a9, 0x3223): QualityFix,
            (0x4a9, 0x3224): QualityFix,
            (0x4a9, 0x3225): QualityFix,
            (0x4a9, 0x3226): QualityFix,
            (0x4a9, 0x3227): QualityFix,
            (0x4a9, 0x3228): QualityFix,
            (0x4a9, 0x3229): QualityFix,
            (0x4a9, 0x322a): A2200,
            (0x4a9, 0x322b): QualityFix,
            (0x4a9, 0x322c): QualityFix,
        }

        # Check if we can find the chdkptp executable
        chdkptp_path = Path(config["chdkptp_path"].get(unicode))
        if not chdkptp_path.exists() or not (chdkptp_path/'chdkptp').exists():
            raise MissingDependencyException(
                "Could not find executable `chdkptp`. Please make sure that "
                "the `chdkptp_path` setting in your `chdkcamera` "
                "configuration points to " "a directory containing chdkptp "
                "and its libraries. Current setting is `{0}`"
                .format(chdkptp_path)
            )

        # only match ptp devices in find_all
        def is_ptp(dev):
            # Matches interfaces with USB class 6 / subclass 1 (still image
            # capture); returns None (falsy) when no interface matches
            for cfg in dev:
                if usb.util.find_descriptor(cfg, bInterfaceClass=6,
                                            bInterfaceSubClass=1):
                    return True

        for dev in usb.core.find(find_all=True, custom_match=is_ptp):
            ids = (dev.idVendor, dev.idProduct)
            if ids in SPECIAL_CASES:
                # Known-quirky device: use its specialized driver
                yield SPECIAL_CASES[ids](config, dev)
            else:
                yield cls(config, dev)
Exemple #29
0
 def last_modified(self):
     """ Return the time of the workflow's last structural modification.

     We use the most recent of the modified timestamps of the two checksum
     files of the BagIt directory, since any relevant changes to the
     workflow's structure will cause a change in at least one file hash.
     """
     manifests = ('manifest-md5.txt', 'tagmanifest-md5.txt')
     latest = max(Path(self.path/name).stat().st_mtime for name in manifests)
     return datetime.fromtimestamp(latest)
Exemple #30
0
def test_capture_raw(jpeg, camera):
    jpeg.return_value = mock.Mock()
    # With shoot_raw enabled the camera must be asked for a DNG file
    camera.config['shoot_raw'] = True
    camera.capture(Path('/tmp/000.dng'))
    run_cmd = camera._run.call_args_list[0][0][0]
    assert camera._run.call_count == 1
    assert "-dng " in run_cmd
    assert run_cmd.endswith('"/tmp/000"')
    assert jpeg.called_once_with('/tmp/000.dng')
Exemple #31
0
def transfer_to_stick(wf_id, base_path):
    """ Copy a workflow's files onto a removable USB stick, emitting
        transfer progress signals along the way.

    :param wf_id:       Id of the workflow to transfer
    :param base_path:   Base directory that contains the workflows
    """
    workflow = Workflow.find_by_id(base_path, wf_id)
    stick = find_stick()
    files = list(workflow.path.rglob('*'))
    num_files = len(files)
    # Filter out problematic characters
    clean_name = (workflow.path.name.replace(':', '_')
                                    .replace('/', '_'))
    workflow.status['step'] = 'transfer'
    try:
        if IS_WIN:
            # On Windows the stick is already mounted; `stick` is its path
            target_path = Path(stick)/clean_name
        else:
            # Elsewhere `stick` is a UDisks DBus device we mount ourselves
            mount = stick.get_dbus_method(
                "FilesystemMount",
                dbus_interface="org.freedesktop.UDisks.Device")
            mount_point = mount('', [])
            target_path = Path(mount_point)/clean_name
        if target_path.exists():
            # Replace any previous transfer of the same workflow
            shutil.rmtree(unicode(target_path))
        target_path.mkdir()
        signals['transfer:started'].send(workflow)
        for num, path in enumerate(files, 1):
            # NOTE(review): on Python 2 without `from __future__ import
            # division`, (num/num_files) is integer division, so progress
            # stays 0 until the last file -- confirm the module enables
            # true division
            signals['transfer:progressed'].send(
                workflow, progress=(num/num_files)*0.79, status=path.name)
            workflow.status['step_done'] = (num/num_files)*0.79
            target = target_path/path.relative_to(workflow.path)
            if path.is_dir():
                target.mkdir()
            else:
                shutil.copyfile(unicode(path), unicode(target))
    finally:
        # Unmount only if we mounted the stick ourselves above
        if 'mount_point' in locals():
            signals['transfer:progressed'].send(workflow, progress=0.8,
                                                status="Syncing...")
            workflow.status['step_done'] = 0.8
            unmount = stick.get_dbus_method(
                "FilesystemUnmount",
                dbus_interface="org.freedesktop.UDisks.Device")
            unmount([], timeout=1e6)  # dbus-python doesn't know an infinite
                                      # timeout... unmounting sometimes takes a
                                      # long time, since the device has to be
                                      # synced.
        signals['transfer:completed'].send(workflow)
        workflow.status['step'] = None
Exemple #32
0
def test_process():
    """Each page's raw image should be submitted to the process pool."""
    # A plain dict suffices here; the plugin has no configuration of its
    # own, so there is no need for confit.Configuration
    config = {'autorotate': None}
    pages = [Page(Path('{0:03}.jpg'.format(num))) for num in xrange(4)]
    target_path = Path('/tmp/dummy')

    with mock.patch('spreadsplug.autorotate.ProcessPoolExecutor') as pool_cls:
        plugin = autorotate.AutoRotatePlugin(config)
        pool = pool_cls.return_value.__enter__.return_value
        plugin.process(pages, target_path)
        # The text file should not have been passed
        assert pool.submit.call_count == 4
        # submit() is invoked as submit(func, image_path); we only compare
        # the image path arguments (second positional parameter)
        submitted = sorted(call[0][1] for call in pool.submit.call_args_list)
        expected = sorted(unicode(p.raw_image) for p in pages)
        assert expected == submitted
Exemple #33
0
def test_capture(jpeg, camera):
    """Capturing should run `remoteshoot` once and fix up EXIF rotation."""
    jpeg.return_value = mock.Mock()
    camera.capture(Path('/tmp/000.jpg'))
    assert camera._run.call_count == 1
    assert camera._run.call_args_list[0][0][0].startswith('remoteshoot')
    assert camera._run.call_args_list[0][0][0].endswith('"/tmp/000"')
    # BUG FIX: `assert mock.called_once_with(...)` is always truthy, since
    # accessing an unknown attribute on a Mock merely creates a new child
    # mock -- the original assertions could never fail.  Use the real
    # assert_called_once_with() API instead.
    jpeg.assert_called_once_with('/tmp/000.jpg')
    assert jpeg.return_value.exif_orientation == 6
    jpeg.return_value.save.assert_called_once_with('/tmp/000.jpg')
Exemple #34
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the pages' best images (and hOCR text layers, if any)
            into a single ``book.pdf`` via the external `pdfbeads` tool.

        :param pages:       Pages to include in the PDF
        :param target_path: Directory the finished ``book.pdf`` is written to
        :param metadata:    Output metadata (currently unused, see TODOs)
        :param table_of_contents: TOC entries (currently unused, see TODOs)
        """
        logger.info("Assembling PDF.")

        tmpdir = Path(tempfile.mkdtemp())
        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))

        images = []
        for page in pages:
            # Prefer the latest processed image, fall back to the raw capture
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            # Symlink the image (and its hOCR file, if present) into the
            # working directory so pdfbeads can reference them by bare name
            link_path = (tmpdir / fpath.name)
            link_path.symlink_to(fpath)
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                (tmpdir / ocr_path.name).symlink_to(ocr_path)
            images.append(link_path)

        # TODO: Use metadata to create a METAFILE for pdfbeads
        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        pdf_file = target_path / "book.pdf"
        cmd = [find_in_path("pdfbeads"), "-d"]
        cmd.extend([f.name for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        last_count = 0
        # pdfbeads emits no usable progress output; the number of
        # intermediate *.jbig2 files it has produced serves as an estimate
        while proc.poll() is None:
            current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
            if current_count > last_count:
                last_count = current_count
                self.on_progressed.send(self,
                                        progress=float(current_count) /
                                        len(images))
            time.sleep(.01)
        logger.debug("Output:\n{0}".format(proc.stdout.read()))
        # Restore the working directory changed above.
        # NOTE(review): `tmpdir` is never removed here -- looks like a temp
        # file leak; confirm whether cleanup happens elsewhere.
        os.chdir(old_path)
    def process(self, pages, target_path):
        """ Run OCR over the most recent image of every page and store the
            resulting hOCR output files under *target_path*.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where the hOCR output files are
                            to be stored
        :type target_path:  :py:class:`pathlib.Path`
        """
        # TODO: This plugin should be 'output' only, since we ideally work
        #       with fully binarized output images

        # Remember which page each input image belongs to so the generated
        # OCR output files can be matched back to their pages afterwards
        page_by_input = {}
        for page in pages:
            img_path = page.get_latest_processed(image_only=True)
            if img_path is None:
                img_path = page.raw_image
            page_by_input[img_path] = page

        out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
        language = self.config["language"].get()

        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(page_by_input, out_dir, language)

        # Output files may carry either a *.hocr or a *.html extension,
        # so scan for both
        hocr_files = chain(out_dir.glob('*.hocr'), out_dir.glob('*.html'))
        for hocr_path in hocr_files:
            self._perform_replacements(hocr_path)
            # Find the input image sharing this output file's stem and
            # attach the copied result to that image's page
            matching_page = None
            for img_path, page in page_by_input.iteritems():
                if img_path.stem == hocr_path.stem:
                    matching_page = page
                    break
            if matching_page is None:
                logger.warn("Could not find page for output file {0}"
                            .format(hocr_path))
                continue
            target_fname = target_path/hocr_path.name
            shutil.copyfile(unicode(hocr_path), unicode(target_fname))
            matching_page.processed_images[self.__name__] = target_fname
Exemple #36
0
    def process(self, pages, target_path):
        """ Run OCR over the most recent image of every page and copy the
            resulting hOCR output files into *target_path*.

            (The body below performs OCR via ``_perform_ocr``; the previous
            docstring's mention of EXIF rotation did not match the code.)

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where processed images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        """
        # TODO: This plugin should be 'output' only, since we ideally work
        #       with fully binarized output images

        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[fpath] = page

        out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
        language = self.config["language"].get()

        logger.info("Performing OCR")
        logger.info("Language is \"{0}\"".format(language))
        self._perform_ocr(in_paths, out_dir, language)

        # Output files may carry either a *.hocr or a *.html extension,
        # so scan for both
        for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
            self._perform_replacements(fname)
            # For each hOCR file, try to find a corresponding input image
            # and associate it to the image's page
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if in_path.stem == out_stem:
                    target_fname = target_path / fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                # for/else: no input image shared this output file's stem
                logger.warn(
                    "Could not find page for output file {0}".format(fname))
Exemple #37
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the pages' best images (and hOCR text layers, if any)
            into a single ``book.pdf`` via the external `pdfbeads` tool.

        :param pages:       Pages to include in the PDF
        :param target_path: Directory the finished ``book.pdf`` is written to
        :param metadata:    Output metadata (currently unused, see TODOs)
        :param table_of_contents: TOC entries (currently unused, see TODOs)
        """
        logger.info("Assembling PDF.")

        tmpdir = Path(tempfile.mkdtemp())
        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))

        images = []
        for page in pages:
            # Prefer the latest processed image, fall back to the raw capture
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            # Symlink the image (and its hOCR file, if present) into the
            # working directory so pdfbeads can reference them by bare name
            link_path = (tmpdir/fpath.name)
            link_path.symlink_to(fpath)
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                (tmpdir/ocr_path.name).symlink_to(ocr_path)
            images.append(link_path)

        # TODO: Use metadata to create a METAFILE for pdfbeads
        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        pdf_file = target_path/"book.pdf"
        cmd = [find_in_path("pdfbeads"), "-d"]
        cmd.extend([f.name for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        last_count = 0
        # pdfbeads emits no usable progress output; the number of
        # intermediate *.jbig2 files it has produced serves as an estimate
        while proc.poll() is None:
            current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
            if current_count > last_count:
                last_count = current_count
                self.on_progressed.send(
                    self, progress=float(current_count)/len(images))
            time.sleep(.01)
        logger.debug("Output:\n{0}".format(proc.stdout.read()))
        # Restore the working directory changed above.
        # NOTE(review): `tmpdir` is never removed here -- looks like a temp
        # file leak; confirm whether cleanup happens elsewhere.
        os.chdir(old_path)
Exemple #38
0
 def create(cls, location, name, config=None, metadata=None):
     """ Create a new workflow and register it in the class-level cache.

     :param location:    Base directory the workflow should be created in
     :type location:     unicode or :py:class:`pathlib.Path`
     :param name:        Directory name for the new workflow
     :param config:      Initial configuration for the workflow
     :param metadata:    Initial metadata for the workflow
     :return:            The new instance
     :rtype:             :py:class:`Workflow`
     :raises ValidationError: if a workflow with that name already exists
     """
     if not isinstance(location, Path):
         location = Path(location)
     if (location / name).exists():
         raise ValidationError(
             name="A workflow with that name already exists")
     wf = cls(path=location / name, config=config, metadata=metadata)
     # `if not location in ...` is the non-idiomatic spelling of `not in`;
     # setdefault() collapses the check-then-insert into a single step
     cls._cache.setdefault(location, []).append(wf)
     return wf
Exemple #39
0
def test_process(call, plugin, tmpdir):
    """Exercise ScanTailor processing in autopilot and manual mode."""
    pages = [Page(Path('{0:03}.jpg'.format(i))) for i in xrange(5)]
    target_dir = Path(unicode(tmpdir))

    def fake_generate_output(projectfile, out_dir, num_pages):
        # Stand in for ScanTailor by touching one TIFF per page
        for page in pages:
            (out_dir/(page.raw_image.stem + '.tif')).touch()

    plugin._generate_configuration = mock.Mock()
    plugin._generate_output = fake_generate_output
    plugin.config['autopilot'] = True

    plugin.process(pages, target_dir)
    assert call.call_count == 0
    for page in pages:
        assert 'scantailor' in page.processed_images
        assert page.processed_images['scantailor'].parent == target_dir
        assert page.processed_images['scantailor'].exists()

    plugin.config['autopilot'] = False
    plugin.process(pages, target_dir)
    assert call.call_count == 1
Exemple #40
0
def copy_info(pkg, pkg_dir):
    """ Copy a package's egg-info/dist-info metadata into *pkg_dir*.

    :param pkg:     Name of the distribution to look up
    :param pkg_dir: Target directory for the metadata copy
    :raises IOError: if the distribution or its metadata cannot be found
    """
    try:
        dist = pkg_resources.get_distribution(pkg)
    except pkg_resources.DistributionNotFound:
        raise IOError("No distribution could be found for {0}!".format(pkg))
    if dist.location == os.getcwd():
        egg_name = dist.project_name
    else:
        egg_name = dist.egg_name()

    # Metadata lives either in <egg_name>.egg-info or
    # <name>-<version>.dist-info; take the first that exists
    base = Path(dist.location)
    candidates = (
        base / (egg_name + ".egg-info"),
        base / (dist.project_name + "-" + dist.version + ".dist-info"),
    )
    for candidate in candidates:
        if candidate.exists():
            src_path = candidate
            break
    else:
        raise IOError(
            "No egg-info or dist-info could be found for {0}!".format(pkg))

    dest = pkg_dir / src_path.name
    if src_path.is_dir():
        shutil.copytree(unicode(src_path), unicode(dest))
    else:
        shutil.copy2(unicode(src_path), unicode(dest))
def build_msi(bitness=32):
    """ Build a Windows installer for spreads via pynsist's InstallerBuilder.

    :param bitness: Target architecture, 32 or 64 (bit)
    """
    # Remove stale egg-info so metadata is regenerated from scratch
    egg_path = Path('spreads.egg-info')
    if egg_path.exists():
        shutil.rmtree(unicode(egg_path))
    build_path = Path('build')
    if not build_path.exists():
        build_path.mkdir()
    # Recreate the package staging directory from a clean slate
    # (presumably the directory pynsist reads pre-built packages from --
    # confirm against the pynsist docs)
    pkg_dir = build_path/'pynsist_pkgs'
    if pkg_dir.exists():
        shutil.rmtree(unicode(pkg_dir))
    pkg_dir.mkdir()
    # Unpack the binary packages matching the target architecture
    for pkg in BINARY_PACKAGES.itervalues():
        arch = 'win32' if bitness == 32 else 'win-amd64'
        extract_native_pkg(pkg.format(arch=arch), pkg_dir)

    # Copy the egg-info/dist-info metadata of every source package that
    # has a project name
    for pkg in (x.project_name for x in SOURCE_PACKAGES
                if x.project_name is not None):
        copy_info(pkg, pkg_dir)

    icon = os.path.abspath("spreads.ico")
    # 64-bit extra files carry an '.amd64' infix in their file names
    extra_files = [(unicode((Path('win_deps') / 'extra' /
                             x.format(arch='.amd64' if bitness == 64 else ''))
                            .absolute()), None) for x in EXTRA_FILES]
    nsi_template = os.path.abspath("template.nsi")

    # NOTE: We need to remove the working directory from sys.path to force
    # pynsist to copy all of our modules, including 'spreads' and 'spreadsplug'
    # from the site-packages. Additionally, we need to change into the
    # build directory.
    if os.getcwd() in sys.path:
        sys.path.remove(os.getcwd())
    os.chdir(unicode(build_path))
    builder = InstallerBuilder(
        appname="spreads",
        version=spreads.__version__,
        packages=[x.module_name for x in SOURCE_PACKAGES],
        extra_files=extra_files,
        py_version="2.7.6",
        py_bitness=bitness,
        build_dir='msi{0}'.format(bitness),
        installer_name=None,
        nsi_template=nsi_template,
        icon=icon,
        shortcuts={
            'Configure spreads': {
                'entry_point': 'spreads.main:run_config_windows',
                'icon': icon,
                'console': False},
            'Spreads Web Service': {
                'entry_point': 'spreads.main:run_service_windows',
                'icon': icon,
                'console': False}
        }
    )
    builder.run()
    # Return to the original parent directory after the build
    os.chdir('..')
Exemple #42
0
    def output(self, pages, target_path, metadata, table_of_contents):
        """ Bundle the pages into ``book.pdf`` (with text layer) via the
            external `pdfbeads` tool, handling Windows specially.

        :param pages:       Pages to include in the PDF
        :param target_path: Directory the finished ``book.pdf`` is written to
        :param metadata:    Output metadata; 'title' and 'creator' entries
                            are written into a pdfbeads metadata file
        :param table_of_contents: TOC entries (currently unused, see TODOs)
        """
        logger.info("Assembling PDF.")

        tmpdir = Path(tempfile.mkdtemp())

        # Write title/author information in the key-value format that the
        # pdfbeads -M option consumes
        meta_file = tmpdir/'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        images = []
        for page in pages:
            # Prefer the latest processed image, fall back to the raw capture
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            # On Windows files are copied instead of symlinked (symlinks
            # presumably unavailable/unreliable there -- confirm)
            link_path = (tmpdir/fpath.name)
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir/ocr_path.name))
                else:
                    (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        pdf_file = target_path.absolute()/"book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        old_path = os.path.abspath(os.path.curdir)
        os.chdir(unicode(tmpdir))

        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            # Pass a single wildcard pattern instead of every file name
            # (run with shell=True below; presumably to stay under the
            # Windows command-line length limit -- confirm)
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the error
            #       output gets huge, creating a deadlock. Hence, we go the
            #       safe way and use `communicate()`, though this means no
            #       progress notification for the user.
            output, errors = proc.communicate()
        else:
            # Estimate progress from the number of intermediate *.jbig2
            # files pdfbeads has produced so far
            last_count = 0
            while proc.poll() is None:
                current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                if current_count > last_count:
                    last_count = current_count
                    self.on_progressed.send(
                        self, progress=float(current_count)/len(images))
                time.sleep(.01)
            output = proc.stdout.read()
            errors = proc.stderr.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
        # Restore the working directory and clean up the staging area
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where rotated images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        :raises util.MissingDependencyException: if manual mode is requested
            but the `scantailor` executable is not installed
        """
        autopilot = self.config['autopilot'].get(bool)
        if not autopilot and not util.find_in_path('scantailor'):
            raise util.MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories
        projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        # Map input paths to their pages so we can more easily associate
        # the generated output files with their pages later on
        in_paths = {}
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            in_paths[unicode(fpath)] = page

        logger.info("Generating ScanTailor configuration")
        self._generate_configuration(sorted(in_paths.keys()),
                                     projectfile, out_dir)

        if not autopilot:
            logger.warn("If you are changing output settings (in the last "
                        "step, you *have* to run the last step from the GUI. "
                        "Due to a bug in ScanTailor, your settings would "
                        "otherwise be ignored.")
            time.sleep(5)
            logger.info("Opening ScanTailor GUI for manual adjustment")
            util.get_subprocess([GUI_BIN, unicode(projectfile)])
        # Check if the user already generated output files from the GUI
        # (idiom fix: `x != y` instead of `not x == y`)
        if sum(1 for x in out_dir.glob('*.tif')) != len(pages):
            logger.info("Generating output images from ScanTailor "
                        "configuration.")
            self._generate_output(projectfile, out_dir, len(pages))

        # Associate generated output files with our pages via their
        # matching file stems
        for fname in out_dir.glob('*.tif'):
            out_stem = fname.stem
            for in_path, page in in_paths.iteritems():
                if Path(in_path).stem == out_stem:
                    target_fname = target_path/fname.name
                    shutil.copyfile(unicode(fname), unicode(target_fname))
                    page.processed_images[self.__name__] = target_fname
                    break
            else:
                logger.warn("Could not find page for output file {0}"
                            .format(fname))

        # Remove temporary files/directories
        shutil.rmtree(unicode(out_dir))
        try:
            projectfile.unlink()
        except OSError as e:
            # BUG FIX: the original `except WindowsError` raises a NameError
            # on non-Windows platforms, where that name does not exist.
            # WindowsError is an OSError subclass, so catching OSError is
            # portable; only the Windows sharing violation (errno 32) caused
            # by a lingering non-gcable reference to the project file is
            # deliberately ignored -- anything else is re-raised instead of
            # being silently swallowed.
            if e.errno != 32:
                raise
def open_connection():
    """ Open a connection to the application database.

    The database file is created and initialized on first use.

    :return: An open :py:class:`sqlite3.Connection`
    """
    database_file = Path(app.config['database'])
    if not database_file.exists():
        initialize_database()
    return sqlite3.connect(unicode(database_file))