Exemple #1
0
class JobDocScan(Job):
    """
    Job scanning `nb_pages` pages from a scan source into a document.

    Each page is scanned (and OCR-ed) with `ImgDoc.scan_single_page()`,
    then indexed through `docsearch.index_page()`.

    Signals:
        scan-start: (page index, total) emitted before each page is scanned
        ocr-start: (page index, total) emitted when the OCR step starts
        scan-done: (page object, total) emitted once a page has been
            scanned and indexed
        scan-error: (exception) emitted when scanning a page failed; the
            job stops right after
    """

    __gsignals__ = {
        'scan-start': (GObject.SignalFlags.RUN_LAST, None,
                       # current page / total
                       (GObject.TYPE_INT, GObject.TYPE_INT)),
        'ocr-start': (GObject.SignalFlags.RUN_LAST, None,
                      # current page / total
                      (GObject.TYPE_INT, GObject.TYPE_INT)),
        'scan-done': (GObject.SignalFlags.RUN_LAST, None,
                      # scanned page (object) / total
                      (GObject.TYPE_PYOBJECT, GObject.TYPE_INT)),
        'scan-error': (GObject.SignalFlags.RUN_LAST, None,
                       # exception
                       (GObject.TYPE_PYOBJECT,)),
    }

    can_stop = True
    priority = 500

    def __init__(self, factory, id,
                 config, nb_pages, line_in_treeview, docsearch, doc,
                 scan_src):
        """
        Arguments:
            factory, id: forwarded as-is to Job.__init__()
            config: provides workdir, scanner_resolution,
                scanner_calibration and langs
            nb_pages: number of pages to scan
            line_in_treeview: stored on the job, not used by the job itself
            docsearch: index to which each scanned page is added
            doc: document receiving the pages; if None, a new ImgDoc is
                created when the job runs
            scan_src: scan source handed to ImgDoc.scan_single_page()
        """
        Job.__init__(self, factory, id)
        self.__config = config
        self.__scan_src = scan_src
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        # index of the page being scanned; None until do() has started
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Progress callback given to scan_single_page(): only used to tell
        when the OCR step begins.
        """
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self):
        """
        Scan, index and announce each page in turn.

        On any exception, 'scan-error' is emitted with the exception and
        the job returns without scanning the remaining pages.
        """
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            try:
                self.doc.scan_single_page(self.__scan_src,
                                          self.__config.scanner_resolution,
                                          self.__config.scanner_calibration,
                                          self.__config.langs,
                                          self.__progress_cb)
                page = self.doc.pages[self.doc.nb_pages - 1]
                self.docsearch.index_page(page)
                self.emit('scan-done', page, self.nb_pages)
            except Exception as exc:
                # StopIteration is a subclass of Exception: only the log
                # message differs between the two failure cases.
                if isinstance(exc, StopIteration):
                    logger.warning("Feeder appears to be empty and we "
                                   "haven't scanned all the pages yet !")
                else:
                    logger.error("Error: Exception: %s" % str(exc))
                self.emit('scan-error', exc)
                # wait for all the jobs to be cancelled
                self._wait(5.0, force=True)
                return
Exemple #2
0
 def import_doc(self, file_uri, config, docsearch, current_doc=None):
     """
     Import the image at `file_uri` as a new page and index it.

     Arguments:
         file_uri: location of the image, passed to ImgDoc.import_image()
         config: provides `workdir` and `ocrlang`
         docsearch: index to which the new page is added
         current_doc: document receiving the page; a new ImgDoc is
             created in config.workdir when it is None

     Returns:
         (document, page): the document used and the page just added
     """
     # Identity test: a document defining its own __eq__ must never be
     # mistaken for "no document".
     if current_doc is None:
         current_doc = ImgDoc(config.workdir)
     current_doc.import_image(file_uri, config.ocrlang)
     # the imported image is the last page of the document
     page = current_doc.pages[current_doc.nb_pages - 1]
     docsearch.index_page(page)
     return (current_doc, page)
Exemple #3
0
 def import_doc(self, file_uri, config, docsearch, current_doc=None):
     """
     Add the image located at `file_uri` to a document and index the
     resulting page.

     Arguments:
         file_uri: location of the image, passed to ImgDoc.import_image()
         config: provides `workdir` and `ocrlang`
         docsearch: index to which the new page is added
         current_doc: document to extend; when None, a new ImgDoc is
             created in config.workdir

     Returns:
         (document, page): the document used and the page just added
     """
     # `is None` rather than `== None`: equality could be overridden by
     # the document class, identity cannot.
     if current_doc is None:
         current_doc = ImgDoc(config.workdir)
     current_doc.import_image(file_uri, config.ocrlang)
     # the page just imported is the last one of the document
     page = current_doc.pages[current_doc.nb_pages - 1]
     docsearch.index_page(page)
     return (current_doc, page)
Exemple #4
0
class JobDocScan(Job):
    """
    Job that scans a fixed number of pages into a document.

    Every page goes through `ImgDoc.scan_single_page()` and is then added
    to the index with `docsearch.index_page()`.

    Signals:
        scan-start: (page index, total) emitted before scanning a page
        ocr-start: (page index, total) emitted when the OCR step starts
        scan-done: (page object, total) emitted after a page has been
            scanned and indexed
        scan-error: (exception) emitted when a page could not be scanned;
            the job stops afterwards
    """

    __gsignals__ = {
        'scan-start': (GObject.SignalFlags.RUN_LAST, None,
                       # current page / total
                       (GObject.TYPE_INT, GObject.TYPE_INT)),
        'ocr-start': (GObject.SignalFlags.RUN_LAST, None,
                      # current page / total
                      (GObject.TYPE_INT, GObject.TYPE_INT)),
        'scan-done': (GObject.SignalFlags.RUN_LAST, None,
                      # scanned page (object) / total
                      (GObject.TYPE_PYOBJECT, GObject.TYPE_INT)),
        'scan-error': (GObject.SignalFlags.RUN_LAST, None,
                       # exception
                       (GObject.TYPE_PYOBJECT,)),
    }

    can_stop = True
    priority = 500

    def __init__(self, factory, id,
                 config, nb_pages, line_in_treeview, docsearch, doc,
                 scan_src):
        """
        Arguments:
            factory, id: forwarded as-is to Job.__init__()
            config: provides workdir, scanner_resolution,
                scanner_calibration and langs
            nb_pages: number of pages to scan
            line_in_treeview: kept as an attribute; the job does not use it
            docsearch: index to which each scanned page is added
            doc: target document; if None, a new ImgDoc is created when
                the job runs
            scan_src: scan source handed to ImgDoc.scan_single_page()
        """
        Job.__init__(self, factory, id)
        self.__config = config
        self.__scan_src = scan_src
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        # index of the page being scanned; None until do() has started
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Callback passed to scan_single_page(); signals the beginning of
        the OCR step and ignores every other progress report.
        """
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self):
        """
        Scan the pages one after the other.

        If scanning or indexing a page raises, 'scan-error' is emitted
        with the exception and the remaining pages are not scanned.
        """
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            try:
                self.doc.scan_single_page(self.__scan_src,
                                          self.__config.scanner_resolution,
                                          self.__config.scanner_calibration,
                                          self.__config.langs,
                                          self.__progress_cb)
                page = self.doc.pages[self.doc.nb_pages - 1]
                self.docsearch.index_page(page)
                self.emit('scan-done', page, self.nb_pages)
            except Exception as exc:
                # An empty feeder (StopIteration) and any other failure
                # are handled the same way, apart from the log message.
                if isinstance(exc, StopIteration):
                    logger.warning("Feeder appears to be empty and we "
                                   "haven't scanned all the pages yet !")
                else:
                    logger.error("Error: Exception: %s" % str(exc))
                self.emit('scan-error', exc)
                # wait for all the jobs to be cancelled
                self._wait(5.0, force=True)
                return
Exemple #5
0
 def do(self):
     """
     Scan `self.nb_pages` pages into `self.doc` and index each of them.

     A new ImgDoc is created in the configured work directory when no
     document was provided. 'scan-start' is emitted before each page and
     'scan-done' after it has been indexed. If anything raises while a
     page is processed, 'scan-error' is emitted with the exception and
     the method returns without scanning the remaining pages.
     """
     if self.doc is None:
         self.doc = ImgDoc(self.__config.workdir)
     for self.current_page in range(0, self.nb_pages):
         self.emit('scan-start', self.current_page, self.nb_pages)
         try:
             self.doc.scan_single_page(self.__scan_src,
                                       self.__config.scanner_resolution,
                                       self.__config.scanner_calibration,
                                       self.__config.langs,
                                       self.__progress_cb)
             # the page just scanned is the last one of the document
             page = self.doc.pages[self.doc.nb_pages - 1]
             self.docsearch.index_page(page)
             self.emit('scan-done', page, self.nb_pages)
         except Exception as exc:
             # StopIteration is a subclass of Exception: only the log
             # message differs between the two failure cases.
             if isinstance(exc, StopIteration):
                 logger.warning("Feeder appears to be empty and we "
                                "haven't scanned all the pages yet !")
             else:
                 logger.error("Error: Exception: %s" % str(exc))
             self.emit('scan-error', exc)
             # wait for all the jobs to be cancelled
             self._wait(5.0, force=True)
             return
Exemple #6
0
class DocScanWorker(Worker):
    """
    Worker scanning `nb_pages` pages into a document and indexing them.

    Signals:
        scan-start: (page index, total) emitted before scanning a page
        ocr-start: (page index, total) emitted when the OCR step starts
        scan-done: (page object, total) emitted once a page has been
            scanned and indexed
    """

    __gsignals__ = {
        'scan-start': (
            GObject.SignalFlags.RUN_LAST,
            None,
            # current page / total
            (GObject.TYPE_INT, GObject.TYPE_INT)),
        'ocr-start': (
            GObject.SignalFlags.RUN_LAST,
            None,
            # current page / total
            (GObject.TYPE_INT, GObject.TYPE_INT)),
        'scan-done': (
            GObject.SignalFlags.RUN_LAST,
            None,
            # scanned page (object) / total
            (GObject.TYPE_PYOBJECT, GObject.TYPE_INT)),
    }

    can_interrupt = True

    def __init__(self,
                 config,
                 nb_pages,
                 line_in_treeview,
                 docsearch,
                 doc=None):
        """
        Arguments:
            config: provides workdir, scanner_resolution,
                scanner_calibration and ocrlang
            nb_pages: number of pages to scan
            line_in_treeview: integer used in the worker name
            docsearch: index to which each scanned page is added
            doc: target document; if None, a new ImgDoc is created when
                the worker runs
        """
        Worker.__init__(self, "Document scanner (doc %d)" % (line_in_treeview))
        self.__config = config
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        # index of the page being scanned; None when no scan is running
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Callback passed to scan_single_page().

        Raises an exception to abort the scan once `self.can_run` is
        false, and signals the beginning of the OCR step.
        """
        if not self.can_run:
            raise Exception("Scan interrupted")
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self, scan_src):
        """
        Scan `self.nb_pages` pages from `scan_src`, indexing each one.

        Exceptions (including the interruption raised by the progress
        callback) are not caught here; in that case `current_page` keeps
        the index of the page that failed.
        """
        # identity test: a document class may override __eq__
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            self.doc.scan_single_page(scan_src,
                                      self.__config.scanner_resolution,
                                      self.__config.scanner_calibration,
                                      self.__config.ocrlang,
                                      self.__progress_cb)
            # the page just scanned is the last one of the document
            page = self.doc.pages[self.doc.nb_pages - 1]
            self.docsearch.index_page(page)
            self.emit('scan-done', page, self.nb_pages)
        self.current_page = None
Exemple #7
0
 def import_doc(file_uri, config, docsearch, current_doc=None):
     """
     Import the image found at `file_uri` as a new page and index it.

     The page is appended to `current_doc`, or to a new ImgDoc created in
     `config.workdir` when no document is given.

     Returns:
         (document, page): the document used and the page just added
     """
     logger.info("Importing doc '%s'" % (file_uri))
     doc = current_doc if current_doc is not None else ImgDoc(config.workdir)
     doc.import_image(file_uri, config.langs)
     # the imported image is the last page of the document
     all_pages = doc.pages
     new_page = all_pages[doc.nb_pages - 1]
     docsearch.index_page(new_page)
     return (doc, new_page)
Exemple #8
0
 def import_doc(file_uri, config, docsearch, current_doc=None):
     """
     Add the image located at `file_uri` to a document, then index the
     resulting page.

     When `current_doc` is None, a new ImgDoc is created in
     `config.workdir` to hold the page.

     Returns:
         (document, page): the document used and the page just added
     """
     logger.info("Importing doc '%s'" % (file_uri))
     target = current_doc
     if target is None:
         target = ImgDoc(config.workdir)
     target.import_image(file_uri, config.langs)
     # the page just imported sits at the end of the document
     pages = target.pages
     imported = pages[target.nb_pages - 1]
     docsearch.index_page(imported)
     return (target, imported)
Exemple #9
0
 def import_doc(file_uri, config, docsearch, current_doc=None):
     """
     Import the specified image as a new page of a document.

     Arguments:
         file_uri: path of the image, or a "file://" URI pointing to it
         config: provides the work directory
             (config.settings['workdir'].value)
         docsearch: not used by this implementation
         current_doc: document receiving the page; a new ImgDoc is
             created in the work directory when it is None

     Returns:
         ([document], page, new): the document used (in a list), the page
         added, and the value `document.is_new` had before the page was
         added
     """
     # Local import so that the block works on both Python 2 and 3
     try:
         from urllib.parse import unquote
     except ImportError:
         from urllib import unquote

     logger.info("Importing doc '%s'" % (file_uri))
     if current_doc is None:
         current_doc = ImgDoc(config.settings['workdir'].value)
     new = current_doc.is_new
     if file_uri[:7] == "file://":
         # XXX(Jflesch): bad bad bad (crude URI parsing)
         # URIs are percent-encoded ("my%20scan.png"): decode the path,
         # otherwise files with spaces or non-ASCII names can't be opened
         file_uri = unquote(file_uri[7:])
     img = Image.open(file_uri)
     page = current_doc.add_page(img, [])
     return ([current_doc], page, new)
Exemple #10
0
 def do(self, scan_src):
     """
     Scan `self.nb_pages` pages from `scan_src` into `self.doc`.

     A new ImgDoc is created in the configured work directory when no
     document was provided. For each page, 'scan-start' is emitted, the
     page is scanned and indexed, then 'scan-done' is emitted with the
     page object. Exceptions are not caught here; `current_page` is only
     reset to None when every page has been scanned.
     """
     # identity test: a document class may override __eq__
     if self.doc is None:
         self.doc = ImgDoc(self.__config.workdir)
     for self.current_page in range(0, self.nb_pages):
         self.emit('scan-start', self.current_page, self.nb_pages)
         self.doc.scan_single_page(scan_src,
                                   self.__config.scanner_resolution,
                                   self.__config.scanner_calibration,
                                   self.__config.ocrlang,
                                   self.__progress_cb)
         # the page just scanned is the last one of the document
         page = self.doc.pages[self.doc.nb_pages - 1]
         self.docsearch.index_page(page)
         self.emit('scan-done', page, self.nb_pages)
     self.current_page = None
Exemple #11
0
 def import_doc(file_uri, docsearch, current_doc=None):
     """
     Load the image designated by `file_uri` and add it as a page.

     `file_uri` may be a plain path or a "file://" URI. The page is added
     to `current_doc`, or to a new ImgDoc created in `docsearch.rootdir`
     when no document is given.

     Returns:
         ([document], page, new): the document used (in a list), the page
         added, and the value `document.is_new` had before the page was
         added
     """
     logger.info("Importing doc '%s'" % (file_uri))
     doc = current_doc if current_doc is not None \
         else ImgDoc(docsearch.rootdir)
     was_new = doc.is_new
     path = file_uri
     if path.startswith("file://"):
         # XXX(Jflesch): bad bad bad
         path = urllib.unquote(path[7:])
     added_page = doc.add_page(Image.open(path), [])
     return ([doc], added_page, was_new)
Exemple #12
0
 def do(self):
     """
     Scan the requested number of pages and index each of them.

     A new ImgDoc is created in the configured work directory if the job
     has no document yet. 'scan-start' precedes every page; 'scan-done'
     follows its indexing. Any exception raised while a page is processed
     is reported through 'scan-error', and the remaining pages are
     skipped.
     """
     if self.doc is None:
         self.doc = ImgDoc(self.__config.workdir)
     for self.current_page in range(0, self.nb_pages):
         self.emit('scan-start', self.current_page, self.nb_pages)
         try:
             self.doc.scan_single_page(self.__scan_src,
                                       self.__config.scanner_resolution,
                                       self.__config.scanner_calibration,
                                       self.__config.langs,
                                       self.__progress_cb)
             # the page just scanned is the last one of the document
             page = self.doc.pages[self.doc.nb_pages - 1]
             self.docsearch.index_page(page)
             self.emit('scan-done', page, self.nb_pages)
         except Exception as exc:
             # An empty feeder (StopIteration) and any other failure are
             # handled the same way, apart from the log message.
             if isinstance(exc, StopIteration):
                 logger.warning("Feeder appears to be empty and we "
                                "haven't scanned all the pages yet !")
             else:
                 logger.error("Error: Exception: %s" % str(exc))
             self.emit('scan-error', exc)
             # wait for all the jobs to be cancelled
             self._wait(5.0, force=True)
             return
Exemple #13
0
class DocScanWorker(Worker):
    """
    Worker scanning `nb_pages` pages into a document and indexing them.

    Signals (all carrying: page index, total number of pages):
        scan-start: emitted before scanning a page
        ocr-start: emitted when the OCR step of a page starts
        scan-done: emitted once a page has been scanned and indexed
    """

    __gsignals__ = {
        'scan-start': (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
                       # current page / total
                       (gobject.TYPE_INT, gobject.TYPE_INT)),
        'ocr-start': (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
                      # current page / total
                      (gobject.TYPE_INT, gobject.TYPE_INT)),
        'scan-done': (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
                      # current page / total
                      (gobject.TYPE_INT, gobject.TYPE_INT)),
    }

    can_interrupt = True

    def __init__(self, config, nb_pages, line_in_treeview, docsearch,
                 doc=None):
        """
        Arguments:
            config: provides workdir, scanner_resolution,
                scanner_calibration and ocrlang
            nb_pages: number of pages to scan
            line_in_treeview: integer used in the worker name
            docsearch: index to which each scanned page is added
            doc: target document; if None, a new ImgDoc is created when
                the worker runs
        """
        Worker.__init__(self, "Document scanner (doc %d)" % (line_in_treeview))
        self.__config = config
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        # index of the page being scanned; None when no scan is running
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Callback passed to scan_single_page().

        Raises an exception to abort the scan once `self.can_run` is
        false, and signals the beginning of the OCR step.
        """
        if not self.can_run:
            raise Exception("Scan interrupted")
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self, scan_src):
        """
        Scan `self.nb_pages` pages from `scan_src`, indexing each one.

        Exceptions (including the interruption raised by the progress
        callback) are not caught here; in that case `current_page` keeps
        the index of the page that failed.
        """
        # identity test: a document class may override __eq__
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            self.doc.scan_single_page(scan_src,
                                      self.__config.scanner_resolution,
                                      self.__config.scanner_calibration,
                                      self.__config.ocrlang,
                                      self.__progress_cb)
            # the page just scanned is the last one of the document
            page = self.doc.pages[self.doc.nb_pages - 1]
            self.docsearch.index_page(page)
            self.emit('scan-done', self.current_page, self.nb_pages)
        self.current_page = None
Exemple #14
0
    def examine_rootdir(self,
                        on_new_doc,
                        on_doc_modified,
                        on_doc_deleted,
                        on_doc_unchanged,
                        progress_cb=dummy_progress_cb):
        """
        Compare the content of the root directory with the index.

        Every entry of the root directory that can be turned into a
        document is reported through exactly one of on_new_doc(doc),
        on_doc_modified(doc) or on_doc_unchanged(doc). Documents known to
        the index but absent from the directory are reported through
        on_doc_deleted(doc), `doc` being an ImgDoc built from the expected
        path of the missing document.

        progress_cb(current, total, step[, doc]) is called after each
        directory entry that produced a document, and once more when the
        examination is over.
        """
        # what the index currently knows: docid --> (doctype, last_read)
        hits = self.__searcher.search(whoosh.query.Every(), limit=None)
        indexed_infos = dict(
            (hit['docid'], (hit['doctype'], hit['last_read']))
            for hit in hits
        )
        # indexed docids not (yet) seen in the root directory
        unseen_docids = set(indexed_infos)

        # and compare it to the current directory content
        entries = os.listdir(self.docsearch.rootdir)
        nb_entries = len(entries)
        done = 0
        for entry in entries:
            known = indexed_infos.get(entry)
            doctype = known[0] if known is not None else None
            doc = self.docsearch.get_doc_from_docid(entry, doctype, inst=True)
            if doc is None:
                # not a document: ignored (and not counted in the progress)
                continue
            if entry not in unseen_docids:
                on_new_doc(doc)
            else:
                unseen_docids.remove(entry)
                assert(known is not None)
                last_mod = datetime.datetime.fromtimestamp(doc.last_mod)
                if known[1] != last_mod:
                    on_doc_modified(doc)
                else:
                    on_doc_unchanged(doc)
            progress_cb(done, nb_entries,
                        DocSearch.INDEX_STEP_CHECKING, doc)
            done += 1

        # whatever is left is in the index but not on disk anymore
        for docid in unseen_docids:
            # Will be a document with 0 pages
            gone_path = os.path.join(self.docsearch.rootdir, docid)
            gone_doc = ImgDoc(gone_path, docid,
                              label_store=self.docsearch.label_store)
            on_doc_deleted(gone_doc)

        progress_cb(1, 1, DocSearch.INDEX_STEP_CHECKING)
Exemple #15
0
 def do(self, scan_src):
     """
     Scan `self.nb_pages` pages from `scan_src` into `self.doc`.

     A new ImgDoc is created in the configured work directory when no
     document was provided. For each page, 'scan-start' is emitted, the
     page is scanned and indexed, then 'scan-done' is emitted with the
     page index. Exceptions are not caught here; `current_page` is only
     reset to None when every page has been scanned.
     """
     # identity test: a document class may override __eq__
     if self.doc is None:
         self.doc = ImgDoc(self.__config.workdir)
     for self.current_page in range(0, self.nb_pages):
         self.emit('scan-start', self.current_page, self.nb_pages)
         self.doc.scan_single_page(scan_src,
                                   self.__config.scanner_resolution,
                                   self.__config.scanner_calibration,
                                   self.__config.ocrlang,
                                   self.__progress_cb)
         # the page just scanned is the last one of the document
         page = self.doc.pages[self.doc.nb_pages - 1]
         self.docsearch.index_page(page)
         self.emit('scan-done', self.current_page, self.nb_pages)
     self.current_page = None
Exemple #16
0
def main(src_dir, dst_dir):
    """
    Clone the image document stored in `src_dir` into `dst_dir`, passing
    its content through a generated character mapping and salt.

    Arguments:
        src_dir: directory of the document to read
        dst_dir: directory of the document to create; os.mkdir() raises
            if it already exists

    Raises:
        Exception: if the source document has no pages
    """
    # os.path.basename() returns '' for a path ending with a separator
    # ("docs/20130101/"): normalize first so the document id is never
    # empty.
    src_docid = os.path.basename(os.path.normpath(src_dir))
    dst_docid = os.path.basename(os.path.normpath(dst_dir))

    sys.stdout.write("Loading document %s ... " % src_dir)
    sys.stdout.flush()
    src_doc = ImgDoc(src_dir, src_docid)
    sys.stdout.write("Done\n")

    if src_doc.nb_pages <= 0:
        raise Exception("No pages found. Is this an image doc ?")

    sys.stdout.write("Analyzing document ... ")
    sys.stdout.flush()
    chars = get_chars(src_doc)
    sys.stdout.write("Done\n")

    sys.stdout.write("Generating salt ... ")
    sys.stdout.flush()
    salt = gen_salt()
    sys.stdout.write("Done\n")
    print("Will use [%s] as salt for the hash" % salt)

    sys.stdout.write("Generating char mapping ... ")
    sys.stdout.flush()
    mapping = generate_mapping(chars)
    sys.stdout.write("Done\n")

    print_mapping(mapping)

    os.mkdir(dst_dir)

    sys.stdout.write("Generating document %s ... " % dst_dir)
    sys.stdout.flush()
    dst_doc = ImgDoc(dst_dir, dst_docid)
    clone_doc_content(src_doc, dst_doc, mapping, salt)
    sys.stdout.write("... Done\n")

    print("All done")
Exemple #17
0
    def __init__(self, main_win, config, widget_tree):
        """
        Set up the document list: widgets, actions, data model, thumbnail
        job factory, drag'n'drop and accelerators.

        Arguments:
            main_win: main window object; its `window` attribute receives
                the accelerator group
            config: configuration; config['workdir'].value is where the
                "new document" placeholder is created
            widget_tree: object from which the widgets are looked up by
                name (get_object())
        """
        self.__main_win = main_win
        self.__config = config

        # thumbnail built with the small thumbnail dimensions
        # NOTE(review): presumably shown until the real thumbnail of a
        # document is available --- confirm against the users of
        # `default_thumbnail`
        self.default_thumbnail = self.__init_default_thumbnail(
            JobDocThumbnailer.SMALL_THUMBNAIL_WIDTH,
            JobDocThumbnailer.SMALL_THUMBNAIL_HEIGHT)

        self.gui = {
            'list': widget_tree.get_object("listboxDocList"),
            'box': widget_tree.get_object("doclist_box"),
            'scrollbars': widget_tree.get_object("scrolledwindowDocList"),
            'spinner': SpinnerAnimation((0, 0)),
        }
        # extra canvas, hidden by default, appended to the list box
        self.gui['loading'] = Canvas(self.gui['scrollbars'])
        self.gui['loading'].set_visible(False)
        self.gui['box'].add(self.gui['loading'])
        # the handler is not run from the signal itself: it is queued with
        # GLib.idle_add()
        self.gui['scrollbars'].connect(
            "size-allocate",
            lambda x, s: GLib.idle_add(self._on_size_allocate)
        )

        # action name --> ([widgets the action is attached to], action)
        self.actions = {
            'open_doc': (
                [
                    self.gui['list'],
                ],
                ActionOpenSelectedDocument(main_win, config, self)
            ),
        }
        connect_actions(self.actions)

        self.model = {
            'has_new': False,
            'by_row': {},  # Gtk.ListBoxRow: docid
            'by_id': {},  # docid: Gtk.ListBoxRow
            # keep the thumbnails in cache
            'thumbnails': {}  # docid: pixbuf
        }
        # document object created in the work directory
        self.new_doc = ImgDoc(config['workdir'].value)

        self.job_factories = {
            'doc_thumbnailer': JobFactoryDocThumbnailer(self),
        }
        # no document selected at start
        self.selected_doc = None

        # same as for "size-allocate": the handler is queued with
        # GLib.idle_add()
        self.gui['scrollbars'].get_vadjustment().connect(
            "value-changed",
            lambda v: GLib.idle_add(self._on_scrollbar_value_changed)
        )

        # drag'n'drop: the list is a drop destination accepting text
        # targets, with the MOVE action
        self.gui['list'].connect("drag-motion", self._on_drag_motion)
        self.gui['list'].connect("drag-leave", self._on_drag_leave)
        self.gui['list'].connect(
            "drag-data-received",
            self._on_drag_data_received
        )
        self.gui['list'].drag_dest_set(
            Gtk.DestDefaults.ALL,
            [], Gdk.DragAction.MOVE
        )
        self.gui['list'].drag_dest_add_text_targets()

        # accelerator group registered on the main window
        self.accel_group = Gtk.AccelGroup()
        self.__main_win.window.add_accel_group(self.accel_group)

        self.show_loading()
Exemple #18
0
 def get_new_doc(self):
     """
     Return the "new document" placeholder.

     The current placeholder is returned as long as its `is_new` flag is
     set; otherwise it is first replaced by a new ImgDoc created in the
     configured work directory.
     """
     if not self.new_doc.is_new:
         # the previous placeholder is not new anymore: replace it
         self.new_doc = ImgDoc(self.__config['workdir'].value)
     return self.new_doc