def main(argv):
    """Run the screenshot methods registered in SCREENSHOTS.

    argv: command line, argv[0] being the program name. Any further
        items are names of screenshots to take; when none is given, the
        output directory is wiped first and every screenshot is taken.

    A fresh PaperworkInstance is started for each screenshot and is
    always stopped afterwards, even if the screenshot method raises.
    """
    args = argv[1:]

    if not args:
        # Full run: start from an empty output directory.
        rm_rf(OUT_DIRECTORY)
    # NOTE(review): the directory is (re)created for partial runs too,
    # which is why an already existing directory must be tolerated.
    try:
        os.mkdir(OUT_DIRECTORY)
    except OSError:
        # Only "already there" is acceptable; anything else (permissions,
        # missing parent, ...) would make every screenshot fail later on.
        if not os.path.isdir(OUT_DIRECTORY):
            raise

    for (sc_name, sc_method) in SCREENSHOTS.items():
        if args and sc_name not in args:
            continue

        paperwork_inst = paperwork.PaperworkInstance()
        paperwork_inst.start()
        try:
            # weird workaround for annoying bug
            GLib.idle_add(
                paperwork_inst.main_window.doclist._on_scrollbar_value_changed
            )
            paperwork_inst.wait()
            time.sleep(1)
            paperwork_inst.wait()

            sc_method(paperwork_inst)
            paperwork_inst.wait()
        finally:
            paperwork_inst.stop()
def main():
    """Replay the configured work directory into a temporary one.

    Every document of the source work directory is re-imported, file by
    file, into a throw-away work directory and index. The first imported
    file of a document goes through label_guess(); later files only
    trigger an index update. Once a document is fully imported,
    fix_labels() is called with the source and destination documents.

    The temporary directories are always removed and print_stats() is
    always called, whether the replay succeeds or not.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    try:
        # Built inside the try block so that a failure while opening or
        # loading the destination index still removes the temp dirs.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
        dst_dsearch.reload_index()

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = sorted(os.listdir(src_doc.path))

            current_doc = None
            dst_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc
                )
                if not importers:
                    continue
                assert len(importers) == 1
                importer = importers[0]
                (docs, _, _) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc
                )
                dst_doc = docs[0]

                for page_nb in range(dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Reuse the boxes of the source page.
                        # NOTE(review): cache dropped only for editable
                        # documents -- confirm this is the intent.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]

            # dst_doc stays None when no file of the document was imported
            if dst_doc is not None:
                fix_labels(dst_dsearch, src_doc, dst_doc)
    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        # NOTE(review): stats are printed on the cleanup path, so partial
        # results show up even after a failure.
        print_stats()
def main():
    """Replay the configured work directory into a temporary one.

    Every document of the source work directory is re-imported, file by
    file, into a throw-away work directory and index (both DocSearch
    objects are created with use_default_index_client=False). The first
    file imported for a document goes through label_guess() and
    fix_labels(); later files only trigger an index update.

    The temporary directories are always removed and print_stats() is
    always called, whether the replay succeeds or not.
    """
    # enable_logging()
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir, use_default_index_client=False)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print(
        "Destination directories : {} | {}".format(dst_doc_dir, dst_index_dir)
    )

    try:
        # Built inside the try block so that a failure while opening or
        # loading the destination index still removes the temp dirs.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir,
                                          use_default_index_client=False)
        dst_dsearch.reload_index()

        print("Testing ...")

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)
        print("Number of documents: {}".format(len(documents)))

        for src_doc in documents:
            print("Document [{}] | [{}]".format(src_doc.docid, src_doc.path))
            files = sorted(g_fs.listdir(src_doc.path))

            current_doc = None
            for filepath in files:
                print("File: {}".format(filepath))
                filename = g_fs.basename(filepath)
                # thumbnails and the label file are not importable content
                if "thumb" in filename or "labels" == filename:
                    continue
                importers = docimport.get_possible_importers(
                    [filepath], current_doc=current_doc
                )
                if not importers:
                    continue
                print("Importer(s): {}".format(", ".join(
                    str(x) for x in importers
                )))
                assert len(importers) == 1
                importer = importers[0]
                result = importer.import_doc(
                    [filepath], dst_dsearch, current_doc
                )
                print("Import result: {}".format(str(result.get())))

                if current_doc is None:
                    # First file of the document: it must have created
                    # a new destination document.
                    if not result.new_docs:
                        print("Nothing imported ?!")
                        continue
                    dst_doc = result.new_docs[0]
                else:
                    dst_doc = current_doc

                for page_nb in range(dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Reuse the boxes of the source page.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc
            print("")
    finally:
        print("---")
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        # NOTE(review): stats are printed on the cleanup path, so partial
        # results show up even after a failure.
        print_stats()
def run_simulation(src_dsearch, min_yes, csvwriter):
    """Replay the documents of `src_dsearch` with a given `min_yes`.

    src_dsearch: document search object whose `docs` are re-imported,
        file by file, into a temporary work directory and index.
    min_yes: value assigned to the destination label guesser's
        `min_yes` attribute before the replay starts.
    csvwriter: object providing writerow(); it receives one row
        [min_yes, nb_documents, perfect] once the replay ends (also
        when it ends with an exception). The write is done while
        holding g_lock.

    The `stats` counters are handed to fix_labels() for the first file
    imported for each document and to print_stats() at the end.
    The temporary directories are always removed.
    """
    stats = {
        'nb_documents': 0,
        'correct_guess': 0,
        'missing_guess': 0,
        'wrong_guess': 0,
        'nb_src_labels': 0,
        'nb_dst_labels': 0,
        'perfect': 0,
    }

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))
    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
    dst_dsearch.reload_index()

    dst_dsearch.label_guesser.min_yes = min_yes

    try:
        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            files = sorted(os.listdir(src_doc.path))

            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc
                )
                if not importers:
                    continue
                assert len(importers) == 1
                importer = importers[0]
                (docs, _, _) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc
                )
                dst_doc = docs[0]

                for page_nb in range(dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Reuse the boxes of the source page.
                        # NOTE(review): cache dropped only for editable
                        # documents -- confirm this is the intent.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(stats, dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        try:
            # The lock is released even if writerow() raises.
            with g_lock:
                csvwriter.writerow([
                    min_yes,
                    stats['nb_documents'],
                    stats['perfect'],
                ])
        finally:
            # Remove the temp dirs even when the CSV row can't be written.
            rm_rf(dst_doc_dir)
            rm_rf(dst_index_dir)
        # NOTE(review): stats are printed on the cleanup path, so partial
        # results show up even after a failed replay.
        print_stats(stats)
def run_simulation(
    src_dsearch, min_yes, csvwriter
):
    """Replay the documents of `src_dsearch` with a given `min_yes`.

    src_dsearch: document search object whose `docs` are re-imported,
        file by file, into a temporary work directory and index.
    min_yes: value assigned to the destination label guesser's
        `min_yes` attribute before the replay starts.
    csvwriter: object providing writerow(); it receives one row
        [min_yes, nb_documents, perfect] once the replay ends (also
        when it ends with an exception). The write is done while
        holding g_lock.

    The `stats` counters are handed to fix_labels() for the first file
    imported for each document and to print_stats() at the end.
    The temporary directories are always removed.
    """
    stats = {
        'nb_documents': 0,
        'correct_guess': 0,
        'missing_guess': 0,
        'wrong_guess': 0,
        'nb_src_labels': 0,
        'nb_dst_labels': 0,
        'perfect': 0,
    }

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print(
        "Destination directories : {} | {}".format(dst_doc_dir, dst_index_dir)
    )
    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
    dst_dsearch.reload_index()

    dst_dsearch.label_guesser.min_yes = min_yes

    try:
        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            files = sorted(os.listdir(src_doc.path))

            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc
                )
                if not importers:
                    continue
                assert len(importers) == 1
                importer = importers[0]
                (docs, _, _) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc
                )
                dst_doc = docs[0]

                for page_nb in range(dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Reuse the boxes of the source page.
                        # NOTE(review): cache dropped only for editable
                        # documents -- confirm this is the intent.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(stats, dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        try:
            with g_lock:
                csvwriter.writerow([
                    min_yes,
                    stats['nb_documents'],
                    stats['perfect'],
                ])
        finally:
            # Remove the temp dirs even when the CSV row can't be written.
            rm_rf(dst_doc_dir)
            rm_rf(dst_index_dir)
        # NOTE(review): stats are printed on the cleanup path, so partial
        # results show up even after a failed replay.
        print_stats(stats)
def main():
    """Simulate label guessing by re-importing the whole work directory.

    Source documents are read from the work directory found in the
    Paperwork configuration, sorted by document id, and imported one
    file at a time into a temporary work directory with its own index.
    label_guess() and fix_labels() run after the first imported file of
    each document; upd_index() runs after each following file.

    Whatever happens, the temporary directories are deleted and
    print_stats() is called before returning or propagating an error.
    """
    # enable_logging()
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir, use_default_index_client=False)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    try:
        # The destination index is set up under the try block: if it
        # fails, the freshly created temp dirs are still cleaned up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir,
                                          use_default_index_client=False)
        dst_dsearch.reload_index()

        print("Testing ...")

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)
        print("Number of documents: {}".format(len(documents)))

        for src_doc in documents:
            print("Document [{}] | [{}]".format(src_doc.docid, src_doc.path))
            files = sorted(g_fs.listdir(src_doc.path))

            current_doc = None
            for filepath in files:
                print("File: {}".format(filepath))
                filename = g_fs.basename(filepath)
                # skip thumbnails and the label file
                if "thumb" in filename or "labels" == filename:
                    continue
                importers = docimport.get_possible_importers(
                    [filepath], current_doc=current_doc)
                if not importers:
                    continue
                print("Importer(s): {}".format(", ".join(
                    str(x) for x in importers)))
                assert len(importers) == 1
                importer = importers[0]
                result = importer.import_doc([filepath], dst_dsearch,
                                             current_doc)
                print("Import result: {}".format(str(result.get())))

                if current_doc is None:
                    # Nothing to work on if the very first import did
                    # not produce a document.
                    if not result.new_docs:
                        print("Nothing imported ?!")
                        continue
                    dst_doc = result.new_docs[0]
                else:
                    dst_doc = current_doc

                for page_nb in range(dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Reuse the boxes of the source page.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc
            print("")
    finally:
        print("---")
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        # NOTE(review): stats are printed on the cleanup path, so partial
        # results show up even after a failure.
        print_stats()
def main():
    """Simulate label guessing by re-importing the whole work directory.

    Source documents are read from the work directory found in the
    Paperwork configuration, sorted by document id, and imported one
    file at a time into a temporary work directory with its own index.
    label_guess() runs after the first imported file of a document,
    upd_index() after each following file, and fix_labels() once per
    document for which at least one file was imported.

    Whatever happens, the temporary directories are deleted and
    print_stats() is called before returning or propagating an error.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print(
        "Destination directories : {} | {}".format(dst_doc_dir, dst_index_dir)
    )

    try:
        # The destination index is set up under the try block: if it
        # fails, the freshly created temp dirs are still cleaned up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
        dst_dsearch.reload_index()

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = sorted(os.listdir(src_doc.path))

            current_doc = None
            dst_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc
                )
                if not importers:
                    continue
                assert len(importers) == 1
                importer = importers[0]
                (docs, _, _) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc
                )
                dst_doc = docs[0]

                for page_nb in range(dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Reuse the boxes of the source page.
                        # NOTE(review): cache dropped only for editable
                        # documents -- confirm this is the intent.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]

            # dst_doc stays None when no file of the document was imported
            if dst_doc is not None:
                fix_labels(dst_dsearch, src_doc, dst_doc)
    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        # NOTE(review): stats are printed on the cleanup path, so partial
        # results show up even after a failure.
        print_stats()