def init_docsearch(self):
    paperwork.setup_test_env()

    new_docs = set()
    upd_docs = set()
    missing_docs = set()

    config = PaperworkConfig()
    config.read()

    # start from scratch
    dsearch = docsearch.DocSearch(config['workdir'].value)
    dsearch.destroy_index()
    dsearch = docsearch.DocSearch(config['workdir'].value)
    dsearch.reload_index()

    doc_examiner = dsearch.get_doc_examiner()
    doc_examiner.examine_rootdir(
        lambda x: new_docs.add(x),
        lambda x: upd_docs.add(x),
        lambda x: missing_docs.add(x),
        lambda x: None,
    )
    assert len(upd_docs) <= 0
    assert len(missing_docs) <= 0

    index_updater = dsearch.get_index_updater()
    for doc in new_docs:
        index_updater.add_doc(doc)
    index_updater.commit()
    return dsearch
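# Usage sketch (a hypothetical test method; everything below besides
# init_docsearch() and DocSearch.docs is an assumption): init_docsearch()
# destroys and then rebuilds the index, so the returned DocSearch is
# guaranteed to reflect exactly what is on disk:
#
#     def test_index_rebuild(self):
#         dsearch = self.init_docsearch()
#         print("{} documents indexed".format(len(dsearch.docs)))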
def main():
    if len(sys.argv) < 3:
        print("Syntax:")
        print(" {} [min_yeses] [out_csv_file]".format(sys.argv[0]))
        sys.exit(1)

    # ast.literal_eval() instead of eval(): the value comes straight from
    # the command line, so only accept Python literals (needs 'import ast')
    min_yeses = ast.literal_eval(sys.argv[1])
    out_csv_file = sys.argv[2]

    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    nb_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(processes=nb_threads)

    with open(out_csv_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for min_yes in min_yeses:
            pool.apply_async(_run_simulation, (
                src_dsearch, min_yes, csvwriter,
            ))
        pool.close()
        pool.join()
    print("All done!")
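# Invocation sketch (hypothetical script name): min_yeses is parsed as a
# Python literal, typically a list of thresholds; each value is simulated
# on its own worker thread and appends one row to the CSV file:
#
#     python3 simulate_min_yes.py "[0.5, 0.6, 0.7]" results.csv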
def main():
    global g_lang
    global g_dictionnary
    global g_tknzr
    global g_nb_total_pages
    global g_start_time

    print("Will use {} for OCR".format(OCR_TOOL.get_name()))

    print("Initializing dictionary ...")
    g_lang = "eng"
    if len(sys.argv) > 1:
        g_lang = "fra"
    g_dictionnary = enchant.request_dict(g_lang[:2])
    try:
        g_tknzr = enchant.tokenize.get_tokenizer(g_lang[:2])
    except enchant.tokenize.TokenizerNotFoundError as exc:
        print("Warning: Falling back to default tokenizer ({})".format(exc))
        g_tknzr = enchant.tokenize.get_tokenizer()
    print("Done")

    print("Loading documents list ...")
    pconfig = config.PaperworkConfig()
    pconfig.read()
    work_dir = pconfig.settings['workdir'].value
    dsearch = docsearch.DocSearch(work_dir)
    dsearch.reload_index()
    print("Documents loaded")
    print("")

    print("Initializing workers ...")
    manager = WorkerManager()
    manager.start()
    factory = JobFactoryImageProcessing()
    print("Done")

    g_start_time = datetime.datetime.now()

    try:
        print("Queueing jobs ...")
        nb_docs = 0
        nb_pages = 0
        for doc in dsearch.docs:
            if not doc.can_edit:
                # probably not an OCR-ized doc
                continue
            nb_docs += 1
            for page in doc.pages:
                if not page.can_edit:
                    # probably not an OCR-ized page
                    continue
                nb_pages += 1
                g_nb_total_pages += 1
                for algos in ALGORITHMS:
                    job = factory.make(page, algos)
                    manager.schedule(job)
        print("Queued jobs : {} docs | {} pages".format(nb_docs, nb_pages))
        manager.wait_for_all()
    finally:
        manager.stop()
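# Invocation sketch (hypothetical script name): the language defaults to
# English; per the argv check above, passing any extra argument switches
# the dictionary and tokenizer to French:
#
#     python3 ocr_quality.py         # g_lang = "eng"
#     python3 ocr_quality.py fra     # any argument selects "fra"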
def main():
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
    dst_dsearch.reload_index()

    try:
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = os.listdir(src_doc.path)
            files.sort()

            current_doc = None
            dst_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                assert len(importers) == 1
                importer = importers[0]
                (docs, page, new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc)

                dst_doc = docs[0]
                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]

            if dst_doc is not None:
                fix_labels(dst_dsearch, src_doc, dst_doc)
    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
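# Note on the boxes-copy loop above: it assumes the imported document ends
# up with the same page count as the source, since dst_doc's page index is
# used to look up src_doc.pages. A defensive variant (a sketch, not what
# the code above does) would clamp the range:
#
#     for page_nb in range(min(src_doc.nb_pages, dst_doc.nb_pages)):
#         dst_doc.pages[page_nb].boxes = src_doc.pages[page_nb].boxes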
def main():
    pconfig = config.PaperworkConfig()
    pconfig.read()

    print("Opening docs (%s)" % pconfig.settings['workdir'].value)
    print("====================")

    dsearch = docsearch.DocSearch(pconfig.settings['workdir'].value)
    dsearch.reload_index()

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0

    total_word_len = 0
    max_word_len = 0

    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    all_labels = set([l.name for l in dsearch.label_list])
    label_keys = ['exact', 'global', 'positive', 'negative']  # for the order
    total_label_accuracy = {
        'exact': 0,
        'global': 0,
        'positive': 0,
        'negative': 0,
    }
    total_labels = {
        'exact': 0,
        'global': 0,
        'positive': 0,
        'negative': 0,
    }

    for doc in sorted(dsearch.docs, key=lambda x: x.docid):
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()
        if doc.nb_pages > max_pages:
            max_pages = doc.nb_pages

        # Keyword stats
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1
                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)

        # Label prediction stats
        doc_labels = {l.name for l in doc.labels}
        predicted_labels = {
            l.name for (l, scores) in dsearch.guess_labels(doc)
        }

        accurate = {
            'exact': 1,
            'global': 0,
            'negative': 0,
            'positive': 0,
        }
        nb_labels = {
            'exact': 1,
            'global': len(all_labels),
            'positive': len(doc_labels),
            'negative': len(all_labels) - len(doc_labels),
        }
        for key in label_keys:
            total_labels[key] += nb_labels[key]

        missing = []
        for label in all_labels:
            if not ((label in doc_labels) ^ (label in predicted_labels)):
                # prediction and reality agree on this label
                accurate['global'] += 1
                total_label_accuracy['global'] += 1
                if label in doc_labels:
                    accurate['positive'] += 1
                    total_label_accuracy['positive'] += 1
                else:
                    accurate['negative'] += 1
                    total_label_accuracy['negative'] += 1
            else:
                if label in predicted_labels:
                    missing.append(label)
                accurate['exact'] = 0
        if accurate['exact']:
            total_label_accuracy['exact'] += 1

        for key in label_keys:
            total = nb_labels[key]
            if total == 0:
                continue
            sys.stdout.write(
                "\n\t- label prediction accuracy (%s): %d%%"
                % (key, (100 * accurate[key] / total)))
        sys.stdout.write("\n")
        for missing_label in missing:
            sys.stdout.write("Missing: {}\n".format(missing_label))

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words length: %d" % total_word_len)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f"
          % (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f"
          % (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f"
          % (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f"
          % (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f"
          % (float(total_nb_unique_words_per_doc) / float(nb_docs)))
    for key in label_keys:
        total = total_labels[key]
        if total == 0:
            # avoid a division by zero when a category never occurred
            continue
        value = total_label_accuracy[key]
        print("Average accuracy of label prediction (%s): %f%%"
              % (key, (100 * value / total)))
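# Worked example for the four accuracy keys above, assuming 10 known
# labels, 3 carried by the document, and a guesser predicting 2 of those 3
# plus 1 label the document does not have (hypothetical numbers):
#
#     positive = 2/3    correct among the document's actual labels
#     negative = 6/7    correctly absent among the 7 absent labels
#     global   = 8/10   (2 + 6) agreements over all known labels
#     exact    = 0      the predicted set is not exactly the actual set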
def run_simulation(src_dsearch, min_yes, csvwriter):
    stats = {
        'nb_documents': 0,
        'correct_guess': 0,
        'missing_guess': 0,
        'wrong_guess': 0,
        'nb_src_labels': 0,
        'nb_dst_labels': 0,
        'perfect': 0,
    }

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
    dst_dsearch.reload_index()
    dst_dsearch.label_guesser.min_yes = min_yes

    try:
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)

        for src_doc in documents:
            files = os.listdir(src_doc.path)
            files.sort()

            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                assert len(importers) == 1
                importer = importers[0]
                (docs, page, new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc)

                dst_doc = docs[0]
                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(stats, dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        # serialize access to the csv.writer shared between worker threads
        with g_lock:
            csvwriter.writerow([
                min_yes,
                stats['nb_documents'],
                stats['perfect'],
            ])
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats(stats)
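# run_simulation() is meant to be driven from a thread pool (see the
# main() above that schedules one simulation per min_yes value): the
# csv.writer is shared between threads, so g_lock must serialize the
# writerow() calls. Minimal setup sketch, assuming g_lock is a plain
# module-level lock:
#
#     import threading
#     g_lock = threading.Lock()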
def main():
    # enable_logging()
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir, use_default_index_client=False)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir,
                                      use_default_index_client=False)
    dst_dsearch.reload_index()

    print("Testing ...")
    try:
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)
        print("Number of documents: {}".format(len(documents)))

        for src_doc in documents:
            print("Document [{}] | [{}]".format(src_doc.docid, src_doc.path))
            files = [x for x in g_fs.listdir(src_doc.path)]
            files.sort()

            current_doc = None
            for filepath in files:
                print("File: {}".format(filepath))
                filename = g_fs.basename(filepath)
                if "thumb" in filename or "labels" == filename:
                    continue
                importers = docimport.get_possible_importers(
                    [filepath], current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                print("Importer(s): {}".format(
                    ", ".join([str(x) for x in importers])))
                assert len(importers) == 1
                importer = importers[0]
                result = importer.import_doc(
                    [filepath], dst_dsearch, current_doc)
                print("Import result: {}".format(str(result.get())))

                if current_doc is None:
                    if result.new_docs == []:
                        print("Nothing imported ?!")
                        continue
                    dst_doc = result.new_docs[0]
                else:
                    dst_doc = current_doc

                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)
                current_doc = dst_doc
            print("")
    finally:
        print("---")
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
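# API note: the older mains above unpack
# (docs, page, new) = importer.import_doc(fileuri, ...), while this one
# passes a list of URIs and reads result.get() / result.new_docs. When
# porting one of the older loops to the newer import API used here, the
# call becomes (sketch):
#
#     result = importer.import_doc([filepath], dst_dsearch, current_doc)
#     dst_doc = result.new_docs[0]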