Beispiel #1
0
def main(argv):
    """Generate the screenshots registered in SCREENSHOTS.

    Arguments:
        argv -- full argument vector; argv[0] is ignored and the remaining
            items are the names of the screenshots to generate. When no
            name is given, OUT_DIRECTORY is removed first and every
            screenshot is generated.
    """
    args = argv[1:]

    if not args:
        # full run: start again from an empty output directory
        rm_rf(OUT_DIRECTORY)
    try:
        os.mkdir(OUT_DIRECTORY)
    except OSError:
        # Best effort: on a partial run the directory is expected to exist
        # already. Only OSError is ignored, so KeyboardInterrupt, SystemExit
        # and unrelated errors are no longer silently swallowed.
        pass

    for (sc_name, sc_method) in SCREENSHOTS.items():
        if args and sc_name not in args:
            continue

        # one fresh application instance per screenshot
        paperwork_inst = paperwork.PaperworkInstance()
        paperwork_inst.start()
        try:
            # weird workaround for annoying bug
            GLib.idle_add(
                paperwork_inst.main_window.doclist._on_scrollbar_value_changed)
            paperwork_inst.wait()
            time.sleep(1)
            paperwork_inst.wait()

            sc_method(paperwork_inst)
            paperwork_inst.wait()
        finally:
            # always stop the instance, even if taking the screenshot failed
            paperwork_inst.stop()
Beispiel #2
0
def main():
    """Replay the import of every document into a throw-away work directory.

    The source work directory is read from the Paperwork configuration.
    Each file of each source document (documents sorted by docid, files
    sorted by name, thumbnails skipped) is imported into a temporary
    document directory with its own temporary index, and the boxes of the
    source pages are copied onto the imported pages.

    label_guess() is called after the first imported file of a document,
    upd_index() after each following one, and fix_labels() once per
    document for which at least one file was imported.

    Once the temporary directories exist, they are removed and
    print_stats() is called whether the simulation succeeds or fails.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    try:
        # Created inside the try block so that the temporary directories
        # are removed even if the destination index cannot be set up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
        dst_dsearch.reload_index()

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))

            # current_doc: destination document passed to the importers
            # (None until a first file has been imported).
            # dst_doc: last imported document, stays None if nothing in this
            # source document could be imported.
            current_doc = None
            dst_doc = None
            for filename in sorted(os.listdir(src_doc.path)):
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if not importers:
                    # not an importable file
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                (docs, _page, _is_new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                if dst_doc.can_edit:
                    # reuse the boxes of the source pages on the imported
                    # pages
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]

            if dst_doc is not None:
                fix_labels(dst_dsearch, src_doc, dst_doc)

    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
Beispiel #3
0
def main():
    """Replay the import of every document into a throw-away work directory.

    The source work directory is read from the Paperwork configuration.
    Each file of each source document (documents sorted by docid, files
    sorted, thumbnails and the "labels" file skipped) is imported into a
    temporary document directory with its own temporary index, and the
    boxes of the source pages are copied onto the imported pages.

    label_guess() and fix_labels() are called after the first imported file
    of a document, upd_index() after each following one.

    Once the temporary directories exist, they are removed and
    print_stats() is called whether the simulation succeeds or fails.
    """
    # enable_logging()
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir, use_default_index_client=False)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print(
        "Destination directories : {} | {}".format(dst_doc_dir, dst_index_dir)
    )

    try:
        # Created inside the try block so that the temporary directories
        # are removed even if the destination index cannot be set up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir,
                                          use_default_index_client=False)
        dst_dsearch.reload_index()

        print("Testing ...")

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        print("Number of documents: {}".format(len(documents)))

        for src_doc in documents:
            print("Document [{}] | [{}]".format(src_doc.docid, src_doc.path))

            # destination document the files are imported into; None until
            # a first file has been imported
            current_doc = None
            for filepath in sorted(g_fs.listdir(src_doc.path)):
                print("File: {}".format(filepath))
                filename = g_fs.basename(filepath)
                if "thumb" in filename or filename == "labels":
                    continue
                importers = docimport.get_possible_importers(
                    [filepath], current_doc=current_doc
                )
                if not importers:
                    # not an importable file
                    continue
                print("Importer(s): {}".format(", ".join([
                    str(x) for x in importers
                ])))
                assert(len(importers) == 1)
                importer = importers[0]
                result = importer.import_doc(
                    [filepath], dst_dsearch, current_doc
                )
                print("Import result: {}".format(str(result.get())))
                if current_doc is None:
                    if not result.new_docs:
                        print("Nothing imported ?!")
                        continue
                    dst_doc = result.new_docs[0]
                else:
                    dst_doc = current_doc

                if dst_doc.can_edit:
                    # reuse the boxes of the source pages on the imported
                    # pages
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc
            print("")

    finally:
        print("---")
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
def run_simulation(src_dsearch, min_yes, csvwriter):
    """Replay the import of every source document with a given min_yes.

    Each file of each document of src_dsearch (documents sorted by docid,
    files sorted by name, thumbnails skipped) is imported into a temporary
    document directory with its own temporary index. label_guess() and
    fix_labels() are called after the first imported file of a document,
    upd_index() after each following one.

    Arguments:
        src_dsearch -- source index; its `docs` are the documents replayed
        min_yes -- value assigned to the `min_yes` attribute of the label
            guesser of the temporary index
        csvwriter -- object providing writerow(); it receives one row
            [min_yes, nb_documents, perfect], written while holding g_lock

    The CSV row is written, the temporary directories are removed and
    print_stats() is called even if the simulation fails.
    """
    # handed to fix_labels() and print_stats(); presumably fix_labels() is
    # what updates the counters -- TODO confirm
    stats = {
        'nb_documents': 0,
        'correct_guess': 0,
        'missing_guess': 0,
        'wrong_guess': 0,
        'nb_src_labels': 0,
        'nb_dst_labels': 0,
        'perfect': 0,
    }

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    try:
        # Created inside the try block so that the temporary directories
        # are removed even if the destination index cannot be set up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
        dst_dsearch.reload_index()

        dst_dsearch.label_guesser.min_yes = min_yes

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            # destination document the files are imported into; None until
            # a first file has been imported
            current_doc = None
            for filename in sorted(os.listdir(src_doc.path)):
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if not importers:
                    # not an importable file
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                (docs, _page, _is_new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                if dst_doc.can_edit:
                    # reuse the boxes of the source pages on the imported
                    # pages
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(stats, dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        try:
            with g_lock:
                csvwriter.writerow([
                    min_yes,
                    stats['nb_documents'],
                    stats['perfect'],
                ])
        finally:
            # a failing CSV write must not leave the temporary
            # directories behind
            rm_rf(dst_doc_dir)
            rm_rf(dst_index_dir)
        print_stats(stats)
def run_simulation(
    src_dsearch,
    min_yes,
    csvwriter
):
    """Replay the import of every source document with a given min_yes.

    Each file of each document of src_dsearch (documents sorted by docid,
    files sorted by name, thumbnails skipped) is imported into a temporary
    document directory with its own temporary index. label_guess() and
    fix_labels() are called after the first imported file of a document,
    upd_index() after each following one.

    Arguments:
        src_dsearch -- source index; its `docs` are the documents replayed
        min_yes -- value assigned to the `min_yes` attribute of the label
            guesser of the temporary index
        csvwriter -- object providing writerow(); it receives one row
            [min_yes, nb_documents, perfect], written while holding g_lock

    The CSV row is written, the temporary directories are removed and
    print_stats() is called even if the simulation fails.
    """
    # handed to fix_labels() and print_stats(); presumably fix_labels() is
    # what updates the counters -- TODO confirm
    stats = {
        'nb_documents': 0,
        'correct_guess': 0,
        'missing_guess': 0,
        'wrong_guess': 0,
        'nb_src_labels': 0,
        'nb_dst_labels': 0,
        'perfect': 0,
    }

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print(
        "Destination directories : {} | {}".format(dst_doc_dir, dst_index_dir)
    )

    try:
        # Created inside the try block so that the temporary directories
        # are removed even if the destination index cannot be set up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
        dst_dsearch.reload_index()

        dst_dsearch.label_guesser.min_yes = min_yes

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            # destination document the files are imported into; None until
            # a first file has been imported
            current_doc = None
            for filename in sorted(os.listdir(src_doc.path)):
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc
                )
                if not importers:
                    # not an importable file
                    continue
                assert(len(importers) == 1)
                importer = importers[0]
                (docs, _page, _is_new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc
                )
                dst_doc = docs[0]

                if dst_doc.can_edit:
                    # reuse the boxes of the source pages on the imported
                    # pages
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(stats, dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        try:
            with g_lock:
                csvwriter.writerow([
                    min_yes,
                    stats['nb_documents'], stats['perfect'],
                ])
        finally:
            # a failing CSV write must not leave the temporary
            # directories behind
            rm_rf(dst_doc_dir)
            rm_rf(dst_index_dir)
        print_stats(stats)
Beispiel #6
0
def main():
    """Replay the import of every document into a throw-away work directory.

    The source work directory is read from the Paperwork configuration.
    Each file of each source document (documents sorted by docid, files
    sorted, thumbnails and the "labels" file skipped) is imported into a
    temporary document directory with its own temporary index, and the
    boxes of the source pages are copied onto the imported pages.

    label_guess() and fix_labels() are called after the first imported file
    of a document, upd_index() after each following one.

    Once the temporary directories exist, they are removed and
    print_stats() is called whether the simulation succeeds or fails.
    """
    # enable_logging()
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir, use_default_index_client=False)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    try:
        # Created inside the try block so that the temporary directories
        # are removed even if the destination index cannot be set up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir,
                                          indexdir=dst_index_dir,
                                          use_default_index_client=False)
        dst_dsearch.reload_index()

        print("Testing ...")

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        print("Number of documents: {}".format(len(documents)))

        for src_doc in documents:
            print("Document [{}] | [{}]".format(src_doc.docid, src_doc.path))

            # destination document the files are imported into; None until
            # a first file has been imported
            current_doc = None
            for filepath in sorted(g_fs.listdir(src_doc.path)):
                print("File: {}".format(filepath))
                filename = g_fs.basename(filepath)
                if "thumb" in filename or filename == "labels":
                    continue
                importers = docimport.get_possible_importers(
                    [filepath], current_doc=current_doc)
                if not importers:
                    # not an importable file
                    continue
                print("Importer(s): {}".format(", ".join(
                    [str(x) for x in importers])))
                assert (len(importers) == 1)
                importer = importers[0]
                result = importer.import_doc([filepath], dst_dsearch,
                                             current_doc)
                print("Import result: {}".format(str(result.get())))
                if current_doc is None:
                    if not result.new_docs:
                        print("Nothing imported ?!")
                        continue
                    dst_doc = result.new_docs[0]
                else:
                    dst_doc = current_doc

                if dst_doc.can_edit:
                    # reuse the boxes of the source pages on the imported
                    # pages
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc
            print("")

    finally:
        print("---")
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
Beispiel #7
0
def main():
    """Replay the import of every document into a throw-away work directory.

    The source work directory is read from the Paperwork configuration.
    Each file of each source document (documents sorted by docid, files
    sorted by name, thumbnails skipped) is imported into a temporary
    document directory with its own temporary index, and the boxes of the
    source pages are copied onto the imported pages.

    label_guess() is called after the first imported file of a document,
    upd_index() after each following one, and fix_labels() once per
    document for which at least one file was imported.

    Once the temporary directories exist, they are removed and
    print_stats() is called whether the simulation succeeds or fails.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print(
        "Destination directories : {} | {}".format(dst_doc_dir, dst_index_dir)
    )

    try:
        # Created inside the try block so that the temporary directories
        # are removed even if the destination index cannot be set up.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
        dst_dsearch.reload_index()

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))

            # current_doc: destination document passed to the importers
            # (None until a first file has been imported).
            # dst_doc: last imported document, stays None if nothing in this
            # source document could be imported.
            current_doc = None
            dst_doc = None
            for filename in sorted(os.listdir(src_doc.path)):
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc
                )
                if not importers:
                    # not an importable file
                    continue
                assert(len(importers) == 1)
                importer = importers[0]
                (docs, _page, _is_new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc
                )
                dst_doc = docs[0]

                if dst_doc.can_edit:
                    # reuse the boxes of the source pages on the imported
                    # pages
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]

            if dst_doc is not None:
                fix_labels(dst_dsearch, src_doc, dst_doc)

    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()