Exemple #1
0
 def test_gen_queryimages(self, tmpdir):
     """Build a NotesQuery over every note of a freshly generated test collection.

     Smoke test only: the query object is printed and nothing is asserted.
     """
     col_dir = tmpdir.mkdir("collection")
     test_col = gen_test_collection(col_dir)
     ocr = OCR(col=test_col)
     # Only note ids are needed; db.list() keeps just the first column of each
     # row, so select the id column instead of fetching every note field.
     all_note_ids = ocr.col.db.list("select id from notes")
     q_images = NotesQuery(col=test_col, note_ids=all_note_ids)
     print(q_images)
Exemple #2
0
 def test_query_noteids(self, tmpdir):
     """Query two note ids and check that two notes with those ids come back."""
     collection = gen_test_collection(tmpdir.mkdir("collection"))
     ocr = OCR(col=collection)
     requested_ids = [1601851621708, 1601851571572]
     query = NotesQuery(col=collection, note_ids=requested_ids)
     assert len(query.notes) == 2
     # Every returned note must be one of the notes that was asked for.
     for found in query.notes:
         assert found.note_id in requested_ids
Exemple #3
0
 def test_clean_ocr_text(self):
     """Check OCR.clean_ocr_text on text with repeated colons and blank lines."""
     # Input mixes runs of 2-4 colons (with and without surrounding spaces)
     # and runs of consecutive newlines.
     raw_text = (
         "this is some text: with a result\n\n\n"
         "This is some double colon :: with result\n\n"
         "without spaces::new word\n"
         "one space:: new word\n\n\n\n"
         "one space before ::new word\n"
         "triple ::: new word\n\n\n\n\n"
         "quadruple ::::newword"
     )
     # Expected: each colon run reduced to one colon, each newline run to one newline.
     wanted = (
         "this is some text: with a result\n"
         "This is some double colon : with result\n"
         "without spaces:new word\n"
         "one space: new word\n"
         "one space before :new word\n"
         "triple : new word\n"
         "quadruple :newword"
     )
     assert OCR.clean_ocr_text(raw_text) == wanted
Exemple #4
0
 def test_add_ocr_field_then_remove_text_new_field(self, tmpdir):
     """Run OCR with the "new_field" output location on two notes, then remove it.

     Smoke test: passes as long as neither call raises.
     """
     collection = gen_test_collection(tmpdir.mkdir("collection"))
     engine = OCR(col=collection, text_output_location="new_field")
     target_ids = [1601851571572, 1601851621708]
     engine.run_ocr_on_notes(note_ids=target_ids)
     engine.remove_ocr_on_notes(note_ids=target_ids)
    def test_unbatched_single_threaded(self):
        """Time un-batched OCR over ``self.IMG_PTHS`` with one thread.

        Returns:
            The second element of the pair returned by ``timeit``.
        """
        console.print("Starting un-batched single threaded")

        engine = OCR(
            col=None,
            progress=None,
            languages=["eng"],
            num_threads=1,
            use_batching=False,
        )
        _result, elapsed = timeit(engine._ocr_unbatched_process, self.IMG_PTHS)

        # Report the OMP_THREAD_LIMIT environment variable when it is set.
        thread_limit = os.environ.get("OMP_THREAD_LIMIT")
        if thread_limit is None:
            console.print("No thread limit found.")
        else:
            console.print(f"OMP_THREAD_LIMIT = {thread_limit}")
        return elapsed
    def test_batched_multi_threaded(self):
        """Time batched OCR over ``self.batched_txts`` with four threads.

        Returns:
            The second element of the pair returned by ``timeit``.
        """
        console.print("Starting batched multi threaded")

        engine = OCR(
            col=None,
            progress=None,
            languages=["eng"],
            num_threads=4,
            use_batching=True,
        )
        _result, elapsed = timeit(engine._ocr_batch_process, self.batched_txts)

        # Report the OMP_THREAD_LIMIT environment variable when it is set.
        thread_limit = os.environ.get("OMP_THREAD_LIMIT")
        if thread_limit is None:
            console.print("No thread limit found.")
        else:
            console.print(f"OMP_THREAD_LIMIT = {thread_limit}")
        return elapsed
Exemple #7
0
 def test_run_ocr_on_notes_unbatched_multithreaded(self, tmpdir):
     """Run un-batched OCR with four threads on two notes (passes if nothing raises)."""
     collection = gen_test_collection(tmpdir.mkdir("collection"))
     engine = OCR(col=collection, use_batching=False, num_threads=4)
     target_ids = [1601851571572, 1601851621708]
     engine.run_ocr_on_notes(note_ids=target_ids)
Exemple #8
0
 def test_run_ocr_on_collection(self, tmpdir):
     """Run OCR over every note of a freshly generated test collection.

     Smoke test: passes as long as ``run_ocr_on_query`` does not raise.
     """
     col_dir = tmpdir.mkdir("collection")
     test_col = gen_test_collection(col_dir)
     ocr = OCR(col=test_col)
     # Only note ids are needed; db.list() keeps just the first column of each
     # row, so select the id column instead of fetching every note field.
     all_note_ids = ocr.col.db.list("select id from notes")
     ocr.run_ocr_on_query(note_ids=all_note_ids)
Exemple #9
0
 def test_ocr_img_without_lang(self, img_pth, expected):
     """OCR one image without passing languages and compare to the expected text.

     Both the cleaned OCR output and ``expected`` are stripped before comparing.
     """
     image_path = str(img_pth.absolute())
     raw_text = OCR._ocr_img(image_path, num_threads=1).strip()
     assert OCR.clean_ocr_text(raw_text).strip() == expected.strip()
Exemple #10
0
class TestOCR:
    """Tests for ``OCR`` and ``NotesQuery`` using annotated images and a generated collection.

    The class body runs once at definition time: it collects the annotated
    image files and their ``.txt`` annotations (paired by sorted order) and
    points pytesseract at the tesseract binary reported by ``OCR``.
    """

    all_img_files = list(Path(TESTDATA_DIR, "annotated_imgs").glob("*"))
    img_pths = sorted([f for f in all_img_files if f.suffix in [".png", ".jpg", ".tiff", ".tif", ".jpeg"]])
    annot_pths = sorted([f for f in all_img_files if f.suffix == ".txt"])
    annot_txts = [f.read_text(encoding="utf-8") for f in annot_pths]
    # Images and annotations are zipped together below, so the counts must match.
    assert len(img_pths) == len(annot_pths)
    tesseract_cmd = OCR.path_to_tesseract()
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def test_collection_ok(self, tmpdir):
        """The generated test collection passes its own ``basicCheck``."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        assert test_col.basicCheck()

    @pytest.mark.parametrize(["img_pth", "expected"], list(zip(img_pths, annot_txts)))
    def test_ocr_img_with_lang(self, img_pth, expected):
        """OCR one image with ``languages=["eng"]`` and compare to its annotation."""
        img = str(img_pth.absolute())
        ocr_result = OCR._ocr_img(img, num_threads=1, languages=["eng"])
        cleaned_result = OCR.clean_ocr_text(ocr_result).strip()
        expected = expected.strip()
        assert cleaned_result == expected

    @pytest.mark.parametrize(["img_pth", "expected"], list(zip(img_pths, annot_txts)))
    def test_ocr_img_without_lang(self, img_pth, expected):
        """OCR one image without passing languages and compare to its annotation."""
        img = str(img_pth.absolute())
        ocr_result = OCR._ocr_img(img, num_threads=1).strip()
        cleaned_result = OCR.clean_ocr_text(ocr_result).strip()
        expected = expected.strip()
        assert cleaned_result == expected

    def test_gen_queryimages(self, tmpdir):
        """Build a NotesQuery over every note; smoke test with no assertions."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col)
        # Only note ids are needed; db.list() keeps just the first column of
        # each row, so select the id column instead of every note field.
        all_note_ids = ocr.col.db.list("select id from notes")
        q_images = NotesQuery(col=test_col, note_ids=all_note_ids)
        print(q_images)

    def test_query_noteids(self, tmpdir):
        """Query two note ids and check that two notes with those ids come back."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col)
        note_ids = [1601851621708, 1601851571572]
        q_images = NotesQuery(col=test_col, note_ids=note_ids)
        assert len(q_images.notes) == 2
        for note in q_images.notes:
            assert note.note_id in note_ids

    def test_run_ocr_on_collection(self, tmpdir):
        """Run OCR over every note in the collection (passes if nothing raises)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col)
        # Only note ids are needed; db.list() keeps just the first column of
        # each row, so select the id column instead of every note field.
        all_note_ids = ocr.col.db.list("select id from notes")
        ocr.run_ocr_on_query(note_ids=all_note_ids)

    def test_run_ocr_on_notes_batched_multithreaded(self, tmpdir):
        """Run batched OCR with four threads on two notes (passes if nothing raises)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=True, num_threads=4)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_run_ocr_on_notes_batched_single_threaded(self, tmpdir):
        """Run batched OCR with one thread on two notes (passes if nothing raises)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=True, num_threads=1)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_run_ocr_on_notes_unbatched_multithreaded(self, tmpdir):
        """Run un-batched OCR with four threads on two notes (passes if nothing raises)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=False, num_threads=4)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_run_ocr_on_notes_unbatched_singlethreaded(self, tmpdir):
        """Run un-batched OCR with one thread on two notes (passes if nothing raises)."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, use_batching=False, num_threads=1)
        ocr.run_ocr_on_notes(note_ids=[1601851571572, 1601851621708])

    def test_add_ocr_field_then_remove_text_tooltip(self, tmpdir):
        """Add OCR text with the "tooltip" output location, then remove it again."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, text_output_location="tooltip")
        note_ids = [1601851571572, 1601851621708]
        ocr.run_ocr_on_notes(note_ids=note_ids)
        ocr.remove_ocr_on_notes(note_ids=note_ids)

    def test_add_ocr_field_then_remove_text_new_field(self, tmpdir):
        """Add OCR text with the "new_field" output location, then remove it again."""
        col_dir = tmpdir.mkdir("collection")
        test_col = gen_test_collection(col_dir)
        ocr = OCR(col=test_col, text_output_location="new_field")
        note_ids = [1601851571572, 1601851621708]
        ocr.run_ocr_on_notes(note_ids=note_ids)
        ocr.remove_ocr_on_notes(note_ids=note_ids)

    def test_clean_ocr_text(self):
        """Check ``OCR.clean_ocr_text`` on text with repeated colons and blank lines."""
        input_str = (
            "this is some text: with a result\n\n\nThis is some double colon :: with result"
            "\n\nwithout spaces::new word\none space:: new word\n\n\n\none space before ::new word\n"
            "triple ::: new word\n\n\n\n\nquadruple ::::newword"
        )
        expected_output = (
            "this is some text: with a result\nThis is some double colon : with result\n"
            "without spaces:new word\none space: new word\none space before :new word\n"
            "triple : new word\nquadruple :newword"
        )
        output = OCR.clean_ocr_text(input_str)
        assert output == expected_output
Exemple #11
0
import logging
from pathlib import Path

from anki import Collection

from anki_ocr.ocr import SCRIPT_DIR, OCR

if __name__ == '__main__':
    # Manual driver: runs OCR over every note of the test profile's collection.
    logging_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=logging_format, level=logging.INFO)
    # Not to be run inside Anki
    PROFILE_HOME = Path(SCRIPT_DIR.parent, "tests/User 1")
    cpath = PROFILE_HOME / "collection.anki2"

    collection = Collection(str(cpath), log=True)  # Collection is locked from here on

    ocr = OCR(col=collection, text_output_location="new_field")
    # Only note ids are needed; db.list() keeps just the first column of each
    # row, so select the id column instead of fetching every note field.
    all_note_ids = ocr.col.db.list("select id from notes")
    ocr.run_ocr_on_query(note_ids=all_note_ids)
    # NOTE(review): saving/closing and OCR removal were left disabled by the author:
    # collection.close(save=True)
    # ocr.remove_ocr_on_notes(note_ids_c)
class TestPerformance:
    """Timing runs of OCR in batched/un-batched and single/multi-threaded modes.

    The class body runs once at definition time: it points pytesseract at the
    tesseract binary reported by ``OCR``, collects the ``*.png`` files under
    ``IMGS_DIR``, writes their paths to ``imgs.txt`` and generates the batch
    text files used by the batched runs.
    """

    test_img_pths = list(Path(TESTDATA_DIR, "annotated_imgs").glob("*"))
    tesseract_cmd = OCR.path_to_tesseract()
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    IMG_PTHS = [img_pth.absolute() for img_pth in IMGS_DIR.glob("*.png")]
    NUM_IMGS = len(IMG_PTHS)
    TXT_PATH = Path(IMGS_DIR, "imgs.txt")
    # One absolute image path per line.
    TXT_PATH.write_text("\n".join([str(i) for i in IMG_PTHS]))
    BATCH_SIZE = 10
    console.log(f"BATCH_SIZE : {BATCH_SIZE}")
    console.log(f"Number of images = {len(IMG_PTHS)}")
    batched_txts, batched_txts_dir = gen_batched_txts(img_pths=IMG_PTHS,
                                                      batch_size=BATCH_SIZE)
    console.log(
        f"Generated {len(batched_txts)} batches of max {BATCH_SIZE} images")

    @staticmethod
    def _report_thread_limit():
        """Print the OMP_THREAD_LIMIT environment variable, or a notice if it is unset."""
        thread_limit = os.environ.get("OMP_THREAD_LIMIT")
        if thread_limit is None:
            console.print("No thread limit found.")
        else:
            console.print(f"OMP_THREAD_LIMIT = {thread_limit}")

    def _time_ocr(self, label, num_threads, use_batching):
        """Time one OCR configuration and return the measured time.

        Args:
            label: Text appended to "Starting " in the announcement line.
            num_threads: Value passed to ``OCR(num_threads=...)``.
            use_batching: When true, times ``_ocr_batch_process`` over
                ``self.batched_txts``; otherwise times
                ``_ocr_unbatched_process`` over ``self.IMG_PTHS``.

        Returns:
            The second element of the pair returned by ``timeit``.
        """
        console.print(f"Starting {label}")

        ocr = OCR(col=None,
                  progress=None,
                  languages=["eng"],
                  num_threads=num_threads,
                  use_batching=use_batching)
        if use_batching:
            _, time_taken = timeit(ocr._ocr_batch_process, self.batched_txts)
        else:
            _, time_taken = timeit(ocr._ocr_unbatched_process, self.IMG_PTHS)
        # Reported after the run, matching the original ordering of the output.
        self._report_thread_limit()
        return time_taken

    def test_batched_single_threaded(self):
        """Time batched OCR with one thread."""
        return self._time_ocr("batched single threaded",
                              num_threads=1, use_batching=True)

    def test_batched_multi_threaded(self):
        """Time batched OCR with four threads."""
        return self._time_ocr("batched multi threaded",
                              num_threads=4, use_batching=True)

    def test_unbatched_single_threaded(self):
        """Time un-batched OCR with one thread."""
        return self._time_ocr("un-batched single threaded",
                              num_threads=1, use_batching=False)

    def test_unbatched_multi_threaded(self):
        """Time un-batched OCR with four threads."""
        return self._time_ocr("un-batched multi threaded",
                              num_threads=4, use_batching=False)