コード例 #1
0
def test_word_segmentation(workspace):
    """Check that word-level recognition produces consistent Word elements.

    Runs ``CalamariRecognize`` on the ``OCR-D-GT-SEG-LINE`` file group with
    ``textequiv_level`` set to ``"word"`` and inspects the first output page:

    * a TextLine whose text contains "December" must exist,
    * that line must hold at least two Word elements whose texts, joined by
      single spaces, equal the line text,
    * the page must not contain any Glyph element.

    Args:
        workspace: Workspace handed to the processor; this test also uses its
            ``directory`` attribute and ``save_mets()`` method.
    """
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            "checkpoint": CHECKPOINT,
            "textequiv_level": "word",   # Note that we're going down to word level here
        }
    ).process()
    workspace.save_mets()

    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    tree = etree.parse(page1)

    # The result should contain a TextLine that contains the text "December".
    # Assert on the match list *before* indexing: a missing line then shows up
    # as an assertion failure rather than an IndexError, and we avoid
    # truth-testing an element (which only reports whether it has children).
    matching_lines = tree.xpath(
        ".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]",
        namespaces=NSMAP,
    )
    assert len(matching_lines) > 0, "no TextLine containing 'December' found"
    line = matching_lines[0]

    # The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
    words = line.xpath(".//pc:Word", namespaces=NSMAP)
    assert len(words) >= 2
    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
    assert words_text == line_text

    # For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
    assert len(glyphs) == 0
コード例 #2
0
def test_recognize(workspace):
    """Run recognition with a checkpoint directory and check the page text.

    Processes the ``OCR-D-GT-SEG-WORD-GLYPH`` file group into
    ``OCR-D-OCR-CALAMARI``, saves the METS, and expects the first output page
    to exist and to contain the word "verſchuldeten".
    """
    parameter = {"checkpoint_dir": CHECKPOINT_DIR}
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter=parameter,
    ).process()
    workspace.save_mets()

    result_page = os.path.join(
        workspace.directory,
        "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
    )
    assert os.path.exists(result_page)
    assertFileContains(result_page, "verſchuldeten")
コード例 #3
0
def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(
        workspace, caplog):
    """Expect many "Using raw image" log messages from a recognition run.

    Captures log records at WARNING level and above while processing the
    ``OCR-D-GT-SEG-WORD-GLYPH`` file group, then counts the records whose
    message contains "Using raw image"; more than ten are required.
    """
    caplog.set_level(logging.WARNING)
    parameter = {'checkpoint_dir': CHECKPOINT_DIR}
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
        output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
        parameter=parameter,
    ).process()

    # record_tuples entries are (logger name, level, message).
    raw_image_messages = [
        message
        for _logger_name, _level, message in caplog.record_tuples
        if "Using raw image" in message
    ]
    # The message is expected once per line, hence well over ten occurrences.
    assert len(raw_image_messages) > 10
コード例 #4
0
def test_recognize(workspace):
    """Run recognition with a single checkpoint and check the page text.

    Processes the ``OCR-D-GT-SEG-LINE`` file group into
    ``OCR-D-OCR-CALAMARI``, saves the METS, and expects the first output page
    to exist and to contain the word "verſchuldeten".
    """
    parameter = {"checkpoint": CHECKPOINT}
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter=parameter,
    ).process()
    workspace.save_mets()

    result_page = os.path.join(
        workspace.directory,
        "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
    )
    assert os.path.exists(result_page)

    # Read the whole page first, then check for the expected word.
    with open(result_page, "r", encoding="utf-8") as page_file:
        page_text = page_file.read()
    assert "verſchuldeten" in page_text
コード例 #5
0
def test_glyphs(workspace):
    """Check that glyph-level recognition writes many Glyph elements.

    Processes the ``OCR-D-GT-SEG-LINE`` file group with ``textequiv_level``
    set to ``"glyph"``, saves the METS, and expects the first output page to
    exist and to contain at least 100 Glyph elements.
    """
    parameter = {
        "checkpoint": CHECKPOINT,
        # Going all the way down to glyph level for this test.
        "textequiv_level": "glyph",
    }
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter=parameter,
    ).process()
    workspace.save_mets()

    result_page = os.path.join(
        workspace.directory,
        "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
    )
    assert os.path.exists(result_page)
    page_tree = etree.parse(result_page)

    # The result should contain a lot of glyphs.
    glyph_count = len(page_tree.xpath("//pc:Glyph", namespaces=NSMAP))
    assert glyph_count >= 100