Beispiel #1
0
def sym_wordcorrect(conf, uncorrected_dir, corrected_dir):
    """Correct every novel in ``uncorrected_dir`` using word-level SymSpell.

    Each novel is read into one string with ``get_novel_string``, passed
    through ``word_correct_text`` and written to
    ``<corrected_dir>/<novel>/<novel>.corrected.txt`` followed by a newline.
    The path of every file written is printed.

    Args:
        conf: Configuration mapping. The dictionary path is read from it under
            the key found at index 1 of the first value returned by
            ``util.get_params(conf)``.
        uncorrected_dir: Directory whose entries are the novels to correct.
        corrected_dir: Directory under which the per-novel output folders are
            created.

    Raises:
        FileNotFoundError: If SymSpell reports that the dictionary could not
            be loaded.
    """
    print("Initialize SymSpell")
    sym_spell = SymSpell()
    param_tuple, _ = util.get_params(conf)
    dictionary_path = conf[param_tuple[1]]
    # NOTE(review): presumably symspellpy, where 0 and 1 are the term and count
    # column indices and a missing file is signalled by returning False rather
    # than raising - confirm. Without this check every novel would be
    # "corrected" against an empty dictionary.
    if sym_spell.load_dictionary(dictionary_path, 0, 1) is False:
        raise FileNotFoundError(
            f'Could not load SymSpell dictionary "{dictionary_path}"')

    # Sort novels, just because; then correct each novel
    for novel in sorted_listdir(uncorrected_dir):
        # macOS drops .DS_Store files into directories; they are not novels.
        if novel == '.DS_Store':
            continue
        novel_str = get_novel_string(novel, uncorrected_dir)
        # Correct individual words using SymSpell
        corrected_novel_str = word_correct_text(novel_str, sym_spell)
        # Create output folder if not exists and write to file
        outfolder = os.path.join(corrected_dir, novel)
        os.makedirs(outfolder, exist_ok=True)
        outpath = os.path.join(outfolder,
                               os.path.basename(novel) + '.corrected.txt')
        print(outpath)
        # NOTE(review): no explicit encoding is given, so the platform default
        # applies - confirm it matches how the pages are read.
        with open(outpath, 'w') as f:
            f.write(corrected_novel_str + "\n")
def get_ocr_pages(ocr_dir, text_elem):
    """Return the page paths of the novel that ``text_elem`` refers to.

    The novel id is taken from the first ``id="..."`` attribute in
    ``text_elem``. The novel directory is the single entry of ``ocr_dir``
    whose name starts with that id.

    Args:
        ocr_dir: Directory containing one entry per novel.
        text_elem: String containing an ``id="..."`` attribute.

    Returns:
        A list of paths ``<ocr_dir>/<novel dir>/<page>``, in the order given
        by ``sorted_listdir``.

    Raises:
        ValueError: If ``text_elem`` contains no ``id="..."`` attribute.
        Exception: If zero or several entries of ``ocr_dir`` start with the id.
    """
    id_match = re.search(r'id="([^"]+)"', text_elem)
    # re.search returns None on no match; fail with a message that names the
    # offending input instead of an AttributeError on ``None.group``.
    if id_match is None:
        raise ValueError(
            f'No id attribute found in text element: {text_elem!r}')
    novel_id = id_match.group(1)
    novel_dirs = [x for x in os.listdir(ocr_dir) if x.startswith(novel_id)]
    if len(novel_dirs) != 1:
        raise Exception(f'Did not find unique novel dir for novel id "{novel_id}"')
    novel_dir = os.path.join(ocr_dir, novel_dirs[0])
    return [os.path.join(novel_dir, page) for page in sorted_listdir(novel_dir)]
Beispiel #3
0
def generate_novels_vrt(novels_dir, corpus_id):
    """Yield, piece by piece, a VRT document covering all novels in a corpus.

    The first item is the opening ``<corpus>`` tag carrying ``corpus_id``.
    Then comes one item per entry of ``novels_dir`` (the ``pages2vrt`` output
    for that entry), and finally the closing tag. Every yielded string ends
    with a newline.
    """
    names = sorted_listdir(novels_dir)
    paths = [os.path.join(novels_dir, name) for name in names]
    yield f'<corpus id="{corpus_id}">\n'
    for path in paths:
        # One complete <text> element per novel.
        yield pages2vrt(path) + '\n'
    yield '</corpus>\n'
Beispiel #4
0
def get_novel_string(novel, uncorrected_dir):
    """Return the pages of ``novel`` as one newline-joined string.

    The page strings are obtained from ``get_novel_pagestrings`` and passed
    through ``fix_hyphens``, joined with newlines, and then the individual
    lines of the result are passed through ``fix_hyphens`` once more.
    """
    novel_path = os.path.join(uncorrected_dir, novel)
    if not os.path.isdir(novel_path):
        # TODO Hack: directory names may lack the page specification, so
        # strip a trailing "-s<digit>" plus up to five more characters.
        novel = re.sub(r'-s\d.{0,5}$', '', novel)
        novel_path = os.path.join(uncorrected_dir, novel)
    page_names = sorted_listdir(novel_path)

    # First pass works on whole pages; newlines inside pages are kept.
    page_texts = fix_hyphens(
        get_novel_pagestrings(page_names, uncorrected_dir, novel))
    joined = '\n'.join(page_texts)

    # Second pass works on the individual lines of the joined text.
    return '\n'.join(fix_hyphens(joined.splitlines()))
Beispiel #5
0
def pages2vrt(pagedir):
    """Render all pages in ``pagedir`` as a single VRT ``<text>`` element.

    The base name of ``pagedir`` is used as the text id. Each page file is
    turned into tokens by ``page2tokens``, and every token becomes one
    tab-separated line holding its ``token``, ``i``, ``line``, ``page`` and
    ``text_id`` values.
    """
    text_id = os.path.basename(pagedir)
    page_names = sorted_listdir(pagedir)
    page_paths = [os.path.join(pagedir, name) for name in page_names]
    # The page number is the digit run following "page_" in the file name.
    page_numbers = [
        int(re.search(r'page_(\d+)', name).group(1)) for name in page_names
    ]
    per_page_tokens = [
        page2tokens(path, number, text_id)
        for path, number in zip(page_paths, page_numbers)
    ]
    columns = ('token', 'i', 'line', 'page', 'text_id')
    rows = [
        '\t'.join(format(tok[col]) for col in columns)
        for tok in flatten_tokenlists(per_page_tokens)
    ]
    body = '\n'.join(rows)
    return f'<text id="{text_id}">\n{body}\n</text>'
Beispiel #6
0
def correct_hard_fraktur_errors(uncorrected_dir, intermediate, corrected_dir):
    """Manually correct harder OCR errors by consulting alternative OCR output.

    Designed for the Tesseract fraktur traineddata. For every novel in
    ``uncorrected_dir`` the text is run through ``alt_ocr_correct`` three
    times in a row, each time against the same novel read from another
    subdirectory of ``intermediate`` (``tess_out_dan``, ``tess_out_frk``,
    ``orig_pages``) with that pass's character pairs. The result is written to
    ``<corrected_dir>/<novel>/<novel>.corrected.txt`` followed by a newline,
    and the output path is printed.

    Args:
        uncorrected_dir: Directory whose entries are the novels to correct.
        intermediate: Directory holding the alternative OCR subdirectories.
        corrected_dir: Directory under which the per-novel output folders are
            created.
    """
    # (subdirectory of ``intermediate``, character pairs for alt_ocr_correct),
    # in the order the passes are applied. Built once, not once per novel.
    correction_passes = (
        ('tess_out_dan', (('o', 'ø'), ('a', 'æ'), ('e', 'æ'), ('J', 'I'),
                          ('t', 'k'), ('o', 'æ'), ('D', 'Ø'))),
        ('tess_out_frk', (('t', 'k'), ('g', 'a'))),
        ('orig_pages', (('J', 'I'),)),
    )
    alt_dirs = [
        os.path.join(intermediate, subdir) for subdir, _ in correction_passes
    ]

    # Sort novels, just because; then correct each novel
    for novel in sorted_listdir(uncorrected_dir):
        if novel == '.DS_Store':  # Hack alert!
            continue
        corrected_novel_str = get_novel_string(novel, uncorrected_dir)
        # Read all alternative versions before starting to correct.
        alt_novel_strs = [get_novel_string(novel, d) for d in alt_dirs]
        for alt_novel_str, (_, pairs) in zip(alt_novel_strs,
                                             correction_passes):
            # Hand over a fresh list each time so the shared table can never
            # be altered by the callee.
            corrected_novel_str = alt_ocr_correct(corrected_novel_str,
                                                  alt_novel_str, list(pairs))

        # Create output folder if not exists and write to file
        outfolder = os.path.join(corrected_dir, novel)
        os.makedirs(outfolder, exist_ok=True)
        outpath = os.path.join(outfolder,
                               os.path.basename(novel) + '.corrected.txt')
        print(outpath)
        # NOTE(review): no explicit encoding is given, so the platform default
        # applies - confirm it matches how the pages are read.
        with open(outpath, 'w') as f:
            f.write(corrected_novel_str + "\n")
Beispiel #7
0
def correct_easy_fraktur_errors(uncorrected_dir, corrected_dir):
    """Manually correct 'safe' and easy OCR errors.

    Designed for the Tesseract fraktur traineddata. In every novel of
    ``uncorrected_dir`` the sequences ``œæ``, ``æœ``, ``œe``, ``eœ`` and
    finally any remaining ``œ`` are replaced by ``æ``, in that order. The
    result is written to ``<corrected_dir>/<novel>/<novel>.corrected.txt``
    followed by a newline, and the output path is printed.

    Args:
        uncorrected_dir: Directory whose entries are the novels to correct.
        corrected_dir: Directory under which the per-novel output folders are
            created.
    """
    # Order matters: the two-character sequences must be collapsed before the
    # lone 'œ' is replaced, otherwise they would turn into 'ææ' / 'æe' / 'eæ'.
    misreadings = ('œæ', 'æœ', 'œe', 'eœ', 'œ')

    # Sort novels, just because; then correct each novel
    for novel in sorted_listdir(uncorrected_dir):
        # macOS drops .DS_Store files into directories; they are not novels.
        if novel == '.DS_Store':
            continue
        corrected_novel_str = get_novel_string(novel, uncorrected_dir)
        for misreading in misreadings:
            corrected_novel_str = corrected_novel_str.replace(misreading, 'æ')

        # Create output folder if not exists and write to file
        outfolder = os.path.join(corrected_dir, novel)
        os.makedirs(outfolder, exist_ok=True)
        outpath = os.path.join(outfolder,
                               os.path.basename(novel) + '.corrected.txt')
        print(outpath)
        # NOTE(review): no explicit encoding is given, so the platform default
        # applies - confirm it matches how the pages are read.
        with open(outpath, 'w') as f:
            f.write(corrected_novel_str + "\n")