Code example #1
File: lex.py Project: amitdo/nidaba
def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be left untouched
    and correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.TEIFacsimile): TEIFacsimile object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        A TEIFacsimile object containing the spelling corrections.
    """
    text_tokens = [x[-1] for x in facsimile.segments]
    if filter_punctuation:
        text_tokens = [regex.sub(r'[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for segment in facsimile.segments:
        key = alg.sanitize(segment[-1])
        if filter_punctuation:
            key = regex.sub(r'[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(segment[-2], [(sugg, 100 - 10 *
                                  alg.edit_distance(key, sugg))])
    return facsimile
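
What the filter_punctuation switch does to each segment, shown with the
standard library re module for a self-contained illustration (the example
above uses the third-party regex package, which accepts the same pattern);
the tokens below are made up:

import re

tokens = [u'Lorem,', u'ipsum.', u'(dolor)']
cleaned = [re.sub(r'[^\w]', '', t, flags=re.UNICODE) for t in tokens]
print(cleaned)   # punctuation stripped: Lorem, ipsum, dolor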
Code example #2
File: lex.py Project: amitdo/nidaba
def spellcheck(tokens, dictionary, deletion_dictionary):
    """
    Performs a spell check on a sequence of tokens.

    The spelling of each sequence of characters is compared against a
    dictionary containing deletions of valid words and a dictionary of correct
    words.

    Args:
        tokens (iterable): An iterable returning sequences of unicode
                           characters.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.

    Returns:
        A dictionary mapping each character sequence that is not contained
        verbatim in the dictionary to a list of suggestions sorted from lowest
        to highest edit distance. Tokens that are not recognized as valid
        words but have no spelling suggestions either will also be contained
        in the result dictionary.
    """
    suggestions = {}
    for tok in tokens:
        tok = alg.sanitize(tok)
        if alg.mmap_bin_search(tok, dictionary,
                               entryparser_fn=alg.key_for_single_word):
            continue
        if tok in suggestions:
            continue
        ret = alg.mapped_sym_suggest(tok, deletion_dictionary, dictionary, 1)
        suggestions[tok] = alg.suggestions(tok, set.union(*ret.itervalues()))
    return suggestions
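
spellcheck above leans on a precomputed deletion dictionary. The
dependency-free sketch below illustrates that general technique (symmetric
deletions), not nidaba's alg module: every dictionary word is indexed under
all of its single-character deletions, so looking up a token together with
its own deletions yields candidate corrections roughly one edit away.

def deletions(word):
    """All strings obtainable by deleting exactly one character."""
    return {word[:i] + word[i + 1:] for i in range(len(word))}

def build_deletion_index(words):
    """Map every word and each of its single deletions back to the word."""
    index = {}
    for word in words:
        for key in deletions(word) | {word}:
            index.setdefault(key, set()).add(word)
    return index

def suggest(token, index):
    """Candidates whose deletion sets intersect with the token's."""
    candidates = set()
    for key in deletions(token) | {token}:
        candidates |= index.get(key, set())
    return candidates

index = build_deletion_index([u'word', u'ward', u'sword'])
print(suggest(u'wrd', index))   # finds u'word' and u'ward', one edit away each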
Code example #3
File: stats.py Project: OpenPhilology/nidaba
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier of the language whose dictionary is
                            looked up in the nidaba configuration.
        divert (bool): Switch selecting output diversion. If set, the
                       lexicality ratio is returned as a dictionary instead of
                       being written to the output document.

    Returns:
        (unicode, unicode): Storage tuple of the output document, or a
                            dictionary containing the ratio if ``divert`` is
                            set.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(*nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    for seg_id, segment in tei.segments.iteritems():
        tok = alg.sanitize(''.join(x['grapheme']
                                   for x in segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(tok, dictionary, entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(err_cnt / float(cnt)))
        return output_path
    else:
        return {'edit_ratio': err_cnt / float(cnt), 'ground_truth': '', 'doc': doc}
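
The metric computed above is simply the share of tokens that cannot be found
in the dictionary. A dependency-free illustration of that ratio, with a plain
set standing in for nidaba's mmap-backed binary search and made-up example
tokens:

import re
import unicodedata

def lexicality_error_ratio(tokens, dictionary):
    """Fraction of non-empty, punctuation-stripped tokens missing from ``dictionary``."""
    cnt = err_cnt = 0
    for tok in tokens:
        tok = unicodedata.normalize('NFD', tok)
        tok = re.sub(r'[^\w]', '', tok, flags=re.UNICODE)
        if not tok:
            continue
        cnt += 1
        if tok not in dictionary:
            err_cnt += 1
    return err_cnt / float(cnt) if cnt else 0.0

print(lexicality_error_ratio([u'the', u'qu1ck', u'fox'], {u'the', u'fox'}))  # 0.333...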
Code example #4
def tei_spellcheck(facsimile,
                   dictionary,
                   deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be left untouched
    and correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.OCRRecord): OCR record object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        An OCRRecord object containing the spelling corrections.
    """
    text_tokens = set(''.join(y['grapheme']
                              for y in x.get('content').itervalues())
                      for x in facsimile.segments.itervalues())
    text_tokens.discard('')  # discard: no error if there is no empty segment
    text_tokens = list(text_tokens)
    if filter_punctuation:
        text_tokens = [regex.sub(r'[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for seg_id, segment in facsimile.segments.iteritems():
        key = alg.sanitize(''.join(x['grapheme']
                                   for x in segment['content'].itervalues()))
        if filter_punctuation:
            key = regex.sub(r'[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(
                seg_id, [{
                    'alternative': sugg,
                    'confidence': 100 - 10 * alg.edit_distance(key, sugg)
                }])
    return facsimile
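
The confidence attached to each alternative above is 100 - 10 * the edit
distance between the suggestion and the original token. A textbook
Levenshtein implementation, standing in for nidaba's alg.edit_distance,
makes the scoring concrete:

def edit_distance(a, b):
    """Plain dynamic-programming Levenshtein distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

for sugg in (u'word', u'ward', u'sword'):
    print('%s -> %d' % (sugg, 100 - 10 * edit_distance(u'wrod', sugg)))
# word -> 80, ward -> 80, sword -> 70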
Code example #5
File: lex.py Project: amitdo/nidaba
def cleanlines(path, encoding=u'utf-8', normalization=u'NFD'):
    """
    Read in lines from a file and return them as a sanitized list.
    Non-unique lines will be repeated.

    Args:
        path (unicode): Absolute path of the file to be read
        encoding (unicode): Encoding to use for decoding the file
        normalization (unicode): Normalization format to use

    Returns:
        list: List of lines containing the sanitized output, i.e. normalized
              unicode objects.
    """
    words = []
    with codecs.open(path, u'r', encoding=encoding) as lines:
        for line in lines:
            words.append(alg.sanitize(line, normalization=normalization))
    return words
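
alg.sanitize itself is not part of this listing; assuming it roughly amounts
to stripping surrounding whitespace and applying the requested unicode
normalization, a dependency-free version of the same routine looks like this:

import codecs
import unicodedata

def cleanlines_plain(path, encoding=u'utf-8', normalization=u'NFD'):
    """Read lines, strip surrounding whitespace and normalize each one."""
    with codecs.open(path, u'r', encoding=encoding) as lines:
        return [unicodedata.normalize(normalization, line.strip())
                for line in lines]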
Code example #6
File: lex.py Project: amitdo/nidaba
def cleanwords(path, encoding=u'utf-8', normalization=u'NFD'):
    """
    Read in every word from a file, as separated by lines and spaces.
    Non-unique words will be repeated as they are read in. Only words divided
    by a standard space are detected.

    Args:
        path (unicode): Absolute path of the file to be read
        encoding (unicode): Encoding to use for decoding the file
        normalization (unicode): Normalization format to use

    Returns:
        list: List of words containing the sanitized output, i.e. normalized
              unicode objects.
    """
    words = []
    with codecs.open(path, u'r', encoding=encoding) as lines:
        for line in lines:
            for seg in line.split(u' '):
                clean = alg.sanitize(seg, normalization=normalization)
                if clean != u'':
                    words.append(clean)
    return words
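
Since spellcheck above only expects an iterable of unicode tokens, a word
list produced this way can be fed to it directly. A hedged usage sketch: the
module is assumed to be importable as nidaba.lex to match the file name shown
above, and all paths are placeholders.

from nidaba import lex

# All paths below are placeholders, not files shipped with nidaba.
tokens = lex.cleanwords(u'/data/ocr_output.txt')
suggestions = lex.spellcheck(tokens,
                             dictionary=u'/data/dicts/english.txt',
                             deletion_dictionary=u'/data/dicts/english_deletions.txt')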
Code example #7
File: stats.py Project: amitdo/nidaba
def cleanup(text):
    """
    Removes lines containing only whitespace and normalizes to NFD.
    """
    text = sanitize(text)
    return '\n'.join([s for s in text.splitlines() if len(s.strip())])
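
Assuming, as above, that sanitize essentially performs NFD normalization, a
dependency-free equivalent with a small made-up input:

import unicodedata

def cleanup_plain(text):
    """NFD-normalize and drop lines that contain only whitespace."""
    text = unicodedata.normalize('NFD', text)
    return '\n'.join(s for s in text.splitlines() if s.strip())

print(cleanup_plain(u'first line\n   \n\nsecond line'))  # keeps the two non-blank lines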