def test_beautiful(self):
        # Exercises pairs(), wrap() and syllables() of the en_US hyphenator
        # on the single word 'beautiful'.
        english = Hyphenator('en_US')
        word = 'beautiful'

        # pairs(): the two-part splits expected for the word.
        expected_pairs = [['beau', 'tiful'], [u'beauti', 'ful']]
        self.assertEqual(expected_pairs, english.pairs(word))

        # wrap(): the expected split for each requested width.
        wrap_cases = (
            (6, ['beau-', 'tiful']),
            (7, ['beauti-', 'ful']),
        )
        for width, expected_wrap in wrap_cases:
            self.assertEqual(expected_wrap, english.wrap(word, width))

        # syllables(): the full syllable breakdown.
        self.assertEqual(['beau', 'ti', 'ful'], english.syllables(word))
class hyphenator:
    """Small wrapper around a ``Hyphenator`` bound to one language.

    Every word is passed through ``utils.check_unicode`` before it is
    handed to the wrapped hyphenator.
    """

    def __init__(self, language='it_IT'):
        """Create the wrapped ``Hyphenator`` for *language* (default 'it_IT')."""
        self.h = Hyphenator(language)

    def split_syllables(self, word):
        """Return the result of ``Hyphenator.syllables`` for *word*."""
        syllabify = self.h.syllables
        return syllabify(utils.check_unicode(word))

    def split_word(self, word):
        """Return the result of ``Hyphenator.pairs`` for *word*."""
        pair_up = self.h.pairs
        return pair_up(utils.check_unicode(word))
class HyphenationIntroducer:
    """Randomly insert a hyphen into the space-separated tokens of a text.

    For each token, the possible two-part splits are requested from a
    ``Hyphenator``; when at least one exists and ``flip_coin`` succeeds,
    one split is picked at random and its parts are joined with "-".
    """

    def __init__(self, p_hyphen: float):
        """Store the coin parameter and create a default ``Hyphenator``.

        Args:
            p_hyphen: Value handed to ``flip_coin`` for every hyphenable
                token (presumably the probability of hyphenating it;
                confirm against ``flip_coin``).
        """
        self.p_hyphen = p_hyphen
        self.hyphenator = Hyphenator()

    def get_candidates(self, token: str) -> List[List[str]]:
        """Return the hyphenation pairs for *token*, or ``[]`` on failure.

        This is deliberately best-effort: any ordinary error raised by the
        hyphenator yields an empty candidate list.
        """
        try:
            return self.hyphenator.pairs(token)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # still propagate instead of being silently swallowed.
            return []

    def introduce_hyphens(self, text: str) -> str:
        """Return *text* with some tokens replaced by a hyphenated form.

        The text is split on single spaces and re-joined with single
        spaces, so the original spacing is preserved.
        """
        tokens = text.split(" ")
        for i, token in enumerate(tokens):
            candidates = self.get_candidates(token)
            # The coin is only flipped for tokens that can be hyphenated.
            if candidates and flip_coin(random, self.p_hyphen):
                tokens[i] = "-".join(random.choice(candidates))
        return " ".join(tokens)
Exemple #4
0
    hyphenator = Hyphenator()

    char_error_rates = []
    total_hyphen_edits = 0
    total_tokens = 0
    hyphenable_tokens = 0

    for i, (corrupt, correct) in enumerate(
            zip(read_lines(raw_file), read_lines(clean_file))):
        corrupt_tokens = corrupt.split()
        correct_tokens = correct.split()
        total_tokens += len(correct_tokens)
        for t in correct_tokens:
            try:
                if len(hyphenator.pairs(t)) > 0:
                    hyphenable_tokens += 1
            except IndexError:
                pass
        ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
        print(i + 1, ocr_errors)
        n_char_edits = 0
        n_hyphen_edits = 0
        for erroneous, corrected in ocr_errors:
            char_edits = get_ocr_character_edits(erroneous, corrected)
            char_edits = [
                edit for edit in char_edits
                if edit[0] != " " and edit[1] != " "
            ]
            hyphen_edits = len(
                [edit for edit in char_edits if edit == ("-", "")])
Exemple #5
0
from hyphen import Hyphenator
# Create one hyphenator per language code used below
# (de_DE, en_US, es_ES).
h_de = Hyphenator('de_DE')
h_en = Hyphenator('en_US')
h_es = Hyphenator('es_ES')

# Now hyphenate some words.
# Note: the following examples are written in Python 3.x syntax.
# If you use Python 2.x, you must add the 'u' prefix to the string
# literals, as Hyphenator methods expect unicode strings.

# pairs(): the two-part splits of the word.
print(h_en.pairs('beautiful'))
# Expected output: [['beau', 'tiful'], ['beauti', 'ful']]

# wrap(word, width): presumably splits the word so that the first part,
# including the trailing hyphen, fits in `width` characters (confirm
# against the library documentation).
print(h_en.wrap('beautiful', 6))
# Expected output: ['beau-', 'tiful']

print(h_en.wrap('beautiful', 7))
# Expected output: ['beauti-', 'ful']

# syllables(): the word broken into all of its syllables.
print(h_en.syllables('beautiful'))
# Expected output: ['beau', 'ti', 'ful']

# Fill a text to a width of 40, passing the English hyphenator to
# textwrap2's fill() through its use_hyphenator argument.
from textwrap2 import fill
print(fill('very long text...', width=40, use_hyphenator=h_en))