def test_beautiful(self):
    """Check the en_US results of pairs, wrap and syllables for 'beautiful'."""
    word = 'beautiful'
    h_en = Hyphenator('en_US')

    # Expected result of pairs() for the sample word.
    expected_pairs = [['beau', 'tiful'], [u'beauti', 'ful']]
    self.assertEqual(expected_pairs, h_en.pairs(word))

    # Expected result of wrap() for each width argument.
    wrap_cases = (
        (6, ['beau-', 'tiful']),
        (7, ['beauti-', 'ful']),
    )
    for width, expected in wrap_cases:
        self.assertEqual(expected, h_en.wrap(word, width))

    # Expected result of syllables() for the sample word.
    self.assertEqual(['beau', 'ti', 'ful'], h_en.syllables(word))
class hyphenator:
    """Wrapper around ``Hyphenator`` that runs words through ``utils.check_unicode``."""

    def __init__(self, language='it_IT'):
        """Create the wrapped ``Hyphenator`` for *language* (default ``'it_IT'``)."""
        self.h = Hyphenator(language)

    def split_syllables(self, word):
        """Return ``syllables()`` of the wrapped hyphenator for the checked *word*."""
        return self.h.syllables(utils.check_unicode(word))

    def split_word(self, word):
        """Return ``pairs()`` of the wrapped hyphenator for the checked *word*."""
        return self.h.pairs(utils.check_unicode(word))
class HyphenationIntroducer:
    """Randomly insert hyphens into the space-separated tokens of a text.

    For every token, the hyphenator is asked for candidate splits. When at
    least one exists and ``flip_coin(random, p_hyphen)`` is truthy, one
    candidate is picked at random and its parts are joined with ``"-"``.
    """

    def __init__(self, p_hyphen: float):
        """Store the coin-flip parameter and create a default ``Hyphenator``.

        Args:
            p_hyphen: Value handed to ``flip_coin`` for every hyphenable
                token (presumably the probability of hyphenating it --
                confirm against ``flip_coin``).
        """
        self.p_hyphen = p_hyphen
        self.hyphenator = Hyphenator()

    def get_candidates(self, token: str) -> List[List[str]]:
        """Return the candidate splits of *token*, or ``[]`` on failure.

        Args:
            token: The word to split.

        Returns:
            Whatever ``self.hyphenator.pairs(token)`` returns; each candidate
            is later joined with ``"-"``. An empty list is returned when the
            hyphenator raises an ordinary exception.
        """
        try:
            return self.hyphenator.pairs(token)
        except Exception:
            # Best effort: a token the hyphenator cannot handle simply has no
            # candidates. KeyboardInterrupt and SystemExit are deliberately
            # not caught, so the process can still be interrupted.
            return []

    def introduce_hyphens(self, text: str) -> str:
        """Return *text* with hyphens randomly inserted into some tokens.

        Args:
            text: Input string; it is split on single spaces, so runs of
                spaces are preserved when the tokens are joined again.

        Returns:
            The text re-joined with single spaces after the selected tokens
            were replaced by a hyphenated variant.
        """
        tokens = text.split(" ")
        for i, token in enumerate(tokens):
            candidates = self.get_candidates(token)
            # The coin is only flipped for tokens that can be hyphenated.
            if candidates and flip_coin(random, self.p_hyphen):
                tokens[i] = "-".join(random.choice(candidates))
        return " ".join(tokens)
# Script fragment: walks two line sources in parallel and gathers token and
# edit statistics. NOTE(review): the fragment appears truncated -- the
# per-line counters below are initialised but not yet updated in the visible
# part, so the loop body presumably continues.
hyphenator = Hyphenator()
# Accumulators over all processed lines.
char_error_rates = []
total_hyphen_edits = 0
total_tokens = 0
hyphenable_tokens = 0
# Pair up lines from raw_file and clean_file; zip stops at the shorter input.
for i, (corrupt, correct) in enumerate(
        zip(read_lines(raw_file), read_lines(clean_file))):
    corrupt_tokens = corrupt.split()
    correct_tokens = correct.split()
    total_tokens += len(correct_tokens)
    # Count the tokens of the correct line that have at least one split.
    for t in correct_tokens:
        try:
            if len(hyphenator.pairs(t)) > 0:
                hyphenable_tokens += 1
        except IndexError:
            # pairs() raised for this token; it is not counted as hyphenable.
            pass
    ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
    # Print the 1-based line number together with the errors found on it.
    print(i + 1, ocr_errors)
    # Per-line counters, reset for every line pair.
    n_char_edits = 0
    n_hyphen_edits = 0
    for erroneous, corrected in ocr_errors:
        char_edits = get_ocr_character_edits(erroneous, corrected)
        # Drop every edit whose first or second element is a single space.
        char_edits = [
            edit for edit in char_edits if edit[0] != " " and edit[1] != " "
        ]
        # Number of remaining edits equal to ("-", ""); presumably these are
        # hyphen deletions -- confirm against get_ocr_character_edits.
        hyphen_edits = len(
            [edit for edit in char_edits if edit == ("-", "")])
from hyphen import Hyphenator

# Build one hyphenator per language.
h_de = Hyphenator('de_DE')
h_en = Hyphenator('en_US')
h_es = Hyphenator('es_ES')

# Hyphenate a sample word with the English hyphenator.
# Note: these examples use Python 3.x syntax. Under Python 2.x, add the 'u'
# prefix to the string literals, as Hyphenator methods expect unicode strings.
word = 'beautiful'

print(h_en.pairs(word))  # [['beau', 'tiful'], ['beauti', 'ful']]

# Prints ['beau-', 'tiful'] for width 6, then ['beauti-', 'ful'] for width 7.
for width in (6, 7):
    print(h_en.wrap(word, width))

print(h_en.syllables(word))  # ['beau', 'ti', 'ful']

# Pass the hyphenator to textwrap2's fill() via use_hyphenator.
from textwrap2 import fill
print(fill('very long text...', width=40, use_hyphenator=h_en))