def hyphenate(dom, _lang):
    """Hyphenate a post.

    Walks every ``p``, ``li`` and ``span`` element of ``dom`` that is not a
    direct child of ``pre`` and passes it to ``insert_hyphens``, unless the
    element looks like code or math markup.

    :param dom: parsed document; must provide ``xpath()``.
    :param _lang: site language code used to pick the pyphen locale.
    :return: the same ``dom`` object (modified in place by
        ``insert_hyphens``).
    """
    # circular import prevention
    from .nikola import LEGAL_VALUES

    if pyphen is None:
        # Nothing can be hyphenated without pyphen; report it as an optional
        # missing requirement and leave the document untouched.
        utils.req_missing(["pyphen"], "hyphenate texts", optional=True)
        return dom

    lang = LEGAL_VALUES["PYPHEN_LOCALES"].get(_lang, pyphen.language_fallback(_lang))
    if lang is None:
        # If pyphen does not support a language, we ignore it quietly.
        return dom

    try:
        hyphenator = pyphen.Pyphen(lang=lang)
    except KeyError:
        LOGGER.error("Cannot find hyphenation dictoniaries for {0} (from {1}).".format(lang, _lang))
        LOGGER.error("Pyphen cannot be installed to ~/.local (pip install --user).")
        return dom

    # Child tags whose presence excludes the parent element from hyphenation.
    excluded_child_tags = ["kbd", "code", "samp", "mark", "math", "data", "ruby", "svg"]

    for tag in ("p", "li", "span"):
        for node in dom.xpath("//%s[not(parent::pre)]" % tag):
            children = node.getchildren()
            if children:
                # Skip when any direct child is an excluded tag or a math span.
                skip_node = any(
                    child.tag in excluded_child_tags
                    or (child.tag == "span" and "math" in child.get("class", []))
                    for child in children
                )
            else:
                # Leaf element: skip only when it is itself marked as math.
                skip_node = "math" in node.get("class", [])
            if not skip_node:
                insert_hyphens(node, hyphenator)

    return dom
def hyphenate(dom, _lang):
    """Hyphenate a post.

    Walks every ``p``, ``li`` and ``span`` element of ``dom`` that is not a
    direct child of ``pre`` and passes it to ``insert_hyphens``, unless the
    element looks like code or math markup.

    The document is returned untouched when pyphen is not available, when no
    locale can be resolved for ``_lang``, or when pyphen has no dictionary
    for the resolved locale (the latter is logged as an error).

    :param dom: parsed document; must provide ``xpath()``.
    :param _lang: site language code used to pick the pyphen locale.
    :return: the same ``dom`` object (modified in place by
        ``insert_hyphens``).
    """
    # circular import prevention
    from .nikola import LEGAL_VALUES

    # Check for pyphen *before* touching any of its attributes: evaluating
    # pyphen.language_fallback() first would raise AttributeError on None.
    if pyphen is None:
        return dom

    lang = LEGAL_VALUES['PYPHEN_LOCALES'].get(_lang, pyphen.language_fallback(_lang))
    if lang is None:
        # If pyphen does not support a language, we ignore it quietly.
        return dom

    # If pyphen does exist, we tell the user when configuring the site.
    try:
        hyphenator = pyphen.Pyphen(lang=lang)
    except KeyError:
        LOGGER.error("Cannot find hyphenation dictoniaries for {0} (from {1}).".format(lang, _lang))
        LOGGER.error("Pyphen cannot be installed to ~/.local (pip install --user).")
        # Without a hyphenator there is nothing to do; returning here avoids
        # using an unbound ``hyphenator`` in the loop below.
        return dom

    skippable_nodes = ['kbd', 'code', 'samp', 'mark', 'math', 'data', 'ruby', 'svg']
    for tag in ('p', 'li', 'span'):
        for node in dom.xpath("//%s[not(parent::pre)]" % tag):
            skip_node = False
            children = node.getchildren()
            if children:
                for child in children:
                    if child.tag in skippable_nodes or (child.tag == 'span' and 'math' in child.get('class', [])):
                        skip_node = True
                        break
            elif 'math' in node.get('class', []):
                skip_node = True
            if not skip_node:
                insert_hyphens(node, hyphenator)
    return dom
def test_fallback():
    """Test the language fallback algorithm."""
    # (requested language tag, expected dictionary name), checked in order.
    expectations = (
        ('en', 'en'),
        ('en_US', 'en_US'),
        ('en_FR', 'en'),
        ('en-Latn-US', 'en_Latn_US'),
        ('en-Cyrl-US', 'en'),
        ('fr-Latn-FR', 'fr'),
        ('en-US_variant1-x', 'en_US'),
    )
    for requested, expected in expectations:
        assert pyphen.language_fallback(requested) == expected
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: object read for ``white_space``, ``font_size``, ``hyphens``,
               ``lang``, ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for no limit.
    ``line_width``: not referenced in the body of this function.

    The value actually returned is whatever ``first_line_metrics`` returns.

    """
    # NOTE: despite its name, ``text_wrap`` is true for the two white-space
    # values listed here, and a true value *removes* the width limit below.
    text_wrap = style.white_space in ('pre', 'nowrap')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if text_wrap:
        max_width = None
    elif max_width is not None:
        # In some cases (shrink-to-fit result being the preferred width)
        # this value is coming from Pango itself,
        # but floating point errors have accumulated:
        #   width2 = (width + X) - X   # in some cases, width2 < width
        # Increase the value a bit to compensate and not introduce
        # an unexpected line break. The 1e-9 value comes from PEP 485.
        max_width *= 1 + 1e-9

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        # Rough guess of how many characters are needed to fill the line.
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text.
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_width <= max_width:
                # The next word fits in the first line, keep the layout
                # (+ 1 accounts for one byte after the word -- presumably
                # the space that followed it; TODO confirm)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse,
                    style)
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    # ``lang`` stays falsy when the style has no language set.
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        # Only hyphenate when the unused space exceeds the limit zone, or
        # when the line already overflows.
        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout = create_layout(
                            first_line_text, style, hinting, max_width)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                # Candidate break points are the soft hyphens of the word,
                # tried from the last one to the first one.
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                # Pyphen instances are cached per (lang, left, right, total).
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, hinting, max_width)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept the first candidate that stays on one line and
                    # fits, or the last candidate if it stays on one line.
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # Also skip a soft hyphen sitting at the break point.
                        # NOTE(review): indexes ``text`` with a character
                        # count; confirm it cannot reach len(text).
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    # (uses the values left by the last loop iteration)
                    hyphenated = True
                    layout = create_layout(
                        hyphenated_first_line_text, style, hinting, None)
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout = create_layout(
            hyphenated_first_line_text, style, hinting, None)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(text, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        # The first line of the temporary layout is discarded; only the
        # start of its second line is needed.
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_line_text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style,
        hyphenated, style.hyphenate_character)
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: object read for ``font_size``, ``hyphens``, ``lang``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for no limit.
    ``line_width``: not referenced in the body of this function.

    The value actually returned is whatever ``first_line_metrics`` returns.

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width += style.font_size * 0.2

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        # Rough guess of how many characters are needed to fill the line.
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    # Everything up to the first space of the remaining text.
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space’s advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    # ``lang`` stays falsy when the style has no language set.
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars

    hyphenated = False

    # Automatic hyphenation possible and next word is long enough
    if hyphens not in ('none', 'manual') and lang and len(next_word) >= total:
        first_line_width, _height = get_size(first_line)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # The next word does not fit, try hyphenation
            # Pyphen instances are cached per (lang, left, right, total).
            dictionary_key = (lang, left, right, total)
            dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
            for first_word_part, _ in dictionary.iterate(next_word):
                new_first_line = (
                    first_part + first_word_part + style.hyphenate_character)
                temp_layout = create_layout(
                    new_first_line, style, hinting, max_width)
                temp_lines = temp_layout.iter_lines()
                temp_first_line = next(temp_lines, None)
                temp_second_line = next(temp_lines, None)

                if (temp_second_line is None and space >= 0) or space < 0:
                    hyphenated = True
                    # TODO: find why there's no need to .encode
                    # NOTE(review): this is a character count while the
                    # docstring describes resume_at as UTF-8 bytes; left
                    # untouched pending the TODO above.
                    resume_at = len(first_part + first_word_part)
                    layout = temp_layout
                    first_line = temp_first_line
                    second_line = temp_second_line
                    temp_first_line_width, _height = get_size(temp_first_line)
                    if temp_first_line_width <= max_width:
                        break

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        if hyphenated:
            # Is it really OK to remove hyphenation for word-break ?
            # Remove exactly the hyphenate-character suffix appended in
            # step #4. str.rstrip() must not be used here: it treats its
            # argument as a *set* of characters, so it could eat trailing
            # characters of the word itself (and the whole string when the
            # hyphenate character is empty).
            hyphenate_character = style.hyphenate_character
            if hyphenate_character and new_first_line.endswith(
                    hyphenate_character):
                new_first_line = new_first_line[:-len(hyphenate_character)]
            if second_line is not None:
                second_line_index = second_line.start_index
                second_part = utf8_slice(text, slice(second_line_index, None))
                new_first_line += second_part
            hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(new_first_line, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        temp_first_line = next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        # NOTE(review): len(new_first_line) is a character count but the
        # value is used as a UTF-8 byte index below -- confirm for
        # non-ASCII text.
        temp_second_line_index = (
            len(new_first_line) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_part = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_part, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
def split_first_line(text, style, context, max_width, justification_spacing):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: mapping read for ``white_space``, ``font_size``, ``hyphens``,
               ``lang``, ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``context``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for no limit.
    ``justification_spacing``: forwarded unchanged to ``create_layout``.

    The value actually returned is whatever ``first_line_metrics`` returns.

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        # Wrapping is disabled: lay the text out without a width limit.
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf') and
            style['font_size']):
        # Rough guess of how many characters are needed to fill the line.
        expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, max_width, justification_spacing)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text.
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 accounts for one byte after the word -- presumably
                # the space that followed it; TODO confirm)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse,
                    style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                # Second line is none
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style['hyphens']
    # ``lang`` stays falsy when the style has no language set.
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style['hyphenate_limit_zone'].unit == '%':
            limit_zone = max_width * style['hyphenate_limit_zone'].value / 100.
        else:
            limit_zone = style['hyphenate_limit_zone'].value

        # Only hyphenate when the unused space exceeds the limit zone, or
        # when the line already overflows.
        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (first_line_text.rsplit(
                            u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                # Candidate break points are the soft hyphens of the word,
                # tried from the last one to the first one.
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)
                ]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes
                ]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                # Pyphen instances are cached per (lang, left, right, total).
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(lang=lang,
                                               left=left,
                                               right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)
                ]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (new_first_line_text +
                                                  style['hyphenate_character'])
                    new_layout = create_layout(hyphenated_first_line_text,
                                               style, context, max_width,
                                               justification_spacing)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept the first candidate that stays on one line and
                    # fits, or the last candidate if it stays on one line.
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # Also skip a soft hyphen sitting at the break point.
                        # NOTE(review): indexes ``text`` with a character
                        # count; confirm it cannot reach len(text).
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    # (uses the values left by the last loop iteration)
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(layout.layout,
                                                 units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (first_line_text +
                                      style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(layout.layout,
                                     units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        temp_lines = layout.iter_lines()
        # The first line is discarded here; only the start of the second
        # line is needed.
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (len(text.encode('utf-8'))
                                  if temp_second_line is None else
                                  temp_second_line.start_index)
        # TODO: WRAP_CHAR is said to "wrap lines at character boundaries", but
        # it doesn't. Looks like it tries to split at word boundaries and then
        # at character boundaries if there's no enough space for a full word,
        # just as WRAP_WORD_CHAR does. That's why we have to split this text
        # twice. Find why. It may be related to the problem described in the
        # link given in step #3.
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = (first_line.length
                     if second_line is None else second_line.start_index)

    # A resume position at or past the end of the text means nothing is left.
    if resume_at is not None and resume_at >= len(text.encode('utf-8')):
        resume_at = None
    return first_line_metrics(first_line, text, layout, resume_at,
                              space_collapse, style, hyphenated,
                              style['hyphenate_character'])
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import string, nltk, pyphen, glob, os
import numpy as np

# Fetch the NLTK 'punkt' resource at import time.
nltk.download('punkt')
# NOTE: the return value of language_fallback() is discarded here.
pyphen.language_fallback('en')
# Shared English hyphenation dictionary.
WordDic = pyphen.Pyphen(lang='en')


def saveNPZ(Array, OutputFile):
    """Save ``Array`` to ``OutputFile`` as a compressed ``.npz`` archive.

    The array is stored under the key ``array1``.
    """
    np.savez_compressed(OutputFile, array1=Array)


def loadNPZ(InputFile):
    """Return the array stored under ``array1`` in the archive ``InputFile``."""
    return np.load(InputFile)['array1']


def saveListTxt(Array, OutputFile):
    """Write every item of ``Array`` to ``OutputFile``, one per line."""
    with open(OutputFile, 'w') as f:
        for item in Array:
            f.write('%s\n' % item)


def loadListTxt(InputFile):
    """Return the lines of ``InputFile`` as a list, without line endings.

    The list is returned to the caller; without the ``return`` the function
    would read the file and always yield ``None``.
    """
    with open(InputFile, 'r') as f:
        return f.read().splitlines()
import pyphen
import csv

# NOTE: the return value of language_fallback() is discarded here.
pyphen.language_fallback('it_IT')
engine = pyphen.Pyphen(lang='it_IT')

syllables = set()        # distinct syllables of 1 to 3 characters
syllableCount = dict()   # syllable -> number of occurrences
syllablesLen = 0         # number of distinct syllables, set after the loop
uniqueWords = set()      # distinct words read from the dictionary file

with open('dicts/it.csv') as dictionary:
    for line in dictionary:
        # Strip the line ending *before* hyphenating: a trailing newline
        # would otherwise be stored in uniqueWords and be counted as a
        # character of the word by the hyphenator.
        word = line.rstrip('\n')
        if not word:
            # Blank line: nothing to hyphenate.
            continue
        uniqueWords.add(word)
        currentSylls = engine.inserted(word).split('-')
        for s in currentSylls:
            if 1 <= len(s) <= 3:
                syllables.add(s)
                # First occurrence starts at 1, later ones increment.
                syllableCount[s] = syllableCount.get(s, 0) + 1

syllablesLen = len(syllables)

minOccurences = min(syllableCount.values())
avg = sum(syllableCount.values()) / len(syllableCount)
maxOccurences = max(syllableCount.values())
import argparse import logging import sys import pyphen import nltk from termcolor import colored pyphen.language_fallback("en_US") logger = logging.getLogger() logger.setLevel(logging.INFO) console_out = logging.StreamHandler(sys.stdout) console_out.setLevel(logging.DEBUG) logger.addHandler(console_out) def parse_arguments(): """ Simple argument parser for the command line :return: The text to be edited """ parser = argparse.ArgumentParser(description="Receive text to be edited") parser.add_argument("text", metavar="input text", type=str) args = parser.parse_args() return args.text def clean_input(text): """ Text sanitization function
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: object whose ``font_size``, ``hyphens``, ``lang``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap`` attributes
               are read.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for "no limit".
    ``line_width``: not used by this function.

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width *= 1.0001

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space’s advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout
        # (+ 1 skips the space that followed the word)
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars

    hyphenated = False

    # Automatic hyphenation possible and next word is long enough
    if hyphens not in ('none', 'manual') and lang and len(next_word) >= total:
        first_line_width, _height = get_size(first_line)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # The next word does not fit, try hyphenation
            # Pyphen dictionaries are cached per (lang, limits) key
            dictionary_key = (lang, left, right, total)
            dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
            for first_word_part, _ in dictionary.iterate(next_word):
                new_first_line = (
                    first_part + first_word_part + style.hyphenate_character)
                temp_layout = create_layout(
                    new_first_line, style, hinting, max_width)
                temp_lines = temp_layout.iter_lines()
                temp_first_line = next(temp_lines, None)
                temp_second_line = next(temp_lines, None)

                if (temp_second_line is None and space >= 0) or space < 0:
                    hyphenated = True
                    # TODO: find why there's no need to .encode
                    resume_at = len(first_part + first_word_part)
                    layout = temp_layout
                    first_line = temp_first_line
                    second_line = temp_second_line
                    temp_first_line_width, _height = get_size(temp_first_line)
                    if temp_first_line_width <= max_width:
                        break

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        if hyphenated:
            # Is it really OK to remove hyphenation for word-break ?
            # Remove exactly one trailing hyphenate-character string.
            # ``str.rstrip(chars)`` must not be used here: it treats its
            # argument as a set of characters, so it could also strip real
            # text that happens to end with the same characters, and with an
            # empty hyphenate character it would strip the whole line.
            hyphenate_character = style.hyphenate_character
            if hyphenate_character and new_first_line.endswith(
                    hyphenate_character):
                new_first_line = new_first_line[:-len(hyphenate_character)]
            if second_line is not None:
                second_line_index = second_line.start_index
                second_part = utf8_slice(text, slice(second_line_index, None))
                new_first_line += second_part
            hyphenated = False

        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(new_first_line, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        temp_first_line = next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        # NOTE(review): the fallback below is a length in characters while
        # ``start_index`` and ``utf8_slice`` presumably work in UTF-8 bytes;
        # the two differ for non-ASCII text -- confirm before changing.
        temp_second_line_index = (
            len(new_first_line) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_part = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_part, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
import pyphen

# Use the closest dictionary pyphen actually ships for pt_BR; fall back to
# the requested name so a missing dictionary still raises in Pyphen().
lang = pyphen.language_fallback('pt_BR') or 'pt_BR'
dic = pyphen.Pyphen(lang=lang)

# Whitespace-separated list of words to hyphenate.
txt = """ hino """

expl = txt.split()
for w in expl:
    # print() as a function: the Python 2 print statement is a syntax error
    # on Python 3.
    print(dic.inserted(w))
import glob
import os

import pyphen

# Hyphenation dictionary used to estimate the number of syllables per word.
dic = pyphen.Pyphen(lang='en_US')
path = '/home/ubuntu/statistics'

for filename in glob.glob(os.path.join(path, 'fixed_*_words.txt')):
    shortFile = os.path.basename(filename)
    # 'fixed_X_words.txt' -> 'fixed_X_syllables.txt'
    # (the last 9 characters are 'words.txt')
    outputName = os.path.join(path, shortFile[:-9] + 'syllables.txt')

    # Context managers guarantee both files are flushed and closed, even if
    # hyphenation fails half-way through a file.
    with open(filename, 'r') as words, \
            open(outputName, 'w') as syllableWriter:
        # The first line is a header: copy it through (followed by an extra
        # newline, as before).
        topLine = words.readline()
        syllableWriter.write(topLine + "\n")

        for line in words:
            # Hyphenate the word itself, not the word plus its newline.
            word = line.rstrip('\n')
            hyphenated = dic.inserted(word)
            # One syllable, plus one per inserted hyphen.
            syllables = hyphenated.count('-') + 1
            syllableWriter.write(str(syllables))
            syllableWriter.write("\n")
words.append(word) start_times.append(start_time) end_times.append(end_time) transcription = { # contains all the info about the audio transcription if i-th result part 'name': blob_name, 'transcript': transcript, 'confidence': confidence, 'words': words, 'start_times': start_times, 'end_times': end_times } # syllabification part # use czech language dictionary pyphen.language_fallback("cs_CZ") dic = pyphen.Pyphen(lang='cs_CZ') syllables_stream = [] timestamps_stream = [] print("Running syllabification on " + str(i) + "th part of file...") # iterate over the recognized words for ii in range(len(transcription.get('words'))): word = transcription.get('words')[ii] start_time = transcription.get('start_times')[ii] end_time = transcription.get('end_times')[ii] word_syllables = dic.inserted(word) # syllabify word_syllables = word_syllables.split('-') # split into a list syllables_stream = syllables_stream + word_syllables numSyl = len(word_syllables) # number of syllables in the word
def split_first_line(text, style, context, max_width, justification_spacing,
                     minimum=False):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: mapping read with the keys ``white_space``, ``font_size``,
               ``hyphens``, ``lang``, ``hyphenate_limit_chars``,
               ``hyphenate_limit_zone``, ``hyphenate_character`` and
               ``overflow_wrap``.
    ``context``: forwarded to ``create_layout``; its ``dictionaries``
                 mapping is used as a cache of ``pyphen.Pyphen`` objects.
    ``max_width``: available width, or ``None``. It is replaced by ``None``
                   when ``white_space`` does not allow wrapping.
    ``justification_spacing``: forwarded to ``create_layout``.
    ``minimum``: when true, the word-breaking step (step 5) is skipped.

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        # No wrapping allowed: lay the text out without any width limit
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf') and
            style['font_size']):
        if max_width == 0:
            # Trying to find minimum size, let's naively split on spaces and
            # keep one word + one letter
            space_index = text.find(' ')
            if space_index == -1:
                expected_length = len(text)
            else:
                expected_length = space_index + 2  # index + space + one letter
        else:
            # Rough guess of how many characters fit in max_width
            expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, max_width, justification_spacing)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 skips the space that followed the word)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                # Second line is none
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style['hyphens']
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style['hyphenate_limit_zone'].unit == '%':
            limit_zone = max_width * style['hyphenate_limit_zone'].value / 100.
        else:
            limit_zone = style['hyphenate_limit_zone'].value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                # Candidate break points: every prefix of next_word ending
                # on a soft hyphen, longest prefix first
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                # Pyphen dictionaries are cached on the context
                dictionary_key = (lang, left, right, total)
                dictionary = context.dictionaries.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    context.dictionaries[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style['hyphenate_character'])
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context, max_width,
                        justification_spacing)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept the candidate if it stays on one line and either
                    # fits or is the last candidate available
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # NOTE(review): indexes ``text`` by character count;
                        # looks like it assumes new_first_line_text is a
                        # strict prefix of text -- confirm
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if not minimum and overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        # TODO: WRAP_CHAR is said to "wrap lines at character boundaries", but
        # it doesn't. Looks like it tries to split at word boundaries and then
        # at character boundaries if there's no enough space for a full word,
        # just as WRAP_WORD_CHAR does. That's why we have to split this text
        # twice. Find why. It may be related to the problem described in the
        # link given in step #3.
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = (
            first_line.length if second_line is None
            else second_line.start_index)

    # Nothing left to resume from when the whole text has been consumed
    if resume_at is not None and resume_at >= len(text.encode('utf-8')):
        resume_at = None
    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style, hyphenated,
        style['hyphenate_character'])
def split_first_line(text, style, context, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: object whose ``white_space``, ``hyphens``, ``lang``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap`` attributes
               are read.
    ``context``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for "no limit".
    ``line_width``: not used by this function.

    """
    # NOTE: despite its name, ``text_wrap`` is true for the white-space
    # values that do *not* wrap; it is only used to drop max_width below.
    text_wrap = style.white_space in ('pre', 'nowrap')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if text_wrap:
        max_width = None
    elif max_width is not None:
        # In some cases (shrink-to-fit result being the preferred width)
        # this value is coming from Pango itself,
        # but floating point errors have accumulated:
        #   width2 = (width + X) - X   # in some cases, width2 < width
        # Increase the value a bit to compensate and not introduce
        # an unexpected line break. The 1e-9 value comes from PEP 485.
        max_width *= 1 + 1e-9

    # Step #1: Get a draft layout with the first line
    layout = create_layout(text, style, context, max_width)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 skips the space that followed the word)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    # The whole text has been consumed
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                # Candidate break points: every prefix of next_word ending
                # on a soft hyphen, longest prefix first
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                # Pyphen dictionaries are cached per (lang, limits) key
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context, max_width)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept the candidate if it stays on one line and either
                    # fits or is the last candidate available
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # NOTE(review): indexes ``text`` by character count;
                        # looks like it assumes new_first_line_text is a
                        # strict prefix of text -- confirm
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        # NOTE(review): when there is no second line, resume_at becomes the
        # full byte length of text rather than None -- confirm callers
        # handle that value.
        resume_at = temp_second_line_index
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style,
        hyphenated, style.hyphenate_character)
import pyphen pyphen.language_fallback('es_ES_variant1') dic = pyphen.Pyphen(lang='es') def esVocal(texto): return texto in "aeiouáéíóú" def findSmallers(silabas): smallers = -1 min = 1000000 for i in range(len(silabas) - 1): if min > len(silabas[i]) + len(silabas[i + 1]): min = len(silabas[i]) + len(silabas[i + 1]) smallers = i return i def silabas(text, n): silabas = list( filter(lambda x: len(x) > 0, dic.inserted(text).replace(" ", "").split("-"))) if len(silabas) == n: return silabas elif len(silabas) < n: index = 0 while (len(silabas) < n): index = (index + 1) % len(silabas)
from typing import List import pyphen import nltk from nltk.tokenize import word_tokenize, sent_tokenize nltk.data.path.append("/srv/nltk_data") pyphen.language_fallback("en_GB") languages = {"en_US": "english", "en_GB": "english", "de_DE": "german"} class Text: def __init__(self, content: str, lang="en_GB"): self.language: str = lang self.content: str = content self.sentences: List["Sentence"] = Sentence.tokenize(content, lang=lang) @property def num_characters(self) -> int: return len(self.content) @property def num_sentences(self) -> int: return len(self.sentences) @property def words(self) -> List["Word"]: words: List["Word"] = [] for sentence in self.sentences:
def split_first_line(text, style, context, max_width, justification_spacing,
                     minimum=False):
    """Fit as much as possible in the available width for one line of text.

    The work is done in five steps: build a draft layout, return early when
    no split is needed, try to pull the next word onto the first line, try
    to hyphenate, and finally break inside a word when
    ``overflow-wrap: break-word`` applies.

    ``text``: the string to lay out.
    ``style``: mapping read here through the keys ``white_space``,
               ``font_size``, ``lang``, ``hyphens``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``context``: forwarded to ``create_layout``; its ``dictionaries`` mapping
                 is used as a cache of ``pyphen.Pyphen`` instances.
    ``max_width``: available width. It is replaced by ``None`` inside this
                   function when ``white_space`` does not allow wrapping.
    ``justification_spacing``: forwarded to ``create_layout``.
    ``minimum``: when true, the final word-breaking step (step 5) is skipped.

    Return ``(layout, length, resume_index, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_index``: The number of UTF-8 bytes to skip for the next line.
                      May be ``None`` if the whole text fits in one line.
                      This may be greater than ``length`` in case of preserved
                      newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    # Keep the caller's width: the whole-text layout below is always created
    # with it, even when wrapping is disabled and max_width becomes None.
    original_max_width = max_width
    if not text_wrap:
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf') and
            style['font_size']):
        if max_width == 0:
            # Trying to find minimum size, let's naively split on spaces and
            # keep one word + one letter
            space_index = text.find(' ')
            if space_index == -1:
                expected_length = len(text)
            else:
                expected_length = space_index + 2  # index + space + one letter
        else:
            # Presumably a rough estimate of how many characters can fit in
            # max_width for this font size.
            expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            first_line, index = layout.get_first_line()
            if index is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, original_max_width, justification_spacing)
        first_line, index = layout.get_first_line()
    resume_index = index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_index, space_collapse, style)
    first_line_width, _ = line_size(first_line, style)
    if index is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_index, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    # index is a UTF-8 byte offset, so slicing is done on the encoded text.
    first_line_text = text.encode('utf-8')[:index].decode('utf-8')
    # We can’t rely on first_line_width, see
    # https://github.com/Kozea/WeasyPrint/issues/1051
    first_line_fits = (
        first_line_width <= max_width or
        ' ' in first_line_text.strip() or
        can_break_text(first_line_text.strip(), style['lang']))
    if first_line_fits:
        # The first line fits but may have been cut too early by Pango
        second_line_text = text.encode('utf-8')[index:].decode('utf-8')
    else:
        # The line can't be split earlier, try to hyphenate the first word.
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            first_line, index = layout.get_first_line()
            first_line_width, _ = line_size(first_line, style)
            if index is None and first_line_text:
                # The next word fits in the first line, keep the layout
                resume_index = len(new_first_line_text.encode('utf-8')) + 1
                return first_line_metrics(
                    first_line, text, layout, resume_index, space_collapse,
                    style)
            elif index:
                # Text may have been split elsewhere by Pango earlier
                resume_index = index
            else:
                # Second line is none
                resume_index = first_line.length + 1
                if resume_index >= len(text.encode('utf-8')):
                    resume_index = None
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_index, space_collapse, style)

    # Step #4: Try to hyphenate
    hyphens = style['hyphens']
    # lang stays falsy when the style has no language; otherwise it is
    # whatever pyphen.language_fallback() returns for that language.
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = '\u00ad'

    try_hyphenate = False
    if hyphens != 'none':
        next_word_boundaries = get_next_word_boundaries(second_line_text, lang)
        if next_word_boundaries:
            # We have a word to hyphenate
            start_word, stop_word = next_word_boundaries
            next_word = second_line_text[start_word:stop_word]
            if stop_word - start_word >= total:
                # This word is long enough
                first_line_width, _ = line_size(first_line, style)
                space = max_width - first_line_width
                # The limit zone is either a percentage of max_width or a
                # value used as is.
                if style['hyphenate_limit_zone'].unit == '%':
                    limit_zone = (
                        max_width * style['hyphenate_limit_zone'].value / 100.)
                else:
                    limit_zone = style['hyphenate_limit_zone'].value
                if space > limit_zone or space < 0:
                    # Available space is worth the try, or the line is even too
                    # long to fit: try to hyphenate
                    try_hyphenate = True

    if try_hyphenate:
        # Automatic hyphenation possible and next word is long enough
        auto_hyphenation = hyphens == 'auto' and lang
        manual_hyphenation = False
        if auto_hyphenation:
            if soft_hyphen in first_line_text or soft_hyphen in next_word:
                # Automatic hyphenation opportunities within a word must be
                # ignored if the word contains a conditional hyphen, in favor
                # of the conditional hyphen(s).
                # See https://drafts.csswg.org/css-text-3/#valdef-hyphens-auto
                manual_hyphenation = True
        else:
            manual_hyphenation = hyphens == 'manual'

        if manual_hyphenation:
            # Manual hyphenation: check that the line ends with a soft
            # hyphen and add the missing hyphen
            if first_line_text.endswith(soft_hyphen):
                # The first line has been split on a soft hyphen
                if ' ' in first_line_text:
                    first_line_text, next_word = (
                        first_line_text.rsplit(' ', 1))
                    next_word = f' {next_word}'
                    layout.set_text(first_line_text)
                    first_line, index = layout.get_first_line()
                    resume_index = len((first_line_text + ' ').encode('utf8'))
                else:
                    first_line_text, next_word = '', first_line_text
            # Candidate prefixes of next_word, each ending on a soft hyphen,
            # ordered from the last soft hyphen to the first.
            soft_hyphen_indexes = [
                match.start() for match in
                re.finditer(soft_hyphen, next_word)]
            soft_hyphen_indexes.reverse()
            dictionary_iterations = [
                next_word[:i + 1] for i in soft_hyphen_indexes]
        elif auto_hyphenation:
            # Reuse a cached Pyphen instance for these settings when the
            # context already holds one.
            dictionary_key = (lang, left, right, total)
            dictionary = context.dictionaries.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                context.dictionaries[dictionary_key] = dictionary
            # Only the first part of each split yielded by iterate() is kept.
            dictionary_iterations = [
                start for start, end in dictionary.iterate(next_word)]
        else:
            dictionary_iterations = []

        if dictionary_iterations:
            # Try each candidate in order and keep the first whose
            # hyphenated line fits; the last candidate is accepted even if
            # it is too wide, provided it is not split.
            for first_word_part in dictionary_iterations:
                new_first_line_text = (
                    first_line_text +
                    second_line_text[:start_word] +
                    first_word_part)
                hyphenated_first_line_text = (
                    new_first_line_text + style['hyphenate_character'])
                new_layout = create_layout(
                    hyphenated_first_line_text, style, context, max_width,
                    justification_spacing)
                new_first_line, new_index = new_layout.get_first_line()
                new_first_line_width, _ = line_size(new_first_line, style)
                new_space = max_width - new_first_line_width
                if new_index is None and (
                        new_space >= 0 or
                        first_word_part == dictionary_iterations[-1]):
                    hyphenated = True
                    layout = new_layout
                    first_line = new_first_line
                    index = new_index
                    resume_index = len(new_first_line_text.encode('utf8'))
                    if text[len(new_first_line_text)] == soft_hyphen:
                        # Recreate the layout with no max_width to be sure that
                        # we don't break before the soft hyphen
                        pango.pango_layout_set_width(
                            layout.layout, units_from_double(-1))
                        resume_index += len(soft_hyphen.encode('utf8'))
                    break

            if not hyphenated and not first_line_text:
                # Recreate the layout with no max_width to be sure that
                # we don't break before or inside the hyphenate character
                hyphenated = True
                layout.set_text(hyphenated_first_line_text)
                pango.pango_layout_set_width(
                    layout.layout, units_from_double(-1))
                first_line, index = layout.get_first_line()
                resume_index = len(new_first_line_text.encode('utf8'))
                if text[len(first_line_text)] == soft_hyphen:
                    resume_index += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        first_line, index = layout.get_first_line()
        resume_index = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = line_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if not minimum and overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        pango.pango_layout_set_wrap(
            layout.layout, PANGO_WRAP_MODE['WRAP_CHAR'])
        first_line, index = layout.get_first_line()
        resume_index = index or first_line.length
        if resume_index >= len(text.encode('utf-8')):
            resume_index = None

    return first_line_metrics(
        first_line, text, layout, resume_index, space_collapse, style,
        hyphenated, style['hyphenate_character'])
#!/usr/bin/env python # -*- coding: utf-8 -*- import config import langdetect import pyphen from telegram import Update from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext pyphen.language_fallback('ru') def start(update: Update, context: CallbackContext) -> None: """Send a message when the command /start is issued.""" if(update.message.from_user.language_code == 'ru'): update.message.reply_text('Привет! Этот бот автоматически формирует хайку/хокку в ответ на 17 слоговые сообщения.') else: update.message.reply_text('Hi! This bot automatically generates haiku/hokku in response to 17 syllable messages.') def haikudetect(update: Update, context: CallbackContext) -> None: if not update.message.text: return message = update.message.text words = message.split() # Checking words count if (len(words) < 3 or len(words) > 17): return # Loading pyphen dictionary dic = pyphen.Pyphen(lang=langdetect.detect(message)) # Counting syllables syllable_count_in_message = 0 syllable_count = 0 line = 0