Example #1
0
def hyphenate(dom, _lang):
    """Insert hyphenation points into the hyphenatable nodes of a post.

    ``dom`` is the parsed document; it is walked with ``dom.xpath`` and is
    always returned, whether or not any hyphenation happened.
    ``_lang`` is the site language code; it is mapped to a Pyphen locale
    through ``LEGAL_VALUES["PYPHEN_LOCALES"]``, falling back to
    ``pyphen.language_fallback(_lang)``.

    Nothing is hyphenated when pyphen is unavailable, when no locale can be
    resolved, or when Pyphen raises ``KeyError`` for the locale.
    """
    # circular import prevention
    from .nikola import LEGAL_VALUES

    if pyphen is None:
        utils.req_missing(["pyphen"], "hyphenate texts", optional=True)
        return dom

    locale = LEGAL_VALUES["PYPHEN_LOCALES"].get(_lang, pyphen.language_fallback(_lang))
    if locale is None:
        return dom

    # If pyphen does exist, we tell the user when configuring the site.
    # If it does not support a language, we ignore it quietly.
    try:
        hyphenator = pyphen.Pyphen(lang=locale)
    except KeyError:
        LOGGER.error("Cannot find hyphenation dictoniaries for {0} (from {1}).".format(locale, _lang))
        LOGGER.error("Pyphen cannot be installed to ~/.local (pip install --user).")
        return dom

    # A node is left alone when any direct child has one of these tags.
    never_hyphenate = ["kbd", "code", "samp", "mark", "math", "data", "ruby", "svg"]

    for tag in ("p", "li", "span"):
        for node in dom.xpath("//%s[not(parent::pre)]" % tag):
            children = node.getchildren()
            if children:
                # The node's own class is only inspected when it has no
                # children; with children, only the children decide.
                skip = any(
                    child.tag in never_hyphenate
                    or (child.tag == "span" and "math" in child.get("class", []))
                    for child in children
                )
            else:
                skip = "math" in node.get("class", [])
            if not skip:
                insert_hyphens(node, hyphenator)
    return dom
Example #2
0
def hyphenate(dom, _lang):
    """Hyphenate a post.

    ``dom`` is the parsed document; it is walked with ``dom.xpath`` and is
    always returned. ``_lang`` is the site language code, mapped to a Pyphen
    locale through ``LEGAL_VALUES['PYPHEN_LOCALES']`` with
    ``pyphen.language_fallback(_lang)`` as the default.

    The document is returned untouched when pyphen is not available, when no
    locale can be resolved, or when Pyphen has no dictionary for the locale
    (``KeyError``), in which case two error messages are logged.
    """
    # circular import prevention
    from .nikola import LEGAL_VALUES

    # pyphen may be None when the optional dependency is missing; check this
    # before touching any of its attributes.
    if pyphen is None:
        return dom

    lang = LEGAL_VALUES['PYPHEN_LOCALES'].get(_lang, pyphen.language_fallback(_lang))
    if lang is None:
        return dom

    # If pyphen does exist, we tell the user when configuring the site.
    # If it does not support a language, we ignore it quietly.
    try:
        hyphenator = pyphen.Pyphen(lang=lang)
    except KeyError:
        LOGGER.error("Cannot find hyphenation dictoniaries for {0} (from {1}).".format(lang, _lang))
        LOGGER.error("Pyphen cannot be installed to ~/.local (pip install --user).")
        # Without a hyphenator there is nothing to insert; bail out instead
        # of falling through with an unbound name.
        return dom

    # A node is skipped when any direct child has one of these tags.
    skippable_nodes = ['kbd', 'code', 'samp', 'mark', 'math', 'data', 'ruby', 'svg']
    for tag in ('p', 'li', 'span'):
        for node in dom.xpath("//%s[not(parent::pre)]" % tag):
            children = node.getchildren()
            if children:
                # With children present, only the children decide; the
                # node's own class is not inspected.
                skip_node = any(
                    child.tag in skippable_nodes
                    or (child.tag == 'span' and 'math' in child.get('class', []))
                    for child in children
                )
            else:
                skip_node = 'math' in node.get('class', [])
            if not skip_node:
                insert_hyphens(node, hyphenator)
    return dom
Example #3
0
def test_fallback():
    """Check ``pyphen.language_fallback`` against a table of known results."""
    # (requested language tag, expected result)
    expectations = (
        ('en', 'en'),
        ('en_US', 'en_US'),
        ('en_FR', 'en'),
        ('en-Latn-US', 'en_Latn_US'),
        ('en-Cyrl-US', 'en'),
        ('fr-Latn-FR', 'fr'),
        ('en-US_variant1-x', 'en_US'),
    )
    for requested, expected in expectations:
        assert pyphen.language_fallback(requested) == expected
Example #4
0
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used in this function:

    ``text``: the string to lay out.
    ``style``: object whose ``white_space``, ``font_size``, ``hyphens``,
               ``lang``, ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap`` attributes are
               read here.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for no limit. It is forced to
                   ``None`` when ``white_space`` is ``'pre'`` or ``'nowrap'``.
    ``line_width``: not used by this function.

    The function proceeds in five steps: draft layout, early return when
    nothing has to be split, pulling the next word onto the first line,
    hyphenation (manual or dictionary based), and finally breaking inside a
    word when ``overflow_wrap`` is ``'break-word'``.

    """
    # NOTE: despite its name, ``text_wrap`` is true for the white-space values
    # for which the width limit is dropped just below.
    text_wrap = style.white_space in ('pre', 'nowrap')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if text_wrap:
        max_width = None
    elif max_width is not None:
        # In some cases (shrink-to-fit result being the preferred width)
        # this value is coming from Pango itself,
        # but floating point errors have accumulated:
        #   width2 = (width + X) - X   # in some cases, width2 < width
        # Increase the value a bit to compensate and not introduce
        # an unexpected line break. The 1e-9 value comes from PEP 485.
        max_width *= 1 + 1e-9

    # Step #1: Get a draft layout with the first line
    layout = None
    # A max_width of 0 is falsy and therefore also skips the shortcut below.
    if max_width:
        # Rough estimate of how many characters may be needed for one line.
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text.
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_width <= max_width:
                # The next word fits in the first line, keep the layout
                # The ``+ 1`` presumably skips the space that followed
                # next_word in ``text`` -- TODO confirm.
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse, style)
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    # ``lang`` stays falsy when the style has no language set.
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        # The limit zone is either a percentage of max_width or used as is.
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        # Move the last (partial) word of the first line back
                        # into next_word, keeping its leading space.
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout = create_layout(
                            first_line_text, style, hinting, max_width)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                # Try the longest candidate prefix first.
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                # Only the part before each break point is kept.
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, hinting, max_width)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept the candidate when it stays on one line and either
                    # fits or is the last candidate available.
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # NOTE(review): this indexes ``text`` by character and
                        # would raise IndexError if new_first_line_text were as
                        # long as ``text`` -- confirm that cannot happen.
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    # The values used here are those left over from the last
                    # iteration of the loop above.
                    hyphenated = True
                    layout = create_layout(
                        hyphenated_first_line_text, style, hinting, None)
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout = create_layout(
            hyphenated_first_line_text, style, hinting, None)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        # A temporary layout over the whole text is used only to find where
        # the second line would start.
        temp_layout = create_layout(text, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_line_text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style, hyphenated,
        style.hyphenate_character)
Example #5
0
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used in this function:

    ``text``: the string to lay out.
    ``style``: object whose ``font_size``, ``hyphens``, ``lang``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap`` attributes are
               read here.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for no limit.
    ``line_width``: not used by this function.

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        # The tolerance added here is 20% of the font size.
        max_width += style.font_size * 0.2
    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        # Rough estimate of how many characters may be needed for one line.
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    # Everything up to the first space of the remaining text.
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space's advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout
        # The ``+ 1`` presumably skips the space that followed next_word in
        # ``text`` -- TODO confirm.
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    # ``lang`` stays falsy when the style has no language set.
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars

    hyphenated = False

    # Automatic hyphenation possible and next word is long enough
    if hyphens not in ('none', 'manual') and lang and len(next_word) >= total:
        first_line_width, _height = get_size(first_line)
        space = max_width - first_line_width
        # The limit zone is either a percentage of max_width or used as is.
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # The next word does not fit, try hyphenation
            dictionary_key = (lang, left, right, total)
            dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
            for first_word_part, _ in dictionary.iterate(next_word):
                new_first_line = (
                    first_part + first_word_part + style.hyphenate_character)
                temp_layout = create_layout(
                    new_first_line, style, hinting, max_width)
                temp_lines = temp_layout.iter_lines()
                temp_first_line = next(temp_lines, None)
                temp_second_line = next(temp_lines, None)

                # ``space`` is computed once before the loop and does not
                # change between candidates.
                if (temp_second_line is None and space >= 0) or space < 0:
                    hyphenated = True
                    # TODO: find why there's no need to .encode
                    resume_at = len(first_part + first_word_part)
                    layout = temp_layout
                    first_line = temp_first_line
                    second_line = temp_second_line
                    temp_first_line_width, _height = get_size(temp_first_line)
                    # Stop at the first candidate that fits; otherwise keep
                    # trying the following candidates.
                    if temp_first_line_width <= max_width:
                        break

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        if hyphenated:
            # Is it really OK to remove hyphenation for word-break ?
            # NOTE(review): str.rstrip treats its argument as a set of
            # characters, not as a suffix, so this may strip more than the
            # hyphenate character -- confirm the intent.
            new_first_line = new_first_line.rstrip(
                new_first_line[-(len(style.hyphenate_character)):])
            if second_line is not None:
                second_line_index = second_line.start_index
                second_part = utf8_slice(text, slice(second_line_index, None))
                new_first_line += second_part
            hyphenated = False

        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(new_first_line, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        temp_first_line = next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        # NOTE(review): the fallback is a character count while start_index
        # is used as a UTF-8 offset elsewhere in this function -- confirm the
        # two agree for non-ASCII text.
        temp_second_line_index = (
            len(new_first_line) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_part = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_part, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
Example #6
0
def split_first_line(text, style, context, max_width, justification_spacing):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters, as used in this function:

    ``text``: the string to lay out.
    ``style``: mapping whose ``white_space``, ``font_size``, ``hyphens``,
               ``lang``, ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap`` entries are read
               here.
    ``context``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for no limit. It is forced to
                   ``None`` when ``white_space`` does not allow wrapping.
    ``justification_spacing``: forwarded unchanged to ``create_layout``.

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    # The shortcut needs a finite width and a non-zero font size, because the
    # estimate below divides by the font size.
    if (max_width is not None and max_width != float('inf')
            and style['font_size']):
        expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(text[:expected_length], style, context,
                                   max_width, justification_spacing)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, context, max_width,
                               justification_spacing)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text.
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # The ``+ 1`` presumably skips the space that followed
                # next_word in ``text`` -- TODO confirm.
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(first_line, text, layout, resume_at,
                                          space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                # Second line is none
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(first_line, text, layout, resume_at,
                                  space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style['hyphens']
    # ``lang`` stays falsy when the style has no language set.
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        # The limit zone is either a percentage of max_width or used as is.
        if style['hyphenate_limit_zone'].unit == '%':
            limit_zone = max_width * style['hyphenate_limit_zone'].value / 100.
        else:
            limit_zone = style['hyphenate_limit_zone'].value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        # Move the last (partial) word of the first line back
                        # into next_word, keeping its leading space.
                        first_line_text, next_word = (first_line_text.rsplit(
                            u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                soft_hyphen_indexes = [
                    match.start()
                    for match in re.finditer(soft_hyphen, next_word)
                ]
                # Try the longest candidate prefix first.
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes
                ]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(lang=lang,
                                               left=left,
                                               right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                # Only the part before each break point is kept.
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)
                ]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (new_first_line_text +
                                                  style['hyphenate_character'])
                    new_layout = create_layout(hyphenated_first_line_text,
                                               style, context, max_width,
                                               justification_spacing)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept the candidate when it stays on one line and either
                    # fits or is the last candidate available.
                    if new_second_line is None and (
                            new_space >= 0
                            or first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # NOTE(review): this indexes ``text`` by character and
                        # would raise IndexError if new_first_line_text were as
                        # long as ``text`` -- confirm that cannot happen.
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    # The values used here are those left over from the last
                    # iteration of the loop above.
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(layout.layout,
                                                 units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (first_line_text +
                                      style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        # The existing layout is reused: full text, width restored to
        # max_width, wrap mode switched to WRAP_CHAR.
        layout.set_text(text)
        pango.pango_layout_set_width(layout.layout,
                                     units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (len(text.encode('utf-8'))
                                  if temp_second_line is None else
                                  temp_second_line.start_index)
        # TODO: WRAP_CHAR is said to "wrap lines at character boundaries", but
        # it doesn't. Looks like it tries to split at word boundaries and then
        # at character boundaries if there's no enough space for a full word,
        # just as WRAP_WORD_CHAR does. That's why we have to split this text
        # twice. Find why. It may be related to the problem described in the
        # link given in step #3.
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = (first_line.length
                     if second_line is None else second_line.start_index)

    # A resume position at or past the end of the text means nothing is left.
    if resume_at is not None and resume_at >= len(text.encode('utf-8')):
        resume_at = None
    return first_line_metrics(first_line, text, layout, resume_at,
                              space_collapse, style, hyphenated,
                              style['hyphenate_character'])
Example #7
0
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import string, nltk, pyphen, glob, os
import numpy as np

# Module-level setup, executed once when this file is imported/run.
# Ask NLTK to fetch its 'punkt' resource.
nltk.download('punkt')
# The value returned by language_fallback is not stored or used here.
pyphen.language_fallback('en')
# Shared Pyphen instance built for the 'en' language.
WordDic = pyphen.Pyphen(lang='en')


def saveNPZ(Array, OutputFile):
    """Store ``Array`` in a compressed NumPy archive at ``OutputFile``.

    The array is written under the archive key ``array1``.
    """
    named_arrays = {'array1': Array}
    np.savez_compressed(OutputFile, **named_arrays)


def loadNPZ(InputFile):
    """Return the array stored under the key ``array1`` in ``InputFile``.

    ``InputFile`` is handed to ``np.load``. The archive object is used as a
    context manager so that the underlying file is closed as soon as the
    array has been read, instead of staying open until garbage collection.
    """
    with np.load(InputFile) as archive:
        return archive['array1']


def saveListTxt(Array, OutputFile):
    """Write every item of ``Array`` to ``OutputFile``, one item per line.

    Each item is formatted with ``'%s\\n'``; the file is opened in ``'w'``
    mode, so any previous content is replaced.
    """
    with open(OutputFile, 'w') as handle:
        handle.writelines('%s\n' % entry for entry in Array)


def loadListTxt(InputFile):
    """Read ``InputFile`` and return its lines as a list of strings.

    Line terminators are removed (``str.splitlines``). An empty file gives
    an empty list.
    """
    with open(InputFile, 'r') as f:
        # The list has to be returned: without this the caller gets None.
        return f.read().splitlines()
import pyphen
import csv

# The value returned by language_fallback is not used.
pyphen.language_fallback('it_IT')
engine = pyphen.Pyphen(lang='it_IT')

# Distinct short syllables seen so far, and how often each one was seen.
syllables = set()
syllableCount = dict()
syllablesLen = 0
uniqueWords = set()

with open('dicts/it.csv') as dictionary:
    for idx, word in enumerate(dictionary):
        # Lines are stored and hyphenated exactly as read from the file.
        uniqueWords.add(word)
        currentSylls = engine.inserted(word).split('-')
        for s in currentSylls:
            s = s.strip('\n')
            # Only syllables of one to three characters are counted.
            if not 1 <= len(s) <= 3:
                continue
            if s in syllables:
                # Syllable already known: one more occurrence.
                syllableCount[s] += 1
            else:
                # First time this syllable is seen.
                syllables.add(s)
                syllableCount[s] = 1
            syllablesLen = len(syllables)

# Summary statistics over the per-syllable occurrence counts.
minOccurences = min(syllableCount.values())
maxOccurences = max(syllableCount.values())
avg = sum(syllableCount.values()) / len(syllableCount)
Example #9
0
import argparse
import logging
import sys

import pyphen
import nltk
from termcolor import colored

# The value returned by language_fallback is not used.
pyphen.language_fallback("en_US")

# Handler that writes records of level DEBUG and above to standard output.
console_out = logging.StreamHandler(sys.stdout)
console_out.setLevel(logging.DEBUG)

# Root logger: level INFO, with the stdout handler attached.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(console_out)


def parse_arguments():
    """
    Read the text to edit from the command line
    :return: The value of the single positional ``text`` argument
    """
    cli = argparse.ArgumentParser(description="Receive text to be edited")
    cli.add_argument("text", metavar="input text", type=str)
    return cli.parse_args().text


def clean_input(text):
    """
    Text sanitization function
Example #10
0
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return the value of ``first_line_metrics(...)``, described by the
    original authors as
    ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters:

    ``text``: the string to split.
    ``style``: object read through the attributes ``font_size``,
               ``hyphens``, ``lang``, ``hyphenate_limit_chars``,
               ``hyphenate_limit_zone``, ``hyphenate_character`` and
               ``overflow_wrap``.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` when the first line may
                   take all the place it needs.
    ``line_width``: not referenced in this function body.

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width *= 1.0001
    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        # Rough guess of how many characters could fit in max_width
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(text[:expected_length], style, hinting,
                                   max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenate when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenate it
        first_part = ''
        second_part = text
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space's advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout
        # (+ 1 skips the space that followed next_word)
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenate
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars

    hyphenated = False

    # Automatic hyphenation possible and next word is long enough
    if hyphens not in ('none', 'manual') and lang and len(next_word) >= total:
        first_line_width, _height = get_size(first_line)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # The next word does not fit, try hyphenation
            # Pyphen objects are cached per (lang, left, right, total)
            dictionary_key = (lang, left, right, total)
            dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
            for first_word_part, _ in dictionary.iterate(next_word):
                new_first_line = (first_part + first_word_part +
                                  style.hyphenate_character)
                temp_layout = create_layout(new_first_line, style, hinting,
                                            max_width)
                temp_lines = temp_layout.iter_lines()
                temp_first_line = next(temp_lines, None)
                temp_second_line = next(temp_lines, None)

                if (temp_second_line is None and space >= 0) or space < 0:
                    hyphenated = True
                    # TODO: find why there's no need to .encode
                    resume_at = len(first_part + first_word_part)
                    layout = temp_layout
                    first_line = temp_first_line
                    second_line = temp_second_line
                    temp_first_line_width, _height = get_size(temp_first_line)
                    if temp_first_line_width <= max_width:
                        # This candidate fits: stop looking
                        break

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        if hyphenated:
            # Is it really OK to remove hyphenation for word-break ?
            # Remove exactly the hyphenate-character string appended in
            # step #4. str.rstrip() must not be used here: it strips a *set*
            # of characters, so it could also eat trailing characters of the
            # word itself (or the whole line when the string is empty).
            hyphen_length = len(style.hyphenate_character)
            if hyphen_length:
                new_first_line = new_first_line[:-hyphen_length]
            if second_line is not None:
                second_line_index = second_line.start_index
                second_part = utf8_slice(text, slice(second_line_index, None))
                new_first_line += second_part
            hyphenated = False

        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(new_first_line, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        temp_first_line = next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (len(new_first_line)
                                  if temp_second_line is None else
                                  temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_part = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_part, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
Example #11
0
import pyphen
# The value returned by language_fallback is not used.
pyphen.language_fallback('pt_BR')
dic = pyphen.Pyphen(lang='pt_BR')
txt = """ hino """
expl = txt.split()
for w in expl:
    # Print each word with its hyphenation points inserted. The call form
    # of print is valid on Python 3 and, with a single argument, prints the
    # same text on Python 2.
    print(dic.inserted(w))
import pyphen
import os
import glob

# The results of the next two expressions are not stored or used.
pyphen.language_fallback('nl_NL_variant1')
'nl_NL' in pyphen.LANGUAGES

dic = pyphen.Pyphen(lang='en_US')

path = '/home/ubuntu/statistics'
for filename in glob.glob(os.path.join(path, 'fixed_*_words.txt')):
    shortFile = os.path.basename(filename)
    # Replace the trailing 'words.txt' (9 characters) with 'syllables.txt'.
    outputName = '/home/ubuntu/statistics/' + shortFile[:-9] + 'syllables.txt'
    # Both files are closed when the block exits, even if an error occurs;
    # previously one input and one output handle leaked per matched file.
    with open(filename, 'r') as words, open(outputName, 'w') as syllableWriter:
        # The first line is copied through, followed by an extra newline.
        topLine = words.readline()
        syllableWriter.write(topLine + "\n")

        # Each remaining line is hyphenated as read (line ending included).
        for word in words:
            hyphenated = dic.inserted(word)
            # One syllable, plus one for every '-' in the hyphenated form.
            syllables = 1 + hyphenated.count('-')
            syllableWriter.write(str(syllables))
            syllableWriter.write("\n")
Example #13
0
            words.append(word)
            start_times.append(start_time)
            end_times.append(end_time)
        transcription = {  # contains all the info about the audio transcription if i-th result part
            'name': blob_name,
            'transcript': transcript,
            'confidence': confidence,
            'words': words,
            'start_times': start_times,
            'end_times': end_times
        }

        # syllabification part
        # use czech language dictionary
        pyphen.language_fallback("cs_CZ")
        dic = pyphen.Pyphen(lang='cs_CZ')

        syllables_stream = []
        timestamps_stream = []
        print("Running syllabification on " + str(i) + "th part of file...")
        # iterate over the recognized words
        for ii in range(len(transcription.get('words'))):
            word = transcription.get('words')[ii]
            start_time = transcription.get('start_times')[ii]
            end_time = transcription.get('end_times')[ii]

            word_syllables = dic.inserted(word)  # syllabify
            word_syllables = word_syllables.split('-')  # split into a list
            syllables_stream = syllables_stream + word_syllables
            numSyl = len(word_syllables)  # number of syllables in the word
Example #14
0
def split_first_line(text, style, context, max_width, justification_spacing,
                     minimum=False):
    """Fit as much as possible in the available width for one line of text.

    Return the value of ``first_line_metrics(...)``, described by the
    original authors as
    ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters:

    ``text``: the string to split.
    ``style``: mapping read through the keys ``white_space``,
               ``font_size``, ``hyphens``, ``lang``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``context``: forwarded to ``create_layout``; its ``dictionaries``
                 mapping is used as a cache of ``pyphen.Pyphen`` objects.
    ``max_width``: available width, or ``None`` for no limit. It is
                   replaced by ``None`` when ``white_space`` does not allow
                   wrapping.
    ``justification_spacing``: forwarded unchanged to ``create_layout``.
    ``minimum``: when true, step 5 (breaking a too-long word) is skipped.

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        # No wrapping allowed: lay the text out without a width limit
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf') and
            style['font_size']):
        if max_width == 0:
            # Trying to find minimum size, let's naively split on spaces and
            # keep one word + one letter
            space_index = text.find(' ')
            if space_index == -1:
                expected_length = len(text)
            else:
                expected_length = space_index + 2  # index + space + one letter
        else:
            # Rough guess of how many characters could fit in max_width
            expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, max_width, justification_spacing)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenate it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 skips the space that followed next_word)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                # Second line is none
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenate
    hyphens = style['hyphens']
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Hyphenation is not disabled and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style['hyphenate_limit_zone'].unit == '%':
            limit_zone = max_width * style['hyphenate_limit_zone'].value / 100.
        else:
            limit_zone = style['hyphenate_limit_zone'].value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        # Move the last word of the first line back into
                        # next_word, keeping its leading space
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                # Candidate prefixes of next_word, each ending with one of
                # its soft hyphens, longest first
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                # Pyphen objects are cached per (lang, left, right, total)
                dictionary_key = (lang, left, right, total)
                dictionary = context.dictionaries.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    context.dictionaries[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style['hyphenate_character'])
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context, max_width,
                        justification_spacing)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept a candidate laid out on a single line when it
                    # fits, or when it is the last candidate available
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # Also skip a soft hyphen found right after the
                        # kept part of the text
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    # NOTE(review): this test indexes text with
                    # len(first_line_text) while the loop above uses
                    # len(new_first_line_text) -- confirm this is intended
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if not minimum and overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        # TODO: WRAP_CHAR is said to "wrap lines at character boundaries", but
        # it doesn't. Looks like it tries to split at word boundaries and then
        # at character boundaries if there's not enough space for a full word,
        # just as WRAP_WORD_CHAR does. That's why we have to split this text
        # twice. Find why. It may be related to the problem described in the
        # link given in step #3.
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = (
            first_line.length if second_line is None
            else second_line.start_index)

    # Nothing left to resume from once the whole text has been consumed
    if resume_at is not None and resume_at >= len(text.encode('utf-8')):
        resume_at = None
    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style, hyphenated,
        style['hyphenate_character'])
Example #15
0
def split_first_line(text, style, context, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    text_wrap = style.white_space in ('pre', 'nowrap')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if text_wrap:
        max_width = None
    elif max_width is not None:
        # In some cases (shrink-to-fit result being the preferred width)
        # this value is coming from Pango itself,
        # but floating point errors have accumulated:
        #   width2 = (width + X) - X   # in some cases, width2 < width
        # Increase the value a bit to compensate and not introduce
        # an unexpected line break. The 1e-9 value comes from PEP 485.
        max_width *= 1 + 1e-9

    # Step #1: Get a draft layout with the first line
    layout = create_layout(text, style, context, max_width)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse, style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context, max_width)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        if text[len(new_first_line_text)] == soft_hyphen:
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout.  Maybe insert Unicode shaping characters in text ?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style, hyphenated,
        style.hyphenate_character)
Example #16
0
import pyphen

# NOTE(review): the value returned by language_fallback() is discarded here,
# so this call does not influence `dic` below -- confirm it is still needed.
pyphen.language_fallback('es_ES_variant1')

# Module-level Spanish hyphenation dictionary used by the helpers in this
# module to split words into syllable-like chunks.
dic = pyphen.Pyphen(lang='es')


def esVocal(texto):
    """Return True when ``texto`` is exactly one Spanish vowel.

    Plain and acute-accented vowels are recognised, in either case.

    The check is done on a single character: a bare substring test would
    also accept the empty string and multi-letter runs such as ``"ei"``,
    neither of which is a vowel.
    """
    return len(texto) == 1 and texto.lower() in "aeiouáéíóú"


def findSmallers(silabas):
    """Return the index of the shortest pair of adjacent syllables.

    ``silabas``: sequence of strings (syllables).

    The result is the index ``i`` for which
    ``len(silabas[i]) + len(silabas[i + 1])`` is minimal; on ties the first
    such index wins. ``-1`` is returned when there are fewer than two
    syllables, i.e. when no adjacent pair exists.
    """
    pair_lengths = [
        len(left) + len(right)
        for left, right in zip(silabas, silabas[1:])]
    if not pair_lengths:
        return -1
    # list.index() returns the first occurrence, which keeps the
    # "first minimal pair wins" behaviour of a strict ``<`` comparison.
    return pair_lengths.index(min(pair_lengths))


def silabas(text, n):
    silabas = list(
        filter(lambda x: len(x) > 0,
               dic.inserted(text).replace(" ", "").split("-")))
    if len(silabas) == n:
        return silabas
    elif len(silabas) < n:
        index = 0
        while (len(silabas) < n):
            index = (index + 1) % len(silabas)
Example #17
0
from typing import List
import pyphen
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Add a fixed directory to the list of paths NLTK searches for its data.
nltk.data.path.append("/srv/nltk_data")

# NOTE(review): the value returned by language_fallback() is discarded here --
# confirm this call is still needed.
pyphen.language_fallback("en_GB")

# Maps locale codes to language names.
# Presumably these are the names expected by the NLTK tokenizers -- verify
# against the code that reads this mapping.
languages = {"en_US": "english", "en_GB": "english", "de_DE": "german"}


class Text:
    def __init__(self, content: str, lang="en_GB"):
        """Keep the raw text and its language, then split it into sentences.

        ``content``: the text to analyse.
        ``lang``: locale code, forwarded to ``Sentence.tokenize``.
        """
        self.content: str = content
        self.language: str = lang
        sentences: List["Sentence"] = Sentence.tokenize(content, lang=lang)
        self.sentences = sentences

    @property
    def num_characters(self) -> int:
        return len(self.content)

    @property
    def num_sentences(self) -> int:
        return len(self.sentences)

    @property
    def words(self) -> List["Word"]:
        words: List["Word"] = []
        for sentence in self.sentences:
Example #18
0
def split_first_line(text, style, context, max_width, justification_spacing,
                     minimum=False):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_index, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_index``: The number of UTF-8 bytes to skip for the next line.
                      May be ``None`` if the whole text fits in one line.
                      This may be greater than ``length`` in case of preserved
                      newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Arguments:

    ``text``: the string to lay out. Offsets handled here are byte offsets
              into its UTF-8 encoding.
    ``style``: mapping of style values. This function reads ``white_space``,
               ``font_size``, ``lang``, ``hyphens``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``context``: forwarded to ``create_layout``; its ``dictionaries`` mapping
                 is used as a cache of ``pyphen.Pyphen`` instances.
    ``max_width``: available width. It is ignored for line splitting when the
                   ``white_space`` value does not allow wrapping.
    ``justification_spacing``: forwarded unchanged to ``create_layout``.
    ``minimum``: when true, step 5 (breaking a too-long word because of
                 ``overflow_wrap: break-word``) is skipped.

    The function proceeds in five steps, each of which may return early:
    draft layout, no-split shortcuts, pulling the next word onto the first
    line, hyphenation, and finally breaking inside a word.

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    # Keep the caller's width: the full-text layout below is always created
    # with it, even when wrapping is disabled and max_width is reset to None.
    original_max_width = max_width
    if not text_wrap:
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf') and
            style['font_size']):
        if max_width == 0:
            # Trying to find minimum size, let's naively split on spaces and
            # keep one word + one letter
            space_index = text.find(' ')
            if space_index == -1:
                expected_length = len(text)
            else:
                expected_length = space_index + 2  # index + space + one letter
        else:
            # Heuristic estimate of how many characters can fit in max_width,
            # derived from the font size.
            expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            first_line, index = layout.get_first_line()
            if index is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, original_max_width, justification_spacing)
        first_line, index = layout.get_first_line()
    # index is None when the layout has no second line; otherwise it is used
    # as the UTF-8 byte offset at which the next line resumes.
    resume_index = index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_index, space_collapse, style)
    first_line_width, _ = line_size(first_line, style)
    if index is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_index, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    first_line_text = text.encode('utf-8')[:index].decode('utf-8')
    # We can’t rely on first_line_width, see
    # https://github.com/Kozea/WeasyPrint/issues/1051
    first_line_fits = (
        first_line_width <= max_width or
        ' ' in first_line_text.strip() or
        can_break_text(first_line_text.strip(), style['lang']))
    if first_line_fits:
        # The first line fits but may have been cut too early by Pango
        second_line_text = text.encode('utf-8')[index:].decode('utf-8')
    else:
        # The line can't be split earlier, try to hyphenate the first word.
        first_line_text = ''
        second_line_text = text

    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            first_line, index = layout.get_first_line()
            first_line_width, _ = line_size(first_line, style)
            if index is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 skips the space that followed next_word)
                resume_index = len(new_first_line_text.encode('utf-8')) + 1
                return first_line_metrics(
                    first_line, text, layout, resume_index, space_collapse,
                    style)
            elif index:
                # Text may have been split elsewhere by Pango earlier
                resume_index = index
            else:
                # Second line is none
                resume_index = first_line.length + 1
                if resume_index >= len(text.encode('utf-8')):
                    resume_index = None
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_index, space_collapse, style)

    # Step #4: Try to hyphenate
    hyphens = style['hyphens']
    # lang stays falsy when style['lang'] is unset; otherwise it is whatever
    # pyphen.language_fallback() returns for that language.
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    # total is compared with the word length below; left and right are
    # passed to pyphen.Pyphen.
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = '\u00ad'

    try_hyphenate = False
    if hyphens != 'none':
        next_word_boundaries = get_next_word_boundaries(second_line_text, lang)
        if next_word_boundaries:
            # We have a word to hyphenate
            start_word, stop_word = next_word_boundaries
            next_word = second_line_text[start_word:stop_word]
            if stop_word - start_word >= total:
                # This word is long enough
                first_line_width, _ = line_size(first_line, style)
                space = max_width - first_line_width
                # The limit zone is either a percentage of max_width or a
                # value used as is.
                if style['hyphenate_limit_zone'].unit == '%':
                    limit_zone = (
                        max_width * style['hyphenate_limit_zone'].value / 100.)
                else:
                    limit_zone = style['hyphenate_limit_zone'].value
                if space > limit_zone or space < 0:
                    # Available space is worth the try, or the line is even too
                    # long to fit: try to hyphenate
                    try_hyphenate = True

    if try_hyphenate:
        # Automatic hyphenation possible and next word is long enough
        auto_hyphenation = hyphens == 'auto' and lang
        manual_hyphenation = False
        if auto_hyphenation:
            if soft_hyphen in first_line_text or soft_hyphen in next_word:
                # Automatic hyphenation opportunities within a word must be
                # ignored if the word contains a conditional hyphen, in favor
                # of the conditional hyphen(s).
                # See https://drafts.csswg.org/css-text-3/#valdef-hyphens-auto
                manual_hyphenation = True
        else:
            manual_hyphenation = hyphens == 'manual'

        if manual_hyphenation:
            # Manual hyphenation: check that the line ends with a soft
            # hyphen and add the missing hyphen
            if first_line_text.endswith(soft_hyphen):
                # The first line has been split on a soft hyphen
                if ' ' in first_line_text:
                    # Move the last (partial) word of the first line back
                    # into next_word so it can be split again below.
                    first_line_text, next_word = (
                        first_line_text.rsplit(' ', 1))
                    next_word = f' {next_word}'
                    layout.set_text(first_line_text)
                    first_line, index = layout.get_first_line()
                    resume_index = len((first_line_text + ' ').encode('utf8'))
                else:
                    first_line_text, next_word = '', first_line_text
            # Candidate prefixes of next_word, each one ending with a soft
            # hyphen, ordered from the longest to the shortest.
            soft_hyphen_indexes = [
                match.start() for match in re.finditer(soft_hyphen, next_word)]
            soft_hyphen_indexes.reverse()
            dictionary_iterations = [
                next_word[:i + 1] for i in soft_hyphen_indexes]
        elif auto_hyphenation:
            # Reuse one Pyphen instance per (lang, left, right, total) key.
            dictionary_key = (lang, left, right, total)
            dictionary = context.dictionaries.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                context.dictionaries[dictionary_key] = dictionary
            # Keep only the first part of each split proposed by pyphen.
            # NOTE(review): presumably yielded longest first -- verify
            # against the pyphen documentation.
            dictionary_iterations = [
                start for start, end in dictionary.iterate(next_word)]
        else:
            dictionary_iterations = []

        if dictionary_iterations:
            for first_word_part in dictionary_iterations:
                new_first_line_text = (
                    first_line_text +
                    second_line_text[:start_word] +
                    first_word_part)
                hyphenated_first_line_text = (
                    new_first_line_text + style['hyphenate_character'])
                new_layout = create_layout(
                    hyphenated_first_line_text, style, context, max_width,
                    justification_spacing)
                new_first_line, new_index = new_layout.get_first_line()
                new_first_line_width, _ = line_size(new_first_line, style)
                new_space = max_width - new_first_line_width
                # Accept the candidate when it is laid out on a single line
                # and either fits in max_width or is the last candidate.
                if new_index is None and (
                        new_space >= 0 or
                        first_word_part == dictionary_iterations[-1]):
                    hyphenated = True
                    layout = new_layout
                    first_line = new_first_line
                    index = new_index
                    resume_index = len(new_first_line_text.encode('utf8'))
                    if text[len(new_first_line_text)] == soft_hyphen:
                        # Recreate the layout with no max_width to be sure that
                        # we don't break before the soft hyphen
                        pango.pango_layout_set_width(
                            layout.layout, units_from_double(-1))
                        resume_index += len(soft_hyphen.encode('utf8'))
                    break

            if not hyphenated and not first_line_text:
                # Recreate the layout with no max_width to be sure that
                # we don't break before or inside the hyphenate character
                # (hyphenated_first_line_text and new_first_line_text hold
                # the values of the last candidate tried in the loop above)
                hyphenated = True
                layout.set_text(hyphenated_first_line_text)
                pango.pango_layout_set_width(
                    layout.layout, units_from_double(-1))
                first_line, index = layout.get_first_line()
                resume_index = len(new_first_line_text.encode('utf8'))
                if text[len(first_line_text)] == soft_hyphen:
                    resume_index += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        first_line, index = layout.get_first_line()
        resume_index = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = line_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if not minimum and overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        pango.pango_layout_set_wrap(
            layout.layout, PANGO_WRAP_MODE['WRAP_CHAR'])
        first_line, index = layout.get_first_line()
        # Fall back to the first line's length when there is no second line,
        # and report None when nothing is left to lay out.
        resume_index = index or first_line.length
        if resume_index >= len(text.encode('utf-8')):
            resume_index = None

    return first_line_metrics(
        first_line, text, layout, resume_index, space_collapse, style,
        hyphenated, style['hyphenate_character'])
Example #19
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import config
import langdetect 
import pyphen

from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext

# NOTE(review): the value returned by language_fallback() is discarded here --
# confirm this call is still needed.
pyphen.language_fallback('ru')

def start(update: Update, context: CallbackContext) -> None:
	"""Answer the /start command with a short description of the bot.

	The reply is written in Russian when the sender's ``language_code`` is
	``'ru'`` and in English otherwise.
	"""
	message = update.message
	wants_russian = message.from_user.language_code == 'ru'
	if wants_russian:
		greeting = 'Привет! Этот бот автоматически формирует хайку/хокку в ответ на 17 слоговые сообщения.'
	else:
		greeting = 'Hi! This bot automatically generates haiku/hokku in response to 17 syllable messages.'
	message.reply_text(greeting)

def haikudetect(update: Update, context: CallbackContext) -> None:
	if not update.message.text: return
	message = update.message.text
	words = message.split()
	# Checking words count
	if (len(words) < 3 or len(words) > 17): return
	# Loading pyphen dictionary
	dic = pyphen.Pyphen(lang=langdetect.detect(message))
	# Counting syllables
	syllable_count_in_message = 0
	syllable_count = 0
	line = 0